Commit f1ebcd3e authored by Fenglei, committed by Robert Kimball

fix bugs in align_to_block_size function (#1191)

* remove the extra `* block_size` multiplication from the result

* rename the grid_size parameter to threads
parent af956916
...@@ -1813,16 +1813,12 @@ __device__ __forceinline__ int64_t load(const int64_t* __restrict__ in, int i= ...@@ -1813,16 +1813,12 @@ __device__ __forceinline__ int64_t load(const int64_t* __restrict__ in, int i=
return ss.str(); return ss.str();
} }
uint32_t runtime::gpu::CUDAEmitter::align_to_block_size(uint32_t grid_size, uint32_t block_size) uint32_t runtime::gpu::CUDAEmitter::align_to_block_size(uint32_t threads, uint32_t block_size)
{ {
if (grid_size > (1u << 31) - 1) if (threads > (1u << 31) - 1)
{ {
throw std::runtime_error("Cuda can't handle grid_size_x > 2^31 - 1."); throw std::runtime_error("Cuda can't handle threads > 2^31 - 1.");
}
uint32_t r = (grid_size + block_size - 1) / block_size * block_size;
if (grid_size > (1u << 31) - 1)
{
throw std::runtime_error("Cuda can't handle grid_size_x > 2^31 - 1.");
} }
uint32_t r = (threads + block_size - 1) / block_size;
return r; return r;
} }
...@@ -151,7 +151,7 @@ namespace ngraph ...@@ -151,7 +151,7 @@ namespace ngraph
private: private:
CUDAEmitter(GPUPrimitiveEmitter* emitter); CUDAEmitter(GPUPrimitiveEmitter* emitter);
uint32_t align_to_block_size(uint32_t grid_size, uint32_t block_size); uint32_t align_to_block_size(uint32_t threads, uint32_t block_size);
void print_tensor_from_gpu(codegen::CodeWriter& writer, void print_tensor_from_gpu(codegen::CodeWriter& writer,
const std::string& tensor_name, const std::string& tensor_name,
GPUShape shape); GPUShape shape);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment