Commit f1ebcd3e authored by Fenglei, committed by Robert Kimball

fix bugs in align_to_block_size function (#1191)

* extra *block_size

* change grid_size to threads
parent af956916
......@@ -1813,16 +1813,12 @@ __device__ __forceinline__ int64_t load(const int64_t* __restrict__ in, int i=
return ss.str();
}
uint32_t runtime::gpu::CUDAEmitter::align_to_block_size(uint32_t grid_size, uint32_t block_size)
uint32_t runtime::gpu::CUDAEmitter::align_to_block_size(uint32_t threads, uint32_t block_size)
{
if (grid_size > (1u << 31) - 1)
if (threads > (1u << 31) - 1)
{
throw std::runtime_error("Cuda can't handle grid_size_x > 2^31 - 1.");
}
uint32_t r = (grid_size + block_size - 1) / block_size * block_size;
if (grid_size > (1u << 31) - 1)
{
throw std::runtime_error("Cuda can't handle grid_size_x > 2^31 - 1.");
throw std::runtime_error("Cuda can't handle threads > 2^31 - 1.");
}
uint32_t r = (threads + block_size - 1) / block_size;
return r;
}
......@@ -151,7 +151,7 @@ namespace ngraph
private:
CUDAEmitter(GPUPrimitiveEmitter* emitter);
uint32_t align_to_block_size(uint32_t grid_size, uint32_t block_size);
uint32_t align_to_block_size(uint32_t threads, uint32_t block_size);
void print_tensor_from_gpu(codegen::CodeWriter& writer,
const std::string& tensor_name,
GPUShape shape);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment