fix bugs in align_to_block_size function (#1191)

* remove the extra `* block_size` multiplication * rename grid_size to threads

fix bugs in align_to_block_size function (#1191)
* remove the extra `* block_size` multiplication * rename grid_size to threads
f1ebcd3e · Fenglei · Robert Kimball · af956916 · f1ebcd3e · f1ebcd3e
Commit f1ebcd3e authored Jul 05, 2018 by Fenglei Committed by Robert Kimball Jul 05, 2018
Hide whitespace changes
Inline Side-by-side

Showing with 5 additions and 9 deletions

cuda_emitter.cpp src/ngraph/runtime/gpu/cuda_emitter.cpp +4 -8

cuda_emitter.hpp src/ngraph/runtime/gpu/cuda_emitter.hpp +1 -1

No files found.
--- a/src/ngraph/runtime/gpu/cuda_emitter.cpp
+++ b/src/ngraph/runtime/gpu/cuda_emitter.cpp
@@ -1813,16 +1813,12 @@ __device__ __forceinline__ int64_t  load(const int64_t*  __restrict__ in, int i=
    return ss.str();
 }
-uint32_t runtime::gpu::CUDAEmitter::align_to_block_size(uint32_t grid_size, uint32_t block_size)
+uint32_t runtime::gpu::CUDAEmitter::align_to_block_size(uint32_t threads, uint32_t block_size)
 {
-    if (grid_size > (1u << 31) - 1)
+    if (threads > (1u << 31) - 1)
    {
-        throw std::runtime_error("Cuda can't handle grid_size_x > 2^31 - 1.");
+        throw std::runtime_error("Cuda can't handle threads > 2^31 - 1.");
-    }
-    uint32_t r = (grid_size + block_size - 1) / block_size * block_size;
-    if (grid_size > (1u << 31) - 1)
-    {
-        throw std::runtime_error("Cuda can't handle grid_size_x > 2^31 - 1.");
    }
+    uint32_t r = (threads + block_size - 1) / block_size;
    return r;
 }
--- a/src/ngraph/runtime/gpu/cuda_emitter.hpp
+++ b/src/ngraph/runtime/gpu/cuda_emitter.hpp
@@ -151,7 +151,7 @@ namespace ngraph
            private:
                CUDAEmitter(GPUPrimitiveEmitter* emitter);
-                uint32_t align_to_block_size(uint32_t grid_size, uint32_t block_size);
+                uint32_t align_to_block_size(uint32_t threads, uint32_t block_size);
                void print_tensor_from_gpu(codegen::CodeWriter& writer,
                                           const std::string& tensor_name,
                                           GPUShape shape);