ngraph commit 68eb2e7d

pass args instead of pointer to array (#1591)

Authored Sep 13, 2018 by Fenglei
Committed by Robert Kimball, Sep 13, 2018
Parent: 309bfdf0
Showing 3 changed files with 45 additions and 45 deletions:

  src/ngraph/runtime/gpu/cuda_emitter.cpp              +37 -37
  src/ngraph/runtime/gpu/gpu_cuda_kernel_builder.cpp    +7  -8
  src/ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp    +1  -0
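In short, the reshape primitive stops copying its stride arrays into transient GPU memory and instead flattens each stride into the kernel's own parameter list. The standalone CUDA sketch below illustrates that pattern for a rank-2 reshape (transpose); it is an illustration of the idea only, with made-up kernel and variable names, and is not code from this repository.

#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

// Before: the kernel dereferences stride arrays that live in device memory,
// so the host must allocate and fill those arrays for every new shape.
__global__ void reshape_ptr(const float* in, float* out,
                            const uint32_t* input_strides,
                            const uint32_t* trans_strides,
                            uint32_t n)
{
    uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n)
    {
        uint32_t input_idx = tid;
        uint32_t output_idx = (input_idx / input_strides[0]) * trans_strides[0];
        input_idx %= input_strides[0];
        output_idx += (input_idx / input_strides[1]) * trans_strides[1];
        out[output_idx] = in[tid];
    }
}

// After: each stride arrives as a scalar kernel argument, so no extra device
// allocation or pointer chase is needed.
__global__ void reshape_args(const float* in, float* out,
                             uint32_t input_strides0, uint32_t input_strides1,
                             uint32_t trans_strides0, uint32_t trans_strides1,
                             uint32_t n)
{
    uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n)
    {
        uint32_t input_idx = tid;
        uint32_t output_idx = (input_idx / input_strides0) * trans_strides0;
        input_idx %= input_strides0;
        output_idx += (input_idx / input_strides1) * trans_strides1;
        out[output_idx] = in[tid];
    }
}

int main()
{
    const uint32_t n = 6; // transpose a 2x3 tensor into a 3x2 tensor
    float h_in[n] = {0, 1, 2, 3, 4, 5};
    float h_out[n];
    float* d_in;
    float* d_out;
    cudaMalloc(&d_in, n * sizeof(float));
    cudaMalloc(&d_out, n * sizeof(float));
    cudaMemcpy(d_in, h_in, n * sizeof(float), cudaMemcpyHostToDevice);

    // input_strides = {3, 1}; trans_strides = {1, 2}: axis 0 of the input maps
    // to axis 1 of the output and vice versa. The strides go straight into the
    // launch; nothing else is copied to the device.
    reshape_args<<<1, 64>>>(d_in, d_out, 3, 1, 1, 2, n);
    cudaMemcpy(h_out, d_out, n * sizeof(float), cudaMemcpyDeviceToHost);

    for (uint32_t i = 0; i < n; i++)
        printf("%g ", h_out[i]); // expected: 0 3 1 4 2 5
    printf("\n");

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}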
src/ngraph/runtime/gpu/cuda_emitter.cpp
@@ -588,19 +588,6 @@ size_t runtime::gpu::CUDAEmitter::build_reshape(const std::array<std::string, 2>
         return primitive_index;
     }
 
-    // check if the kernel has already been compiled. if so, create
-    // a launch primitive for it based on the input tensor shape
-    // but do not recompile the kernel. otherwise, do it all:
-    // recompile the kernel and then create the primitive
-    auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name.str());
-    if (compiled_kernel == nullptr)
-    {
-        codegen::CodeWriter writer;
-        CudaKernelBuilder::add_pod_typedefs(writer);
-        CudaKernelBuilder::get_reshape_op(writer, kernel_name.str(), dtypes, rank);
-        compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
-    }
-
     uint32_t nthreads = static_cast<uint32_t>(shape_size(input_shape));
     // TODO: currently we set it to 64, will add tuning method later
     uint32_t block_size_x = 64;
@@ -620,33 +607,46 @@ size_t runtime::gpu::CUDAEmitter::build_reshape(const std::array<std::string, 2>
     }
 
     // get an allocator for transient per kernel gpu memory
-    GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
-    size_t idx_input_strides =
-        allocator.reserve_argspace(input_strides.data(), input_strides.size() * sizeof(uint32_t));
-    size_t idx_trans_strides =
-        allocator.reserve_argspace(trans_strides.data(), trans_strides.size() * sizeof(uint32_t));
+    auto args = m_primitive_emitter->add_kernel_args();
+    args.add_placeholder(dtypes[0], "in")
+        .add_placeholder(dtypes[1], "out")
+        .add("input_strides", input_strides)
+        .add("trans_strides", trans_strides)
+        .add("n", nthreads);
+
+    // check if the kernel has already been compiled. if so, create
+    // a launch primitive for it based on the input tensor shape
+    // but do not recompile the kernel. otherwise, do it all:
+    // recompile the kernel and then create the primitive
+    auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name.str());
+    if (compiled_kernel == nullptr)
+    {
+        codegen::CodeWriter writer;
+        CudaKernelBuilder::add_pod_typedefs(writer);
+        CudaKernelBuilder::get_reshape_op(writer, kernel_name.str(), args, dtypes, rank);
+        compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
+    }
 
     // create the launch primitive
-    std::unique_ptr<gpu::primitive> kernel_launch(new gpu::primitive{[=](void** inputs,
-                                                                         void** outputs) mutable {
-        void* param_input_strides = runtime::gpu::invoke_memory_primitive(m_ctx, idx_input_strides);
-        void* param_trans_strides = runtime::gpu::invoke_memory_primitive(m_ctx, idx_trans_strides);
-        std::vector<void*> args_list{
-            &inputs[0], &outputs[0], &param_input_strides, &param_trans_strides, &nthreads};
+    std::unique_ptr<gpu::primitive> kernel_launch(
+        new gpu::primitive{[=](void** inputs, void** outputs) mutable {
+            void** args_list = args.resolve_placeholder(0, &inputs[0])
+                                   .resolve_placeholder(1, &outputs[0])
+                                   .get_argument_list();
 
-        CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
-                                      aligned_grid_size_x,
-                                      1,
-                                      1, // grid dim
-                                      block_size_x,
-                                      1,
-                                      1, // block dim
-                                      0,
-                                      NULL, // shared mem and stream
-                                      args_list.data(),
-                                      0)); // arguments
-        debug_sync();
-    }});
+            CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
+                                          aligned_grid_size_x,
+                                          1,
+                                          1, // grid dim
+                                          block_size_x,
+                                          1,
+                                          1, // block dim
+                                          0,
+                                          NULL, // shared mem and stream
+                                          args_list,
+                                          0)); // arguments
+            debug_sync();
+        }});
 
     primitive_index = this->m_primitive_emitter->insert(std::move(kernel_launch));
     m_primitive_emitter->cache(hash, primitive_index);
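On the host side, the launch lambda no longer fetches stride buffers with invoke_memory_primitive; it binds the two pointer placeholders to this invocation's input and output tensors and hands the resulting void** straight to cuLaunchKernel. The sketch below is a deliberately simplified stand-in for that kind of argument builder, written only to show why each by-value argument needs stable storage behind its void* slot; it is not nGraph's actual GPUKernelArgs implementation.

#include <cstdint>
#include <deque>
#include <vector>

class KernelArgsSketch
{
public:
    // Reserve a slot whose address is only known at launch time.
    size_t add_placeholder()
    {
        m_args.push_back(nullptr);
        return m_args.size() - 1;
    }

    // Capture a scalar by value; the deque keeps its address stable so the
    // void* slot stays valid until the kernel is launched.
    KernelArgsSketch& add(uint32_t value)
    {
        m_storage.push_back(value);
        m_args.push_back(&m_storage.back());
        return *this;
    }

    // A vector is flattened into one scalar argument per element, which is
    // what "pass args instead of pointer to array" boils down to.
    KernelArgsSketch& add(const std::vector<uint32_t>& values)
    {
        for (uint32_t v : values)
        {
            add(v);
        }
        return *this;
    }

    // Point a placeholder slot at the location holding this invocation's
    // device pointer, e.g. &inputs[0] in the launch lambda above.
    KernelArgsSketch& resolve_placeholder(size_t slot, void* address_of_pointer)
    {
        m_args[slot] = address_of_pointer;
        return *this;
    }

    // One void* per kernel parameter, in declaration order: the layout
    // cuLaunchKernel expects for its kernelParams argument.
    void** get_argument_list() { return m_args.data(); }

private:
    std::deque<uint32_t> m_storage; // owned copies of by-value arguments
    std::vector<void*> m_args;      // pointers handed to cuLaunchKernel
};

// Usage mirroring the diff above (illustrative):
//     KernelArgsSketch args;
//     size_t in_slot = args.add_placeholder();
//     size_t out_slot = args.add_placeholder();
//     args.add(input_strides).add(trans_strides).add(nthreads);
//     void** args_list = args.resolve_placeholder(in_slot, &inputs[0])
//                            .resolve_placeholder(out_slot, &outputs[0])
//                            .get_argument_list();

One detail this sketch glosses over: the real builder is copied by value into the launch lambda, so it must also keep its internal pointers valid across copies.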
src/ngraph/runtime/gpu/gpu_cuda_kernel_builder.cpp
@@ -452,12 +452,11 @@ void runtime::gpu::CudaKernelBuilder::get_onehot_op(codegen::CodeWriter& writer,
 void runtime::gpu::CudaKernelBuilder::get_reshape_op(codegen::CodeWriter& writer,
                                                      const std::string& name,
+                                                     runtime::gpu::GPUKernelArgs& args,
                                                      const std::array<std::string, 2>& data_types,
                                                      size_t rank)
 {
-    writer << "extern \"C\" __global__ void cuda_" << name << "(" << data_types[0] << "* in, "
-           << data_types[1] << "* out, uint32_t* input_strides, uint32_t* trans_strides, uint32_t n)\n";
+    writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature();
     writer.block_begin();
     {
         writer << "uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n";
@@ -469,12 +468,12 @@ void runtime::gpu::CudaKernelBuilder::get_reshape_op(codegen::CodeWriter& writer
         size_t i = 0;
         for (; i < rank - 1; i++)
         {
-            writer << "output_idx += (input_idx / input_strides[" << i << "]) * trans_strides["
-                   << i << "];\n";
-            writer << "input_idx %= input_strides[" << i << "];\n";
+            writer << "output_idx += (input_idx / input_strides" << i << ") * trans_strides" << i
+                   << ";\n";
+            writer << "input_idx %= input_strides" << i << ";\n";
         }
-        writer << "output_idx += (input_idx / input_strides[" << i << "]) * trans_strides[" << i
-               << "];\n";
+        writer << "output_idx += (input_idx / input_strides" << i << ") * trans_strides" << i
+               << ";\n";
         writer << "out[output_idx] = in[tid];\n";
     }
     writer.block_end();
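Putting the two writer versions side by side, the generated source for a rank-2 float reshape plausibly changes as sketched below. This is reconstructed from the writer calls in the hunks above; the parameter names and their order come from the add() calls in cuda_emitter.cpp, but the exact output of get_input_signature(), the bounds check, and the index declarations sit in context elided from this diff, so treat those details as assumptions.

// Before: strides live in device memory and are indexed through pointers.
extern "C" __global__ void cuda_reshape_before(float* in, float* out,
                                               uint32_t* input_strides,
                                               uint32_t* trans_strides,
                                               uint32_t n)
{
    uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n)
    {
        uint32_t input_idx = tid;
        uint32_t output_idx = 0;
        output_idx += (input_idx / input_strides[0]) * trans_strides[0];
        input_idx %= input_strides[0];
        output_idx += (input_idx / input_strides[1]) * trans_strides[1];
        out[output_idx] = in[tid];
    }
}

// After: each stride is its own scalar parameter, so the emitted body refers
// to input_strides0, input_strides1, ... instead of indexing an array.
extern "C" __global__ void cuda_reshape_after(float* in, float* out,
                                              uint32_t input_strides0,
                                              uint32_t input_strides1,
                                              uint32_t trans_strides0,
                                              uint32_t trans_strides1,
                                              uint32_t n)
{
    uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n)
    {
        uint32_t input_idx = tid;
        uint32_t output_idx = 0;
        output_idx += (input_idx / input_strides0) * trans_strides0;
        input_idx %= input_strides0;
        output_idx += (input_idx / input_strides1) * trans_strides1;
        out[output_idx] = in[tid];
    }
}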
src/ngraph/runtime/gpu/gpu_cuda_kernel_builder.hpp
@@ -56,6 +56,7 @@ namespace ngraph
                 static void get_reshape_op(codegen::CodeWriter& writer,
                                            const std::string& name,
+                                           runtime::gpu::GPUKernelArgs& args,
                                            const std::array<std::string, 2>& data_types,
                                            size_t rank);