Commit 38ba5c12 authored by Fenglei, committed by Scott Cyphers

gpu concat optimize (#1259)

* optimize concat

* compile success

* multi inputs

* clang format
parent deacf29a
@@ -69,6 +69,100 @@ runtime::gpu::CUDAEmitter::CUDAEmitter(runtime::gpu::GPUPrimitiveEmitter* emitte
m_ctx = ctx;
}
size_t runtime::gpu::CUDAEmitter::build_concat(const std::vector<std::string>& dtypes,
std::vector<GPUShape> input_shapes,
size_t concat_axis,
GPUShape output_shape)
{
std::stringstream kernel_name;
size_t input_size = input_shapes.size();
kernel_name << "concat_" << join(dtypes, "_") << "_r_" << input_size;
std::stringstream hash;
hash << kernel_name.str() << "_o_" << join(output_shape, "_") << "_a_" << concat_axis;
for (size_t i = 0; i < input_size; i++)
{
hash << "_i_" << join(input_shapes[i], "_");
}
// For backwards compatibility we currently use two unordered maps
// 1. one looks up the compiled cuda kernel (CudaFunctionPool)
// 2. the other looks to see if this kernel is already in the primitive list
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash.str());
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
// check if the kernel has already been compiled. if so, create
// a launch primitive for it based on the input tensor shape
// but do not recompile the kernel. otherwise, do it all:
// recompile the kernel and then create the primitive
auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name.str());
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
CudaKernelBuilder::add_pod_typedefs(writer);
CudaKernelBuilder::get_concat_op(writer, kernel_name.str(), dtypes, input_shapes.size());
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name.str(), writer.get_code());
}
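// block_strides[i] will hold the number of contiguous output elements that
// input i contributes per output "row" above the concat axis; block_size is
// one full output row. For example, concatenating shapes {2,2} and {2,3} on
// axis 1 gives block_strides = {2,3} and block_size = 5.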
std::vector<uint32_t> block_strides(input_size, 1);
uint32_t block_size = 0;
for (size_t i = 0; i < input_size; i++)
{
auto arg_rank = input_shapes[i].size();
for (size_t j = concat_axis; j < arg_rank; j++)
{
block_strides[i] *= input_shapes[i][j];
}
block_size += block_strides[i];
}
uint32_t nthreads = static_cast<uint32_t>(shape_size(output_shape));
// TODO: block size is currently fixed at 64; a tuning method will be added later
uint32_t block_size_x = 64;
uint32_t aligned_grid_size_x = align_to_block_size(nthreads, block_size_x);
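// one thread per output element; the grid is rounded up to a whole number of thread blocks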
// get an allocator for transient per kernel gpu memory
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
size_t idx_block_strides =
allocator.reserve_argspace(block_strides.data(), block_strides.size() * sizeof(uint32_t));
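// the host-side block_strides table is staged in the argument-space pool here and
// copied into device memory when the primitive runs (via invoke_memory_primitive below)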
// create the launch primitive
std::unique_ptr<gpu::primitive> kernel_launch(new gpu::primitive{[=](void** inputs,
void** outputs) mutable {
void* param_block_strides = runtime::gpu::invoke_memory_primitive(m_ctx, idx_block_strides);
std::vector<void*> args_list;
for (size_t i = 0; i < input_size; i++)
{
args_list.push_back(&inputs[i]);
}
args_list.push_back(&outputs[0]);
args_list.push_back(&param_block_strides);
args_list.push_back(&block_size);
args_list.push_back(&nthreads);
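// cuLaunchKernel expects an array of pointers to the kernel arguments, so args_list
// stores the addresses of the tensor pointers and scalar parameters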
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
aligned_grid_size_x,
1,
1, // grid dim
block_size_x,
1,
1, // block dim
0,
NULL, // shared mem and stream
args_list.data(),
0)); // arguments
debug_sync();
}});
primitive_index = this->m_primitive_emitter->insert(std::move(kernel_launch));
m_primitive_emitter->cache(hash.str(), primitive_index);
return primitive_index;
}
size_t runtime::gpu::CUDAEmitter::build_pad(const std::array<std::string, 2>& dtypes,
GPUShape input_shape,
GPUShape output_shape,
......
@@ -135,6 +135,11 @@ namespace ngraph
GPUShape filter_dilation,
GPUShape output_shape);
size_t build_concat(const std::vector<std::string>& dtypes,
std::vector<GPUShape> input_shapes,
size_t concat_axis,
GPUShape output_shape);
void debug_sync();
void sync();
......
@@ -252,17 +252,17 @@ void runtime::gpu::CudaKernelBuilder::get_concat_op(codegen::CodeWriter& writer,
writer << data_types[i] << "* in" << i << ", ";
}
writer << data_types[num_inputs]
<< "* out, size_t* block_strides, size_t block_size, size_t n)\n";
<< "* out, uint32_t* block_strides, uint32_t block_size, uint32_t n)\n";
writer.block_begin();
{
writer << "size_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n";
writer << "uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;\n";
writer << "if(tid < n)\n";
writer.block_begin();
{
writer << "out[tid] = 1;\n";
writer << "size_t output_idx = tid;\n";
writer << "size_t block_id = tid / block_size;\n";
writer << "size_t block_idx = tid % block_size;\n";
writer << "uint32_t output_idx = tid;\n";
writer << "uint32_t block_id = tid / block_size;\n";
writer << "uint32_t block_idx = tid % block_size;\n";
writer << "bool processed = false;\n";
for (size_t i = 0; i < num_inputs; i++)
{
......
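For reference, the kernel that get_concat_op emits for two float inputs would look roughly like the sketch below. The per-input selection logic lives in the loop elided above, so this is an inferred illustration based on the block_strides/block_size bookkeeping, not the verbatim generated source:

extern "C" __global__ void concat_float_float_float_r_2(
    float* in0, float* in1, float* out,
    uint32_t* block_strides, uint32_t block_size, uint32_t n)
{
    uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n)
    {
        uint32_t block_id = tid / block_size;  // which output "row"
        uint32_t block_idx = tid % block_size; // offset within that row
        // walk the inputs until block_idx falls inside input i's stride
        if (block_idx < block_strides[0])
        {
            out[tid] = in0[block_id * block_strides[0] + block_idx];
        }
        else
        {
            block_idx -= block_strides[0];
            out[tid] = in1[block_id * block_strides[1] + block_idx];
        }
    }
}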
@@ -54,46 +54,6 @@ namespace ngraph
CUdeviceptr reverse_axes,
size_t rank,
size_t count);
template <typename... Inputs>
void emit_concat_op(const std::string& name,
const std::vector<std::string>& data_types,
GPURuntimeContext* ctx,
size_t count,
size_t block_size,
CUdeviceptr block_strides,
CUdeviceptr out,
Inputs&&... inputs)
{
std::string type_signature = "_" + join(data_types, "_");
std::replace(type_signature.begin(), type_signature.end(), ' ', '_');
auto compiled_kernel = ctx->compiled_kernel_pool->get(name + type_signature);
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
CudaKernelBuilder::add_pod_typedefs(writer);
CudaKernelBuilder::get_concat_op(
writer, name + type_signature, data_types, sizeof...(inputs));
std::string kernel = writer.get_code();
compiled_kernel = ctx->compiled_kernel_pool->set(name + type_signature, kernel);
}
void* args_list[] = {&inputs..., &out, &block_strides, &block_size, &count};
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
count,
1,
1, // grid dim
1,
1,
1, // block dim
0,
NULL, // shared mem and stream
args_list,
0)); // arguments
CUDA_SAFE_CALL(cuCtxSynchronize()); // Retrieve and print output.
}
}
}
}
@@ -824,46 +824,32 @@ namespace ngraph
auto concat = static_cast<const ngraph::op::Concat*>(node);
auto axis = concat->get_concatenation_axis();
std::vector<size_t> block_strides(args.size(), 1);
size_t block_size = 0;
for (size_t i = 0; i < args.size(); i++)
std::vector<std::string> dtypes;
std::vector<GPUShape> input_shapes;
for (auto arg : args)
{
auto arg_shape = args[i].get_shape();
auto arg_rank = arg_shape.size();
for (size_t j = axis; j < arg_rank; j++)
{
block_strides[i] *= arg_shape[j];
}
block_size += block_strides[i];
dtypes.push_back(arg.get_type());
input_shapes.push_back(arg.get_shape());
}
dtypes.push_back(out[0].get_type());
writer.block_begin();
writer << "int count = " << out[0].get_size() << ";\n";
writer << "int num_inputs = " << args.size() << ";\n";
GPUAllocator allocator =
external_function->get_primitive_emitter()->get_memory_allocator();
size_t idx_block_strides = allocator.reserve_argspace(
block_strides.data(), block_strides.size() * sizeof(size_t));
writer << "void* block_strides_d = runtime::gpu::invoke_memory_primitive(ctx, "
<< idx_block_strides << ");\n";
writer << "ngraph::runtime::gpu::emit_concat_op(\"" << node->description() << "\""
<< ", std::vector<std::string>{";
for (size_t i = 0; i < args.size(); i++)
{
writer << "\"" << args[i].get_type() << "\", ";
}
writer << "\"" << out[0].get_type() << "\"}"
<< ", ctx"
<< ", count"
<< ", " << block_size << ", CUdeviceptr(block_strides_d)"
<< ", CUdeviceptr(" << out[0].get_name() << ")";
for (size_t i = 0; i < args.size(); i++)
{
writer << ", CUdeviceptr(" << args[i].get_name() << ")";
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
auto index =
cuda_emitter->build_concat(dtypes, input_shapes, axis, out[0].get_shape());
writer << "gpu::invoke_primitive(ctx, " << index << ", ";
writer << "std::vector<void*>{" << args[0].get_name();
for (size_t i = 1; i < args.size(); i++)
{
writer << ", " << args[i].get_name();
}
writer << "}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
writer << ");\n";
}
writer << ");\n";
writer.block_end();
}
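With the primitive built once at compile time, the code emitted for a two-input concat reduces to a single invoke_primitive call, roughly like this (the primitive index and tensor names here are illustrative):

gpu::invoke_primitive(ctx, 42,
                      std::vector<void*>{arg0, arg1}.data(),
                      std::vector<void*>{out0}.data());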
......