Commit 6679c233 authored by Fenglei, committed by Scott Cyphers

nvgpu reduction optimization (#1455)

* add cuda reduce

* clang format

* fix bugs

* fix bug

* add 1d reduce

* clang format

* fix bugs

* unroll loop

* remove debug info

* revert tests

* unroll 1D reduce op

* add comments

* using cudnn for nd to scalar reduction

* remove cuda 1d reduction since cudnn version is faster

* remove 1D kernel

* fix variable name

* resolve Chris's comments

* non_reduce_in_strides to non_reduce_strides
parent 2655f5e0
......@@ -1323,6 +1323,111 @@ size_t runtime::gpu::CUDAEmitter::build_softmax_divide(const std::vector<std::st
return primitive_index;
}
size_t runtime::gpu::CUDAEmitter::build_reduce(const std::vector<std::string>& dtypes,
NVShape input_shape,
NVShape reduce_axis,
const char* op,
const char* kernel)
{
// assumes NC{d1,...,dn} format
std::string kernel_name = "reduce_" + join(dtypes, "_") + "_ri_" +
std::to_string(input_shape.size()) + "_rr_" +
std::to_string(reduce_axis.size());
std::replace(kernel_name.begin(), kernel_name.end(), ' ', '_');
std::stringstream ss;
ss << kernel_name << "_s_" << join(input_shape, "_") << "_axis_" << join(reduce_axis, "_");
auto hash = ss.str();
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
size_t rank = input_shape.size();
size_t reduce_rank = reduce_axis.size();
size_t out_rank = rank - reduce_rank;
NVShape reduce_flag(rank, 0);
for (auto a : reduce_axis)
{
reduce_flag[a] = 1;
}
NVShape output_shape;
NVShape non_reduce_strides;
NVShape reduce_shape;
NVShape reduce_strides;
NVShape input_strides = row_major_strides(input_shape);
for (int i = 0; i < rank; i++)
{
if (reduce_flag[i] != 0)
{
reduce_shape.push_back(input_shape[i]);
reduce_strides.push_back(input_strides[i]);
}
else
{
non_reduce_strides.push_back(input_strides[i]);
output_shape.push_back(input_shape[i]);
}
}
NVShape output_strides = row_major_strides(output_shape);
uint32_t nthreads = static_cast<uint32_t>(shape_size(output_shape));
// TODO: the block size is currently fixed at 64; a tuning method will be added later
uint32_t block_size_x = 64;
uint32_t aligned_grid_size_x = align_to_block_size(nthreads, block_size_x);
auto args = m_primitive_emitter->add_kernel_args();
args.add_placeholder(dtypes[0], "in")
.add_placeholder(dtypes[1], "out")
.add("out_strides", output_strides)
.add("non_reduce_strides", non_reduce_strides)
.add("reduce_shape", reduce_shape)
.add("reduce_strides", reduce_strides)
.add("nthreads", nthreads);
// if the kernel has not been compiled, build it
auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name);
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
CudaKernelBuilder::add_pod_typedefs(writer);
writer << include_helpers();
if (kernel)
{
CudaKernelBuilder::get_device_helper(
writer, op, kernel, {{dtypes[0], dtypes[0], dtypes[1]}});
}
runtime::gpu::CudaKernelBuilder::get_reduce_op(
writer, kernel_name, args, dtypes, op, out_rank, reduce_rank);
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
}
std::unique_ptr<gpu::primitive> reduce(
new gpu::primitive{[=](void** inputs, void** outputs) mutable {
void** args_list = args.resolve_placeholder(0, &inputs[0])
.resolve_placeholder(1, &outputs[0])
.get_argument_list();
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
aligned_grid_size_x,
1,
1,
block_size_x,
1,
1,
0,
NULL,
args_list,
0));
debug_sync();
}});
primitive_index = this->m_primitive_emitter->insert(std::move(reduce));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
}
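For illustration, here is a standalone sketch (not part of the diff) that mirrors the shape/stride decomposition build_reduce performs, for input_shape {2, 3, 4} reduced over axis 1; the local row_major_strides is a stand-in for ngraph's helper of the same name.
// Standalone sketch mirroring build_reduce's shape/stride decomposition; not part of this commit.
#include <cstddef>
#include <iostream>
#include <vector>
static std::vector<size_t> row_major_strides(const std::vector<size_t>& shape)
{
    std::vector<size_t> strides(shape.size(), 1);
    for (size_t i = shape.size(); i-- > 1;)
    {
        strides[i - 1] = strides[i] * shape[i];
    }
    return strides;
}
int main()
{
    std::vector<size_t> input_shape{2, 3, 4};
    std::vector<size_t> reduce_axis{1};
    std::vector<size_t> reduce_flag(input_shape.size(), 0);
    for (auto a : reduce_axis)
    {
        reduce_flag[a] = 1;
    }
    auto input_strides = row_major_strides(input_shape); // {12, 4, 1}
    std::vector<size_t> output_shape, non_reduce_strides, reduce_shape, reduce_strides;
    for (size_t i = 0; i < input_shape.size(); i++)
    {
        if (reduce_flag[i] != 0)
        {
            reduce_shape.push_back(input_shape[i]);     // {3}
            reduce_strides.push_back(input_strides[i]); // {4}
        }
        else
        {
            non_reduce_strides.push_back(input_strides[i]); // {12, 1}
            output_shape.push_back(input_shape[i]);         // {2, 4}
        }
    }
    // One kernel thread per output element: 2 * 4 = 8 threads, rounded up to
    // one block of 64 by align_to_block_size in the emitter above.
    std::cout << "nthreads = " << output_shape[0] * output_shape[1] << std::endl;
    return 0;
}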
size_t runtime::gpu::CUDAEmitter::build_primitive(const op::Softmax* node)
{
auto& args = node->get_inputs();
......
......@@ -115,6 +115,18 @@ namespace ngraph
dtypes, tensor_shape, CudaOpMap<T>::op, CudaOpMap<T>::math_kernel);
}
template <typename T>
size_t build_reduce(const std::vector<std::string>& dtypes,
NVShape tensor_shape,
NVShape reduce_axis)
{
return build_reduce(dtypes,
tensor_shape,
reduce_axis,
CudaOpMap<T>::op,
CudaOpMap<T>::math_kernel);
}
template <typename ELEMENTWISE_OP_TYPE, typename REDUCE_OP_TYPE = ngraph::op::Nop>
size_t build_elementwise_collective(const std::vector<std::string>& dtypes,
NVShape tensor_shape,
......@@ -181,7 +193,11 @@ namespace ngraph
const char* kernel,
const char* reduce_op,
bool save_elementwise);
size_t build_reduce(const std::vector<std::string>& dtypes,
NVShape tensor_shape,
NVShape reduce_axis,
const char* op,
const char* kernel);
GPUPrimitiveEmitter* m_primitive_emitter;
GPURuntimeContext* m_ctx;
};
......
......@@ -181,6 +181,85 @@ void runtime::gpu::CudaKernelBuilder::get_ew_collective_op(
return;
}
// each thread computes the whole reduction for one output element
void runtime::gpu::CudaKernelBuilder::get_reduce_op(codegen::CodeWriter& writer,
const std::string& name,
runtime::gpu::GPUKernelArgs& args,
const std::vector<std::string>& data_types,
const std::string& reduce_op,
size_t out_rank,
size_t reduce_rank)
{
writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature();
writer.block_begin();
{
writer << "uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x; \n";
writer << "if (tid < nthreads)\n";
writer.block_begin();
{
if (out_rank > 0)
{
writer << "uint32_t dim_idx_generator = tid;\n";
}
writer << "uint32_t in_idx = 0;\n";
writer << data_types[1] << " r = 0;\n";
// loop through all reduction axes
for (int64_t i = 0; i < static_cast<int64_t>(out_rank); i++)
{
writer << "in_idx += (dim_idx_generator / out_strides" << i
<< ") * non_reduce_strides" << i << ";\n";
writer << "dim_idx_generator %= out_strides" << i << ";\n";
}
int64_t last_r_idx = static_cast<int64_t>(reduce_rank) - 1;
for (int64_t j = 0; j < last_r_idx; j++)
{
writer << "for(int idx" << j << " = 0; idx" << j << "< reduce_shape" << j << "; idx"
<< j << "++)\n";
writer.block_begin();
}
{
writer << "uint32_t reduce_idx = in_idx;\n";
for (int64_t j = 0; j < last_r_idx; j++)
{
writer << "reduce_idx += idx" << j << " * reduce_strides" << j << ";\n";
}
writer << "int idx" << last_r_idx << " = 0;\n";
writer << "uint32_t step = reduce_strides" << last_r_idx << ";\n";
// unroll the last reduction axis by a factor of 8
writer << "for(; idx" << last_r_idx << " < (reduce_shape" << last_r_idx
<< " >> 3); idx" << last_r_idx << "++)\n";
writer.block_begin();
{
for (int k = 0; k < 8; k++)
{
writer << "r = " << reduce_op << "(r , in[reduce_idx]);\n";
writer << "reduce_idx += step;\n";
}
}
writer.block_end();
writer << "idx" << last_r_idx << " <<= 3;\n";
writer << "for(; idx" << last_r_idx << " < reduce_shape" << last_r_idx << "; idx"
<< last_r_idx << "++)\n";
writer.block_begin();
{
writer << "r = " << reduce_op << "(r , in[reduce_idx]);\n";
writer << "reduce_idx += step;\n";
}
writer.block_end();
}
for (int64_t j = 0; j < last_r_idx; j++)
{
writer.block_end();
}
writer << "out[tid] = r;\n";
}
writer.block_end();
}
writer.block_end();
return;
}
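To make the generated source concrete, this is roughly the kernel get_reduce_op emits for out_rank == 1, reduce_rank == 1, float data and an add reduction. The kernel name, the flattened scalar parameters (out_strides0, ...), and the add device helper are assumptions about how GPUKernelArgs and CudaOpMap expand, not text taken verbatim from this commit; the emitter also writes the eight unrolled statements out explicitly rather than in an inner loop.
// Illustrative reconstruction of the emitted kernel; not verbatim emitter output.
// The emitter's add_pod_typedefs plays this role; repeated here so the sketch is self-contained.
typedef unsigned int uint32_t;
// Device helper assumed to be generated for ngraph::op::Add via get_device_helper.
__device__ __forceinline__ float add(float x0, float x1) { return x0 + x1; }
extern "C" __global__ void cuda_reduce_float_float_ri_2_rr_1(
    float* in, float* out,
    uint32_t out_strides0, uint32_t non_reduce_strides0,
    uint32_t reduce_shape0, uint32_t reduce_strides0, uint32_t nthreads)
{
    uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < nthreads)
    {
        // Map this thread's output element back to the start of its input slice.
        uint32_t dim_idx_generator = tid;
        uint32_t in_idx = 0;
        float r = 0;
        in_idx += (dim_idx_generator / out_strides0) * non_reduce_strides0;
        dim_idx_generator %= out_strides0;
        uint32_t reduce_idx = in_idx;
        int idx0 = 0;
        uint32_t step = reduce_strides0;
        // Main loop over the innermost reduction axis, unrolled by a factor of 8.
        for (; idx0 < (reduce_shape0 >> 3); idx0++)
        {
            for (int k = 0; k < 8; k++) // emitted as eight explicit statements
            {
                r = add(r, in[reduce_idx]);
                reduce_idx += step;
            }
        }
        idx0 <<= 3;
        // Remainder loop for the trailing reduce_shape0 % 8 elements.
        for (; idx0 < reduce_shape0; idx0++)
        {
            r = add(r, in[reduce_idx]);
            reduce_idx += step;
        }
        out[tid] = r;
    }
}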
void runtime::gpu::CudaKernelBuilder::get_broadcast_op(codegen::CodeWriter& writer,
const std::string& name,
runtime::gpu::GPUKernelArgs& args,
......
......@@ -59,6 +59,14 @@ namespace ngraph
const std::array<std::string, 2>& data_types,
size_t rank);
static void get_reduce_op(codegen::CodeWriter& writer,
const std::string& name,
runtime::gpu::GPUKernelArgs& args,
const std::vector<std::string>& data_types,
const std::string& reduce_op,
size_t out_rank,
size_t reduce_rank);
static void get_slice_op(codegen::CodeWriter& writer,
const std::string& name,
const std::array<std::string, 2>& data_types,
......
......@@ -851,8 +851,7 @@ namespace ngraph
{
kernel::emit_memcpyDtD(writer, out[0], args[0]);
}
// descriptors for tensors with <= 4 dimensions
else
else if (out[0].get_shape().size() == 0)
{
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
......@@ -867,6 +866,27 @@ namespace ngraph
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
writer << ");\n";
}
else
{
auto axes_set = sum->get_reduction_axes();
ngraph::AxisVector axes_vec;
for (auto a : axes_set)
{
axes_vec.push_back(a);
}
std::vector<std::string> dtypes;
dtypes.push_back(args[0].get_type());
dtypes.push_back(out[0].get_type());
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
auto sum_index = cuda_emitter->build_reduce<ngraph::op::Add>(
dtypes, args[0].get_shape(), axes_vec);
writer << "gpu::invoke_primitive(ctx, " << sum_index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
writer << ");\n";
}
}
}
writer.block_end();
......