Commit 5f40d957 authored by Fenglei, committed by Scott Cyphers

nvgpu reduce to scalar optimization (#1491)

* add cuda reduce

* clang format

* fix bugs

* fix bug

* add 1d reduce

* clang format

* fix bugs

* unroll loop

* remove debug info

* revert tests

* unroll 1D reduce op

* add comments

* using cudnn for nd to scalar reduction

* remove cuda 1d reduction since cudnn version is faster

* remove 1D kernel

* fix bugs

* 1d multi block size

* remove debug

* change kernel name

* add reduce to scalar optimization, add test

* fix bugs and tune parameters

* clang format

* update comments

* update comments

* update comments

* clang format

* update comments

* remove wrong comments, apply clang format

* resolve Bob's comment

* clang format

* pass shared mem size from cuLaunchKernel, set unroll loop size through host code

* remove unused code.clang format

* change reduce to thread with shfl for each warp first

* add seed

* unroll size
parent 8fdefa52
@@ -1323,16 +1323,19 @@ size_t runtime::gpu::CUDAEmitter::build_softmax_divide(const std::vector<std::st
return primitive_index;
}
size_t runtime::gpu::CUDAEmitter::build_reduce(const std::vector<std::string>& dtypes,
size_t runtime::gpu::CUDAEmitter::build_reduce_to_nd(const std::vector<std::string>& dtypes,
NVShape input_shape,
NVShape reduce_axis,
const char* op,
const char* kernel)
{
size_t rank = input_shape.size();
size_t reduce_rank = reduce_axis.size();
size_t out_rank = rank - reduce_rank;
// assumes NC{d1,...,dn} format
std::string kernel_name = "reduce_" + join(dtypes, "_") + "_ri_" +
std::to_string(input_shape.size()) + "_rr_" +
std::to_string(reduce_axis.size());
std::string kernel_name = "reduce_nd_" + join(dtypes, "_");
kernel_name +=
"_ri_" + std::to_string(input_shape.size()) + "_rr_" + std::to_string(reduce_axis.size());
std::replace(kernel_name.begin(), kernel_name.end(), ' ', '_');
std::stringstream ss;
@@ -1345,9 +1348,6 @@ size_t runtime::gpu::CUDAEmitter::build_reduce(const std::vector<std::string>& d
return primitive_index;
}
size_t rank = input_shape.size();
size_t reduce_rank = reduce_axis.size();
size_t out_rank = rank - reduce_rank;
NVShape reduce_flag(rank, 0);
for (auto a : reduce_axis)
{
@@ -1372,7 +1372,6 @@ size_t runtime::gpu::CUDAEmitter::build_reduce(const std::vector<std::string>& d
}
}
NVShape output_strides = row_major_strides(output_shape);
uint32_t nthreads = static_cast<uint32_t>(shape_size(output_shape));
// TODO: currently we set it to 64, will add tuning method later
uint32_t block_size_x = 64;
@@ -1398,7 +1397,7 @@ size_t runtime::gpu::CUDAEmitter::build_reduce(const std::vector<std::string>& d
CudaKernelBuilder::get_device_helper(
writer, op, kernel, {{dtypes[0], dtypes[0], dtypes[1]}});
}
runtime::gpu::CudaKernelBuilder::get_reduce_op(
runtime::gpu::CudaKernelBuilder::get_reduce_to_nd_op(
writer, kernel_name, args, dtypes, op, out_rank, reduce_rank);
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
}
@@ -1422,8 +1421,248 @@ size_t runtime::gpu::CUDAEmitter::build_reduce(const std::vector<std::string>& d
0));
debug_sync();
}});
primitive_index = this->m_primitive_emitter->insert(std::move(reduce));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
}
size_t runtime::gpu::CUDAEmitter::build_reduce_to_scalar(const std::vector<std::string>& dtypes,
const size_t data_bytes,
NVShape input_shape,
const char* op,
const char* kernel)
{
// assumes NC{d1,...,dn} format
std::string kernel_name = "reduce_scalar_" + join(dtypes, "_");
std::replace(kernel_name.begin(), kernel_name.end(), ' ', '_');
std::stringstream ss;
ss << kernel_name << "_s_" << join(input_shape, "_");
auto hash = ss.str();
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
uint32_t nthreads = static_cast<uint32_t>(shape_size(input_shape));
uint32_t n = nthreads;
uint32_t block_size_x = 1;
while (n > 1)
{
block_size_x <<= 1;
n >>= 1;
}
block_size_x = fmin(512, block_size_x);
uint32_t shared_data_bytes = block_size_x * static_cast<uint32_t>(data_bytes);
kernel_name += "_b_" + std::to_string(block_size_x);
auto args = m_primitive_emitter->add_kernel_args();
args.add_placeholder(dtypes[0], "in")
.add_placeholder(dtypes[1], "out")
.add("nthreads", nthreads);
// if the kernel has not been compiled, build it
auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name);
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
CudaKernelBuilder::add_pod_typedefs(writer);
writer << include_helpers();
if (kernel)
{
CudaKernelBuilder::get_device_helper(
writer, op, kernel, {{dtypes[0], dtypes[0], dtypes[1]}});
}
runtime::gpu::CudaKernelBuilder::get_reduce_to_scalar_op(
writer, kernel_name, args, dtypes, op, block_size_x);
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
}
std::unique_ptr<gpu::primitive> reduce(
new gpu::primitive{[=](void** inputs, void** outputs) mutable {
void** args_list = args.resolve_placeholder(0, &inputs[0])
.resolve_placeholder(1, &outputs[0])
.get_argument_list();
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
1,
1,
1,
block_size_x,
1,
1,
shared_data_bytes,
NULL,
args_list,
0));
debug_sync();
}});
primitive_index = this->m_primitive_emitter->insert(std::move(reduce));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
}
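
Editor's note: to make the launch configuration above concrete, here is a minimal host-side sketch of the block sizing and shared-memory arithmetic. It is not part of this commit; the 20480-element float input is a made-up example, and std::min replaces the original fmin only to keep the sketch integer-only.

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main()
{
    uint32_t nthreads = 20480;                     // hypothetical input size
    size_t data_bytes = sizeof(float);             // element size for a float reduction
    uint32_t block_size_x = 1;
    for (uint32_t n = nthreads; n > 1; n >>= 1)
    {
        block_size_x <<= 1;                        // largest power of two not exceeding nthreads
    }
    block_size_x = std::min<uint32_t>(512, block_size_x); // clamp to a single block of 512 threads
    uint32_t shared_data_bytes = block_size_x * static_cast<uint32_t>(data_bytes);
    // Here block_size_x == 512 and shared_data_bytes == 2048; the kernel is launched as one
    // block with that much dynamic shared memory through cuLaunchKernel.
    std::printf("block_size_x=%u shared_data_bytes=%u\n", block_size_x, shared_data_bytes);
    return 0;
}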
size_t runtime::gpu::CUDAEmitter::build_reduce_to_scalar_acc(const std::vector<std::string>& dtypes,
NVShape input_shape,
NVShape output_shape,
uint32_t block_size_x,
const char* op,
const char* kernel)
{
// assumes NC{d1,...,dn} format
std::string kernel_name = "reduce_acc_" + join(dtypes, "_");
std::replace(kernel_name.begin(), kernel_name.end(), ' ', '_');
std::stringstream ss;
ss << kernel_name << "_s_" << join(input_shape, "_");
auto hash = ss.str();
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
uint32_t nthreads = static_cast<uint32_t>(shape_size(input_shape));
auto args = m_primitive_emitter->add_kernel_args();
args.add_placeholder(dtypes[0], "in")
.add_placeholder(dtypes[1], "out")
.add("nthreads", nthreads);
uint32_t aligned_grid_size_x = static_cast<uint32_t>(shape_size(output_shape)) / block_size_x;
auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name);
// if the kernel has not been compiled, build it
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
CudaKernelBuilder::add_pod_typedefs(writer);
writer << include_helpers();
if (kernel)
{
CudaKernelBuilder::get_device_helper(
writer, op, kernel, {{dtypes[0], dtypes[0], dtypes[1]}});
}
runtime::gpu::CudaKernelBuilder::get_reduce_to_scalar_acc_op(
writer, kernel_name, args, dtypes, op);
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
}
std::unique_ptr<gpu::primitive> reduce_acc(
new gpu::primitive{[=](void** inputs, void** outputs) mutable {
void** args_list = args.resolve_placeholder(0, &inputs[0])
.resolve_placeholder(1, &outputs[0])
.get_argument_list();
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
aligned_grid_size_x,
1,
1,
block_size_x,
1,
1,
0,
NULL,
args_list,
0));
}});
primitive_index = this->m_primitive_emitter->insert(std::move(reduce_acc));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
}
size_t runtime::gpu::CUDAEmitter::build_reduce(const std::vector<std::string>& dtypes,
const size_t data_bytes,
NVShape input_shape,
NVShape reduce_axis,
const char* op,
const char* kernel)
{
size_t rank = input_shape.size();
size_t reduce_rank = reduce_axis.size();
size_t out_rank = rank - reduce_rank;
// assumes NC{d1,...,dn} format
std::string kernel_name = "reduce_" + join(dtypes, "_");
if (out_rank != 0)
{
kernel_name += "_ri_" + std::to_string(input_shape.size()) + "_rr_" +
std::to_string(reduce_axis.size());
}
std::replace(kernel_name.begin(), kernel_name.end(), ' ', '_');
std::stringstream ss;
ss << kernel_name << "_s_" << join(input_shape, "_") << "_axis_" << join(reduce_axis, "_");
auto hash = ss.str();
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
int num_SMs;
CUDA_RT_SAFE_CALL(cudaDeviceGetAttribute(&num_SMs, cudaDevAttrMultiProcessorCount, 0));
uint32_t block_size_x_acc = 256;
uint32_t nthreads_acc = num_SMs * block_size_x_acc;
//call reduce_to_nd
if (out_rank != 0)
{
size_t reduce_idx = build_reduce_to_nd(dtypes, input_shape, reduce_axis, op, kernel);
std::unique_ptr<gpu::primitive> reduce(
new gpu::primitive{[=](void** inputs, void** outputs) mutable {
gpu::invoke_primitive(m_ctx,
reduce_idx,
std::vector<void*>{inputs[0]}.data(),
std::vector<void*>{outputs[0]}.data());
}});
primitive_index = this->m_primitive_emitter->insert(std::move(reduce));
}
else
{
uint32_t nthreads = static_cast<uint32_t>(shape_size(input_shape));
//if the data size is large, call reduce_to_scalar_acc first and then reduce_to_scalar;
//otherwise, call reduce_to_scalar directly.
const uint32_t unroll_size = 8;
if (nthreads > nthreads_acc * (unroll_size + 1))
{
NVShape acc_output_shape{nthreads_acc};
size_t reduce_scalar_acc_idx = build_reduce_to_scalar_acc(
dtypes, input_shape, acc_output_shape, block_size_x_acc, op, kernel);
size_t reduce_scalar_idx =
build_reduce_to_scalar(dtypes, data_bytes, acc_output_shape, op, kernel);
// get an allocator for transient per kernel gpu memory
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
size_t idx_workspace = allocator.reserve_workspace(nthreads_acc * data_bytes);
std::unique_ptr<gpu::primitive> reduce_scalar_acc(
new gpu::primitive{[=](void** inputs, void** outputs) mutable {
void* buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
gpu::invoke_primitive(m_ctx,
reduce_scalar_acc_idx,
std::vector<void*>{inputs[0]}.data(),
std::vector<void*>{buffer}.data());
gpu::invoke_primitive(m_ctx,
reduce_scalar_idx,
std::vector<void*>{buffer}.data(),
std::vector<void*>{outputs[0]}.data());
}});
primitive_index = this->m_primitive_emitter->insert(std::move(reduce_scalar_acc));
}
else
{
size_t reduce_scalar_idx =
build_reduce_to_scalar(dtypes, data_bytes, input_shape, op, kernel);
std::unique_ptr<gpu::primitive> reduce_scalar(
new gpu::primitive{[=](void** inputs, void** outputs) mutable {
gpu::invoke_primitive(m_ctx,
reduce_scalar_idx,
std::vector<void*>{inputs[0]}.data(),
std::vector<void*>{outputs[0]}.data());
}});
primitive_index = this->m_primitive_emitter->insert(std::move(reduce_scalar));
}
}
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
}
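
Editor's note: as a rough worked example of the dispatch above (not part of the commit, and assuming a hypothetical 80-SM device):

#include <cstdint>
#include <cstdio>

int main()
{
    int num_SMs = 80;                                       // hypothetical device
    uint32_t block_size_x_acc = 256;
    uint32_t nthreads_acc = num_SMs * block_size_x_acc;     // 20480 partial results
    uint32_t unroll_size = 8;
    uint32_t threshold = nthreads_acc * (unroll_size + 1);  // 184320 elements
    // The sum_large_1d_to_scalar test added below reduces 1,000,000 elements, which is above
    // this threshold, so it takes the two-kernel path: 80 blocks x 256 threads accumulate into
    // a 20480-element workspace (aligned_grid_size_x = 20480 / 256 = 80), and the single-block
    // kernel then reduces that workspace to one value. Smaller inputs go straight to
    // reduce_to_scalar.
    std::printf("nthreads_acc=%u threshold=%u\n", nthreads_acc, threshold);
    return 0;
}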
......
@@ -117,11 +117,13 @@ namespace ngraph
template <typename T>
size_t build_reduce(const std::vector<std::string>& dtypes,
NVShape tensor_shape,
const size_t data_bytes,
NVShape input_shape,
NVShape reduce_axis)
{
return build_reduce(dtypes,
tensor_shape,
data_bytes,
input_shape,
reduce_axis,
CudaOpMap<T>::op,
CudaOpMap<T>::math_kernel);
@@ -194,10 +196,31 @@ namespace ngraph
const char* reduce_op,
bool save_elementwise);
size_t build_reduce(const std::vector<std::string>& dtypes,
NVShape tensor_shape,
const size_t data_bytes,
NVShape input_shape,
NVShape reduce_axis,
const char* op,
const char* kernel);
size_t build_reduce_to_nd(const std::vector<std::string>& dtypes,
NVShape input_shape,
NVShape reduce_axis,
const char* op,
const char* kernel);
size_t build_reduce_to_scalar(const std::vector<std::string>& dtypes,
const size_t data_bytes,
NVShape input_shape,
const char* op,
const char* kernel);
//This is the preprocessing step for reduce-to-scalar when the data size is larger than a threshold.
//The threshold can be tuned based on the hardware.
//This cuda kernel accumulates the reduction into a number of bins that depends on the hardware.
size_t build_reduce_to_scalar_acc(const std::vector<std::string>& dtypes,
NVShape input_shape,
NVShape output_shape,
uint32_t block_size_x,
const char* op,
const char* kernel);
GPUPrimitiveEmitter* m_primitive_emitter;
GPURuntimeContext* m_ctx;
};
......
@@ -182,7 +182,8 @@ void runtime::gpu::CudaKernelBuilder::get_ew_collective_op(
}
//each thread calculates the entire reduction for one output element
void runtime::gpu::CudaKernelBuilder::get_reduce_op(codegen::CodeWriter& writer,
void runtime::gpu::CudaKernelBuilder::get_reduce_to_nd_op(
codegen::CodeWriter& writer,
const std::string& name,
runtime::gpu::GPUKernelArgs& args,
const std::vector<std::string>& data_types,
@@ -227,18 +228,20 @@ void runtime::gpu::CudaKernelBuilder::get_reduce_op(codegen::CodeWriter& writer,
writer << "int idx" << last_r_idx << " = 0;\n";
writer << "uint32_t step = reduce_strides" << last_r_idx << ";\n";
// unroll last reduction axis
writer << "for(; idx" << last_r_idx << " < (reduce_shape" << last_r_idx
<< " >> 3); idx" << last_r_idx << "++)\n";
uint32_t unroll_num = 8;
uint32_t unroll_shift = 3;
writer << "for(; idx" << last_r_idx << " < (reduce_shape" << last_r_idx << " >> "
<< unroll_shift << "); idx" << last_r_idx << "++)\n";
writer.block_begin();
{
for (int k = 0; k < 8; k++)
for (int k = 0; k < unroll_num; k++)
{
writer << "r = " << reduce_op << "(r , in[reduce_idx]);\n";
writer << "reduce_idx += step;\n";
}
}
writer.block_end();
writer << "idx" << last_r_idx << " <<= 3;\n";
writer << "idx" << last_r_idx << " <<= " << unroll_shift << ";\n";
writer << "for(; idx" << last_r_idx << " < reduce_shape" << last_r_idx << "; idx"
<< last_r_idx << "++)\n";
writer.block_begin();
@@ -260,6 +263,144 @@ void runtime::gpu::CudaKernelBuilder::get_reduce_op(codegen::CodeWriter& writer,
return;
}
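
Editor's note: for readers following the unrolling change in the hunk above, below is a small self-contained CUDA kernel in the same spirit as the code get_reduce_to_nd_op emits. Each thread reduces one output element over the last axis, processing eight inputs per iteration and finishing with a remainder loop. The kernel name, the fixed sum op, and the contiguous-last-axis assumption are illustrative only, not the emitter's actual output.

#include <cstdint>

extern "C" __global__ void reduce_last_axis_sum(const float* in,
                                                float* out,
                                                uint32_t nthreads,     // number of output elements
                                                uint32_t reduce_shape) // length of the reduced axis
{
    uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= nthreads)
    {
        return;
    }
    const float* row = in + static_cast<size_t>(tid) * reduce_shape;
    float r = 0.0f;
    uint32_t idx = 0;
    // main loop, unrolled by 8 (the emitter writes the eight accumulations out explicitly)
    for (; idx < (reduce_shape >> 3); idx++)
    {
#pragma unroll
        for (int k = 0; k < 8; k++)
        {
            r += row[idx * 8 + k];
        }
    }
    idx <<= 3;
    // remainder loop for the trailing (reduce_shape % 8) elements
    for (; idx < reduce_shape; idx++)
    {
        r += row[idx];
    }
    out[tid] = r;
}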
void runtime::gpu::CudaKernelBuilder::get_reduce_to_scalar_op(
codegen::CodeWriter& writer,
const std::string& name,
runtime::gpu::GPUKernelArgs& args,
const std::vector<std::string>& data_types,
const std::string& reduce_op,
uint32_t block_size_x)
{
writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature();
writer.block_begin();
{
writer << "extern __shared__ " << data_types[1] << " sdata[];\n";
writer << "uint32_t tid = threadIdx.x; \n";
writer << "uint32_t step = blockDim.x; \n";
writer << "sdata[tid] = 0;\n";
writer << "uint32_t in_idx = tid;\n";
writer << data_types[1] << " r = 0;\n";
writer << "if(in_idx < nthreads)\n";
writer.block_begin();
writer << "r = in[in_idx];\n";
writer << "in_idx += step;\n";
writer.block_end();
//accumulate the reduction into blockDim.x partial results, one per thread
uint32_t unroll_num = 8;
writer << "while(in_idx + (step * " << unroll_num - 1 << ") < nthreads)\n";
writer.block_begin();
{
for (int i = 0; i < unroll_num; i++)
{
writer << "r = " << reduce_op << "(r , in[in_idx]);\n";
writer << "in_idx += step;\n";
}
}
writer.block_end();
writer << "while(in_idx < nthreads)\n";
writer.block_begin();
{
writer << "r = " << reduce_op << "(r , in[in_idx]);\n";
writer << "in_idx += step;\n";
}
writer.block_end();
//reduce across the 32 lanes of each warp with shuffles
for (int i = 16; i >= 1; i >>= 1)
{
if (block_size_x > i)
{
writer << "r = " << reduce_op << "(r, __shfl_down_sync(0xffffffff, r, " << i
<< ", 32));\n";
}
}
if (block_size_x > 32)
{
writer << "uint32_t lane_idx = tid & 0x1f; \n";
writer << "uint32_t warp_idx = tid >> 5; \n";
writer << "if(lane_idx == 0)\n";
writer.block_begin();
{
writer << "sdata[warp_idx] = r;\n";
}
writer.block_end();
writer << "__syncthreads();\n";
uint32_t warp_size = block_size_x >> 5;
writer << "if(tid < " << warp_size << ")\n";
writer.block_begin();
{
writer << "r = sdata[tid];\n";
}
writer.block_end();
//reduce the per-warp partial results within the first warp
for (int i = 16; i >= 1; i >>= 1)
{
if (warp_size > i)
{
writer << "r = " << reduce_op << "(r, __shfl_down_sync(0xffffffff, r, " << i
<< ", 32));\n";
}
}
}
writer << "if(tid == 0)\n";
writer.block_begin();
{
writer << "out[0] = r;\n";
}
writer.block_end();
}
writer.block_end();
return;
}
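
Editor's note: to see what get_reduce_to_scalar_op produces, here is a hedged reconstruction of the kernel it would emit for a float sum with block_size_x = 512. The argument signature, the `add` device helper, and the formatting are assumptions based on the builder code above; the generated source spells out the eight unrolled accumulations rather than using an inner loop.

#include <cstdint>

__device__ __forceinline__ float add(float a, float b) { return a + b; } // device helper (name assumed)

extern "C" __global__ void
    cuda_reduce_scalar_float_float_b_512(float* in, float* out, uint32_t nthreads)
{
    extern __shared__ float sdata[];
    uint32_t tid = threadIdx.x;
    uint32_t step = blockDim.x;
    sdata[tid] = 0;
    uint32_t in_idx = tid;
    float r = 0;
    if (in_idx < nthreads)
    {
        r = in[in_idx];
        in_idx += step;
    }
    // block-stride accumulation over the input, unrolled by 8
    while (in_idx + (step * 7) < nthreads)
    {
        for (int k = 0; k < 8; k++)
        {
            r = add(r, in[in_idx]);
            in_idx += step;
        }
    }
    while (in_idx < nthreads)
    {
        r = add(r, in[in_idx]);
        in_idx += step;
    }
    // reduce within each warp using shuffles (offsets 16, 8, 4, 2, 1)
    for (int i = 16; i >= 1; i >>= 1)
    {
        r = add(r, __shfl_down_sync(0xffffffff, r, i, 32));
    }
    // lane 0 of each of the 16 warps publishes its partial result to shared memory
    uint32_t lane_idx = tid & 0x1f;
    uint32_t warp_idx = tid >> 5;
    if (lane_idx == 0)
    {
        sdata[warp_idx] = r;
    }
    __syncthreads();
    if (tid < 16)
    {
        r = sdata[tid];
    }
    // final reduction of the 16 per-warp partials (offsets 8, 4, 2, 1)
    for (int i = 8; i >= 1; i >>= 1)
    {
        r = add(r, __shfl_down_sync(0xffffffff, r, i, 32));
    }
    if (tid == 0)
    {
        out[0] = r;
    }
}

This kernel would be launched as a single block of 512 threads with 512 * sizeof(float) bytes of dynamic shared memory, matching the shared_data_bytes passed to cuLaunchKernel in build_reduce_to_scalar.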
void runtime::gpu::CudaKernelBuilder::get_reduce_to_scalar_acc_op(
codegen::CodeWriter& writer,
const std::string& name,
runtime::gpu::GPUKernelArgs& args,
const std::vector<std::string>& data_types,
const std::string& reduce_op)
{
writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature();
writer.block_begin();
{
writer << "uint32_t tid = blockDim.x*blockIdx.x + threadIdx.x;\n";
writer << "uint32_t step = gridDim.x * blockDim.x; \n";
writer << "uint32_t in_idx = tid;\n";
writer << data_types[1] << " r = 0;\n";
writer << "if(in_idx < nthreads)\n";
writer.block_begin();
writer << "r = in[in_idx];\n";
writer << "in_idx += step;\n";
writer.block_end();
//accumulate the reduction into one partial result per thread (gridDim.x * blockDim.x in total)
uint32_t unroll_num = 8;
writer << "while(in_idx + (step * " << unroll_num - 1 << ") < nthreads)\n";
writer.block_begin();
{
for (int i = 0; i < unroll_num; i++)
{
writer << "r = " << reduce_op << "(r , in[in_idx]);\n";
writer << "in_idx += step;\n";
}
}
writer.block_end();
writer << "while(in_idx < nthreads)\n";
writer.block_begin();
{
writer << "r = " << reduce_op << "(r , in[in_idx]);\n";
writer << "in_idx += step;\n";
}
writer.block_end();
writer << "out[tid] = r;\n";
}
writer.block_end();
return;
}
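
Editor's note: likewise, a hedged reconstruction of the grid-stride accumulation kernel get_reduce_to_scalar_acc_op would emit for a float sum (name and signature assumed; the emitted source unrolls the eight accumulations explicitly):

#include <cstdint>

extern "C" __global__ void
    cuda_reduce_acc_float_float(float* in, float* out, uint32_t nthreads)
{
    uint32_t tid = blockDim.x * blockIdx.x + threadIdx.x;
    uint32_t step = gridDim.x * blockDim.x;
    uint32_t in_idx = tid;
    float r = 0;
    if (in_idx < nthreads)
    {
        r = in[in_idx];
        in_idx += step;
    }
    // grid-stride accumulation, unrolled by 8
    while (in_idx + (step * 7) < nthreads)
    {
        for (int k = 0; k < 8; k++)
        {
            r = r + in[in_idx];
            in_idx += step;
        }
    }
    while (in_idx < nthreads)
    {
        r = r + in[in_idx];
        in_idx += step;
    }
    out[tid] = r; // one partial result per thread; the single-block kernel reduces these
}

With the launch configuration chosen in build_reduce (num_SMs blocks of 256 threads), this writes num_SMs * 256 partial results into the transient workspace.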
void runtime::gpu::CudaKernelBuilder::get_broadcast_op(codegen::CodeWriter& writer,
const std::string& name,
runtime::gpu::GPUKernelArgs& args,
......
@@ -59,7 +59,7 @@ namespace ngraph
const std::array<std::string, 2>& data_types,
size_t rank);
static void get_reduce_op(codegen::CodeWriter& writer,
static void get_reduce_to_nd_op(codegen::CodeWriter& writer,
const std::string& name,
runtime::gpu::GPUKernelArgs& args,
const std::vector<std::string>& data_types,
@@ -67,6 +67,23 @@
size_t out_rank,
size_t reduce_rank);
//Uses a single block of at most 512 threads to reduce to a scalar.
static void get_reduce_to_scalar_op(codegen::CodeWriter& writer,
const std::string& name,
runtime::gpu::GPUKernelArgs& args,
const std::vector<std::string>& data_types,
const std::string& reduce_op,
uint32_t block_size_x);
//This is the preprocessing step for reduce-to-scalar when the data size is larger than a threshold.
//The threshold can be tuned based on the hardware.
//This cuda kernel accumulates the reduction into a number of bins that depends on the hardware.
static void get_reduce_to_scalar_acc_op(codegen::CodeWriter& writer,
const std::string& name,
runtime::gpu::GPUKernelArgs& args,
const std::vector<std::string>& data_types,
const std::string& reduce_op);
static void get_slice_op(codegen::CodeWriter& writer,
const std::string& name,
const std::array<std::string, 2>& data_types,
......
@@ -851,21 +851,6 @@ namespace ngraph
{
kernel::emit_memcpyDtD(writer, out[0], args[0]);
}
else if (out[0].get_shape().size() == 0)
{
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
auto sum_index =
cudnn_emitter->build_reduce_forward(CUDNN_REDUCE_TENSOR_ADD,
out[0].get_type(),
args[0].get_shape(),
sum->get_reduction_axes());
writer << "gpu::invoke_primitive(ctx, " << sum_index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
writer << ");\n";
}
else
{
auto axes_set = sum->get_reduction_axes();
@@ -880,7 +865,10 @@ namespace ngraph
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
auto sum_index = cuda_emitter->build_reduce<ngraph::op::Add>(
dtypes, args[0].get_shape(), axes_vec);
dtypes,
out[0].get_element_type().size(),
args[0].get_shape(),
axes_vec);
writer << "gpu::invoke_primitive(ctx, " << sum_index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
......
@@ -18,8 +18,8 @@
#include <cinttypes>
#include <cmath>
#include <cstdlib>
#include <random>
#include <string>
#include "gtest/gtest.h"
#include "ngraph/autodiff/adjoints.hpp"
@@ -37,6 +37,8 @@
#include "util/test_control.hpp"
#include "util/test_tools.hpp"
static std::mt19937_64 random_generator;
using namespace std;
using namespace ngraph;
@@ -3502,6 +3504,33 @@ NGRAPH_TEST(${BACKEND_NAME}, sum_to_scalar)
EXPECT_EQ((vector<float>{1, 2, 3, 4}), read_vector<float>(a));
}
NGRAPH_TEST(${BACKEND_NAME}, sum_large_1d_to_scalar)
{
Shape shape{1000000};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Sum>(A, AxisSet{0}), op::ParameterVector{A});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
random_generator.seed(2);
vector<float> v_a(1000000, 0);
double r = 0;
for (int i = 0; i < 1000000; i++)
{
v_a[i] = static_cast<float>(random_generator() % 255);
r += static_cast<double>(v_a[i]);
}
auto a = backend->create_tensor(element::f32, shape);
copy_data(a, v_a);
auto result = backend->create_tensor(element::f32, Shape{});
backend->call_with_validate(f, {result}, {a});
EXPECT_TRUE(
test::all_close_f(vector<float>{static_cast<float>(r)}, read_vector<float>(result)));
}
NGRAPH_TEST(${BACKEND_NAME}, sum_matrix_columns)
{
Shape shape_a{3, 2};
......