Commit 5f40d957 authored by Fenglei, committed by Scott Cyphers

nvgpu reduce to scalar optimization (#1491)

* add cuda reduce

* clang format

* fix bugs

* fix bug

* add 1d reduce

* clang format

* fix bugs

* unroll loop

* remove debug info

* revert tests

* unroll 1D reduce op

* add comments

* using cudnn for nd to scalar reduction

* remove cuda 1d reduction since cudnn version is faster

* remove 1D kernel

* fix bugs

* 1d multi block size

* remove debug

* change kernel name

* add reduce to scalar optimization, add test

* fix bugs and tune parameters

* clang format

* update comments

* update comments

* update comments

* clang format

* update comments

* remove wrong comments, apply clang format

* resolve Bob's comment

* clang format

* pass shared mem size from cuLaunchKernel, set unroll loop size through host code

* remove unused code, clang format

* change reduce to thread with shfl for each warp first

* add seed

* unroll size
parent 8fdefa52
@@ -117,11 +117,13 @@ namespace ngraph
template <typename T>
size_t build_reduce(const std::vector<std::string>& dtypes,
NVShape tensor_shape,
const size_t data_bytes,
NVShape input_shape,
NVShape reduce_axis)
{
return build_reduce(dtypes,
tensor_shape,
data_bytes,
input_shape,
reduce_axis,
CudaOpMap<T>::op,
CudaOpMap<T>::math_kernel);
@@ -194,10 +196,31 @@ namespace ngraph
const char* reduce_op,
bool save_elementwise);
size_t build_reduce(const std::vector<std::string>& dtypes,
NVShape tensor_shape,
const size_t data_bytes,
NVShape input_shape,
NVShape reduce_axis,
const char* op,
const char* kernel);
size_t build_reduce_to_nd(const std::vector<std::string>& dtypes,
NVShape input_shape,
NVShape reduce_axis,
const char* op,
const char* kernel);
size_t build_reduce_to_scalar(const std::vector<std::string>& dtypes,
const size_t data_bytes,
NVShape input_shape,
const char* op,
const char* kernel);
//This is the preprocess step for reduce to scalar when the input size is larger than a threshold.
//The threshold can be tuned based on the hardware.
//This cuda kernel accumulates the reduction into a number of bins that depends on the hardware.
size_t build_reduce_to_scalar_acc(const std::vector<std::string>& dtypes,
NVShape input_shape,
NVShape output_shape,
uint32_t block_size_x,
const char* op,
const char* kernel);
GPUPrimitiveEmitter* m_primitive_emitter;
GPURuntimeContext* m_ctx;
};
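The comment on build_reduce_to_scalar_acc above describes a two-stage scheme: when the input is larger than a tuned threshold, a grid-wide pass first folds the input into one partial result per thread (the "bins"), and the single-block scalar kernel then reduces those partials to one value. The sketch below shows a hypothetical host-side composition of the two stages using plain CUDA runtime launches in place of the emitter's cuLaunchKernel-based primitives; the kernel names, parameter lists, 512-thread block size, grid size, and size threshold are all illustrative, and the kernel bodies correspond to the generated code reconstructed after get_reduce_to_scalar_op and get_reduce_to_scalar_acc_op further down in this diff.

#include <cstdint>
#include <cuda_runtime.h>

// Generated kernels (reconstructed later in this diff); exact signatures are assumed.
extern "C" __global__ void cuda_sum(float* in, float* out, uint32_t nthreads);
extern "C" __global__ void cuda_sum_acc(float* in, float* out, uint32_t nthreads);

// Illustrative composition of the two primitives for a float sum-to-scalar.
void sum_to_scalar(float* d_in, float* d_out, uint32_t n)
{
    const uint32_t block = 512;          // one block of at most 512 threads for the final stage
    const uint32_t grid = 64;            // bin count = grid * block (tunable per hardware)
    const uint32_t threshold = 1u << 20; // illustrative "large input" cutoff
    if (n > threshold)
    {
        float* d_bins = nullptr;
        cudaMalloc(&d_bins, grid * block * sizeof(float));
        // stage 1: each of the grid * block threads accumulates a strided slice into its bin
        cuda_sum_acc<<<grid, block>>>(d_in, d_bins, n);
        // stage 2: one block reduces the bins; the shared memory size is passed at launch time
        cuda_sum<<<1, block, block * sizeof(float)>>>(d_bins, d_out, grid * block);
        cudaFree(d_bins);
    }
    else
    {
        cuda_sum<<<1, block, block * sizeof(float)>>>(d_in, d_out, n);
    }
}

Keeping the final stage on a single block avoids any inter-block synchronization: the accumulation pass can use the whole GPU, while the scalar pass only ever sees at most grid * block partial values.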
@@ -182,7 +182,8 @@ void runtime::gpu::CudaKernelBuilder::get_ew_collective_op(
}
//each thread calculates the whole reduction for one output element
void runtime::gpu::CudaKernelBuilder::get_reduce_op(codegen::CodeWriter& writer,
void runtime::gpu::CudaKernelBuilder::get_reduce_to_nd_op(
codegen::CodeWriter& writer,
const std::string& name,
runtime::gpu::GPUKernelArgs& args,
const std::vector<std::string>& data_types,
@@ -227,18 +228,20 @@ void runtime::gpu::CudaKernelBuilder::get_reduce_op(codegen::CodeWriter& writer,
writer << "int idx" << last_r_idx << " = 0;\n";
writer << "uint32_t step = reduce_strides" << last_r_idx << ";\n";
// unroll last reduction axis
writer << "for(; idx" << last_r_idx << " < (reduce_shape" << last_r_idx
<< " >> 3); idx" << last_r_idx << "++)\n";
uint32_t unroll_num = 8;
uint32_t unroll_shift = 3;
writer << "for(; idx" << last_r_idx << " < (reduce_shape" << last_r_idx << " >> "
<< unroll_shift << "); idx" << last_r_idx << "++)\n";
writer.block_begin();
{
for (int k = 0; k < 8; k++)
for (int k = 0; k < unroll_num; k++)
{
writer << "r = " << reduce_op << "(r , in[reduce_idx]);\n";
writer << "reduce_idx += step;\n";
}
}
writer.block_end();
writer << "idx" << last_r_idx << " <<= 3;\n";
writer << "idx" << last_r_idx << " <<= " << unroll_shift << ";\n";
writer << "for(; idx" << last_r_idx << " < reduce_shape" << last_r_idx << "; idx"
<< last_r_idx << "++)\n";
writer.block_begin();
@@ -260,6 +263,144 @@ void runtime::gpu::CudaKernelBuilder::get_reduce_op(codegen::CodeWriter& writer,
return;
}
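The change above sets the unroll factor from host code: the innermost reduction axis is processed eight elements per iteration (reduce_shape >> 3 unrolled iterations), and a short remainder loop covers the last reduce_shape & 7 elements. A minimal self-contained kernel using the same unroll-then-remainder structure is sketched below; it is a hypothetical row-wise float sum over an n x m row-major matrix, and none of its names come from the emitter.

#include <cstddef>
#include <cstdint>

// One thread reduces one row, mirroring the "each thread computes one output" layout above.
extern "C" __global__ void reduce_rows_sum(const float* in, float* out, uint32_t n, uint32_t m)
{
    uint32_t row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= n)
    {
        return;
    }
    const float* p = in + static_cast<size_t>(row) * m;
    float r = 0.0f;
    uint32_t idx = 0;
    // unrolled part: (m >> 3) iterations, each consuming eight elements
    for (; idx < (m >> 3); idx++)
    {
#pragma unroll
        for (uint32_t k = 0; k < 8; k++)
        {
            r += p[idx * 8 + k];
        }
    }
    // remainder: the last (m & 7) elements
    for (idx <<= 3; idx < m; idx++)
    {
        r += p[idx];
    }
    out[row] = r;
}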
void runtime::gpu::CudaKernelBuilder::get_reduce_to_scalar_op(
codegen::CodeWriter& writer,
const std::string& name,
runtime::gpu::GPUKernelArgs& args,
const std::vector<std::string>& data_types,
const std::string& reduce_op,
uint32_t block_size_x)
{
writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature();
writer.block_begin();
{
writer << "extern __shared__ " << data_types[1] << " sdata[];\n";
writer << "uint32_t tid = threadIdx.x; \n";
writer << "uint32_t step = blockDim.x; \n";
writer << "sdata[tid] = 0;\n";
writer << "uint32_t in_idx = tid;\n";
writer << data_types[1] << " r = 0;\n";
writer << "if(in_idx < nthreads)\n";
writer.block_begin();
writer << "r = in[in_idx];\n";
writer << "in_idx += step;\n";
writer.block_end();
//accumulate the reduction into blockDim.x partial results, one per thread
uint32_t unroll_num = 8;
writer << "while(in_idx + (step * " << unroll_num - 1 << ") < nthreads)\n";
writer.block_begin();
{
for (int i = 0; i < unroll_num; i++)
{
writer << "r = " << reduce_op << "(r , in[in_idx]);\n";
writer << "in_idx += step;\n";
}
}
writer.block_end();
writer << "while(in_idx < nthreads)\n";
writer.block_begin();
{
writer << "r = " << reduce_op << "(r , in[in_idx]);\n";
writer << "in_idx += step;\n";
}
writer.block_end();
//reduce across the 32 lanes of each warp with shuffles
for (int i = 16; i >= 1; i >>= 1)
{
if (block_size_x > i)
{
writer << "r = " << reduce_op << "(r, __shfl_down_sync(0xffffffff, r, " << i
<< ", 32));\n";
}
}
if (block_size_x > 32)
{
writer << "uint32_t lane_idx = tid & 0x1f; \n";
writer << "uint32_t warp_idx = tid >> 5; \n";
writer << "if(lane_idx == 0)\n";
writer.block_begin();
{
writer << "sdata[warp_idx] = r;\n";
}
writer.block_end();
writer << "__syncthreads();\n";
uint32_t num_warps = block_size_x >> 5;
writer << "if(tid < " << num_warps << ")\n";
writer.block_begin();
{
writer << "r = sdata[tid];\n";
}
writer.block_end();
//reduce the per-warp partial results with shuffles
for (int i = 16; i >= 1; i >>= 1)
{
if (num_warps > i)
{
writer << "r = " << reduce_op << "(r, __shfl_down_sync(0xffffffff, r, " << i
<< ", 32));\n";
}
}
}
writer << "if(tid == 0)\n";
writer.block_begin();
{
writer << "out[0] = r;\n";
}
writer.block_end();
}
writer.block_end();
return;
}
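Read back from the writer calls above, the kernel this builder emits for a float sum with block_size_x = 512 (sixteen warps) comes out roughly as below. The parameter list is produced by args.get_input_signature(), which is not shown in this hunk, so the exact signature and the add() helper standing in for the mapped math kernel are assumptions.

#include <cstdint>

// Stand-in for the op-map math kernel (e.g. CudaOpMap<op::Add>::math_kernel).
__device__ __forceinline__ float add(float a, float b) { return a + b; }

// Assumed signature: input pointer, output pointer and the element count nthreads.
extern "C" __global__ void cuda_sum(float* in, float* out, uint32_t nthreads)
{
    extern __shared__ float sdata[];
    uint32_t tid = threadIdx.x;
    uint32_t step = blockDim.x;
    sdata[tid] = 0;
    uint32_t in_idx = tid;
    float r = 0;
    if (in_idx < nthreads)
    {
        r = in[in_idx];
        in_idx += step;
    }
    // strided accumulation, unrolled to eight elements per iteration
    while (in_idx + (step * 7) < nthreads)
    {
        r = add(r, in[in_idx]); in_idx += step;
        r = add(r, in[in_idx]); in_idx += step;
        r = add(r, in[in_idx]); in_idx += step;
        r = add(r, in[in_idx]); in_idx += step;
        r = add(r, in[in_idx]); in_idx += step;
        r = add(r, in[in_idx]); in_idx += step;
        r = add(r, in[in_idx]); in_idx += step;
        r = add(r, in[in_idx]); in_idx += step;
    }
    while (in_idx < nthreads)
    {
        r = add(r, in[in_idx]);
        in_idx += step;
    }
    // reduce within each warp first, using shuffles
    r = add(r, __shfl_down_sync(0xffffffff, r, 16, 32));
    r = add(r, __shfl_down_sync(0xffffffff, r, 8, 32));
    r = add(r, __shfl_down_sync(0xffffffff, r, 4, 32));
    r = add(r, __shfl_down_sync(0xffffffff, r, 2, 32));
    r = add(r, __shfl_down_sync(0xffffffff, r, 1, 32));
    // combine the sixteen per-warp results through shared memory
    uint32_t lane_idx = tid & 0x1f;
    uint32_t warp_idx = tid >> 5;
    if (lane_idx == 0)
    {
        sdata[warp_idx] = r;
    }
    __syncthreads();
    if (tid < 16)
    {
        r = sdata[tid];
    }
    r = add(r, __shfl_down_sync(0xffffffff, r, 8, 32));
    r = add(r, __shfl_down_sync(0xffffffff, r, 4, 32));
    r = add(r, __shfl_down_sync(0xffffffff, r, 2, 32));
    r = add(r, __shfl_down_sync(0xffffffff, r, 1, 32));
    if (tid == 0)
    {
        out[0] = r;
    }
}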
void runtime::gpu::CudaKernelBuilder::get_reduce_to_scalar_acc_op(
codegen::CodeWriter& writer,
const std::string& name,
runtime::gpu::GPUKernelArgs& args,
const std::vector<std::string>& data_types,
const std::string& reduce_op)
{
writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature();
writer.block_begin();
{
writer << "uint32_t tid = blockDim.x*blockIdx.x + threadIdx.x;\n";
writer << "uint32_t step = gridDim.x * blockDim.x; \n";
writer << "uint32_t in_idx = tid;\n";
writer << data_types[1] << " r = 0;\n";
writer << "if(in_idx < nthreads)\n";
writer.block_begin();
writer << "r = in[in_idx];\n";
writer << "in_idx += step;\n";
writer.block_end();
//accumulate the reduction into gridDim.x * blockDim.x partial results, one bin per thread
uint32_t unroll_num = 8;
writer << "while(in_idx + (step * " << unroll_num - 1 << ") < nthreads)\n";
writer.block_begin();
{
for (int i = 0; i < unroll_num; i++)
{
writer << "r = " << reduce_op << "(r , in[in_idx]);\n";
writer << "in_idx += step;\n";
}
}
writer.block_end();
writer << "while(in_idx < nthreads)\n";
writer.block_begin();
{
writer << "r = " << reduce_op << "(r , in[in_idx]);\n";
writer << "in_idx += step;\n";
}
writer.block_end();
writer << "out[tid] = r;\n";
}
writer.block_end();
return;
}
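For the same float-sum case, the accumulation kernel this builder emits reconstructs to roughly the following (signature again assumed; add() is the same stand-in helper as in the previous sketch). Every one of the gridDim.x * blockDim.x threads walks the input with a grid-wide stride and leaves exactly one partial result, which is the bin array the single-block kernel above then consumes.

#include <cstdint>

__device__ __forceinline__ float add(float a, float b) { return a + b; } // stand-in op, as before

extern "C" __global__ void cuda_sum_acc(float* in, float* out, uint32_t nthreads)
{
    uint32_t tid = blockDim.x * blockIdx.x + threadIdx.x;
    uint32_t step = gridDim.x * blockDim.x;
    uint32_t in_idx = tid;
    float r = 0;
    if (in_idx < nthreads)
    {
        r = in[in_idx];
        in_idx += step;
    }
    // grid-stride accumulation, unrolled to eight elements per iteration
    while (in_idx + (step * 7) < nthreads)
    {
        r = add(r, in[in_idx]); in_idx += step;
        r = add(r, in[in_idx]); in_idx += step;
        r = add(r, in[in_idx]); in_idx += step;
        r = add(r, in[in_idx]); in_idx += step;
        r = add(r, in[in_idx]); in_idx += step;
        r = add(r, in[in_idx]); in_idx += step;
        r = add(r, in[in_idx]); in_idx += step;
        r = add(r, in[in_idx]); in_idx += step;
    }
    while (in_idx < nthreads)
    {
        r = add(r, in[in_idx]);
        in_idx += step;
    }
    // one partial result ("bin") per thread
    out[tid] = r;
}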
void runtime::gpu::CudaKernelBuilder::get_broadcast_op(codegen::CodeWriter& writer,
const std::string& name,
runtime::gpu::GPUKernelArgs& args,
@@ -59,7 +59,7 @@ namespace ngraph
const std::array<std::string, 2>& data_types,
size_t rank);
static void get_reduce_op(codegen::CodeWriter& writer,
static void get_reduce_to_nd_op(codegen::CodeWriter& writer,
const std::string& name,
runtime::gpu::GPUKernelArgs& args,
const std::vector<std::string>& data_types,
@@ -67,6 +67,23 @@ namespace ngraph
size_t out_rank,
size_t reduce_rank);
//use one block with at most 512 threads to reduce to a scalar.
static void get_reduce_to_scalar_op(codegen::CodeWriter& writer,
const std::string& name,
runtime::gpu::GPUKernelArgs& args,
const std::vector<std::string>& data_types,
const std::string& reduce_op,
uint32_t block_size_x);
//This is the preprocess step for reduce to scalar when the input size is larger than a threshold.
//The threshold can be tuned based on the hardware.
//This cuda kernel accumulates the reduction into a number of bins that depends on the hardware.
static void get_reduce_to_scalar_acc_op(codegen::CodeWriter& writer,
const std::string& name,
runtime::gpu::GPUKernelArgs& args,
const std::vector<std::string>& data_types,
const std::string& reduce_op);
static void get_slice_op(codegen::CodeWriter& writer,
const std::string& name,
const std::array<std::string, 2>& data_types,
@@ -851,21 +851,6 @@ namespace ngraph
{
kernel::emit_memcpyDtD(writer, out[0], args[0]);
}
else if (out[0].get_shape().size() == 0)
{
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
auto sum_index =
cudnn_emitter->build_reduce_forward(CUDNN_REDUCE_TENSOR_ADD,
out[0].get_type(),
args[0].get_shape(),
sum->get_reduction_axes());
writer << "gpu::invoke_primitive(ctx, " << sum_index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
writer << ");\n";
}
else
{
auto axes_set = sum->get_reduction_axes();
@@ -880,7 +865,10 @@ namespace ngraph
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
auto sum_index = cuda_emitter->build_reduce<ngraph::op::Add>(
dtypes, args[0].get_shape(), axes_vec);
dtypes,
out[0].get_element_type().size(),
args[0].get_shape(),
axes_vec);
writer << "gpu::invoke_primitive(ctx, " << sum_index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
@@ -18,8 +18,8 @@
#include <cinttypes>
#include <cmath>
#include <cstdlib>
#include <random>
#include <string>
#include "gtest/gtest.h"
#include "ngraph/autodiff/adjoints.hpp"
@@ -37,6 +37,8 @@
#include "util/test_control.hpp"
#include "util/test_tools.hpp"
static std::mt19937_64 random_generator;
using namespace std;
using namespace ngraph;
@@ -3502,6 +3504,33 @@ NGRAPH_TEST(${BACKEND_NAME}, sum_to_scalar)
EXPECT_EQ((vector<float>{1, 2, 3, 4}), read_vector<float>(a));
}
NGRAPH_TEST(${BACKEND_NAME}, sum_large_1d_to_scalar)
{
Shape shape{1000000};
auto A = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Sum>(A, AxisSet{0}), op::ParameterVector{A});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
random_generator.seed(2);
vector<float> v_a(1000000, 0);
double r = 0;
for (int i = 0; i < 1000000; i++)
{
v_a[i] = static_cast<float>(random_generator() % 255);
r += static_cast<double>(v_a[i]);
}
auto a = backend->create_tensor(element::f32, shape);
copy_data(a, v_a);
auto result = backend->create_tensor(element::f32, Shape{});
backend->call_with_validate(f, {result}, {a});
EXPECT_TRUE(
test::all_close_f(vector<float>{static_cast<float>(r)}, read_vector<float>(result)));
}
NGRAPH_TEST(${BACKEND_NAME}, sum_matrix_columns)
{
Shape shape_a{3, 2};