Commit 5f40d957 authored by Fenglei, committed by Scott Cyphers

nvgpu reduce to scalar optimization (#1491)

* add cuda reduce

* clang format

* fix bugs

* fix bug

* add 1d reduce

* clang format

* fix bugs

* unroll loop

* remove debug info

* revert tests

* unroll 1D reduce op

* add comments

* using cudnn for nd to scalar reduction

* remove cuda 1d reduction since cudnn version is faster

* remove 1D kernel

* fix bugs

* 1d multi block size

* remove debug

* change kernel name

* add reduce to scalar optimization, add test

* fix bugs and tune parameters

* clang format

* update comments

* update comments

* update comments

* clang format

* update comments

* remove wrong comments, apply clang format

* resolve Bob's comment

* clang format

* pass shared mem size from cuLaunchKernel, set unroll loop size through host code

* remove unused code, clang format

* change reduce to thread with shfl for each warp first

* add seed

* unroll size
parent 8fdefa52
...@@ -1323,16 +1323,19 @@ size_t runtime::gpu::CUDAEmitter::build_softmax_divide(const std::vector<std::st ...@@ -1323,16 +1323,19 @@ size_t runtime::gpu::CUDAEmitter::build_softmax_divide(const std::vector<std::st
return primitive_index; return primitive_index;
} }
size_t runtime::gpu::CUDAEmitter::build_reduce(const std::vector<std::string>& dtypes, size_t runtime::gpu::CUDAEmitter::build_reduce_to_nd(const std::vector<std::string>& dtypes,
NVShape input_shape, NVShape input_shape,
NVShape reduce_axis, NVShape reduce_axis,
const char* op, const char* op,
const char* kernel) const char* kernel)
{ {
size_t rank = input_shape.size();
size_t reduce_rank = reduce_axis.size();
size_t out_rank = rank - reduce_rank;
// assumes NC{d1,...,dn} format // assumes NC{d1,...,dn} format
std::string kernel_name = "reduce_" + join(dtypes, "_") + "_ri_" + std::string kernel_name = "reduce_nd_" + join(dtypes, "_");
std::to_string(input_shape.size()) + "_rr_" + kernel_name +=
std::to_string(reduce_axis.size()); "_ri_" + std::to_string(input_shape.size()) + "_rr_" + std::to_string(reduce_axis.size());
std::replace(kernel_name.begin(), kernel_name.end(), ' ', '_'); std::replace(kernel_name.begin(), kernel_name.end(), ' ', '_');
std::stringstream ss; std::stringstream ss;
...@@ -1345,9 +1348,6 @@ size_t runtime::gpu::CUDAEmitter::build_reduce(const std::vector<std::string>& d ...@@ -1345,9 +1348,6 @@ size_t runtime::gpu::CUDAEmitter::build_reduce(const std::vector<std::string>& d
return primitive_index; return primitive_index;
} }
size_t rank = input_shape.size();
size_t reduce_rank = reduce_axis.size();
size_t out_rank = rank - reduce_rank;
NVShape reduce_flag(rank, 0); NVShape reduce_flag(rank, 0);
for (auto a : reduce_axis) for (auto a : reduce_axis)
{ {
...@@ -1372,7 +1372,6 @@ size_t runtime::gpu::CUDAEmitter::build_reduce(const std::vector<std::string>& d ...@@ -1372,7 +1372,6 @@ size_t runtime::gpu::CUDAEmitter::build_reduce(const std::vector<std::string>& d
} }
} }
NVShape output_strides = row_major_strides(output_shape); NVShape output_strides = row_major_strides(output_shape);
uint32_t nthreads = static_cast<uint32_t>(shape_size(output_shape)); uint32_t nthreads = static_cast<uint32_t>(shape_size(output_shape));
// TODO: currently we set it to 64, will add tuning method later // TODO: currently we set it to 64, will add tuning method later
uint32_t block_size_x = 64; uint32_t block_size_x = 64;
...@@ -1398,7 +1397,7 @@ size_t runtime::gpu::CUDAEmitter::build_reduce(const std::vector<std::string>& d ...@@ -1398,7 +1397,7 @@ size_t runtime::gpu::CUDAEmitter::build_reduce(const std::vector<std::string>& d
CudaKernelBuilder::get_device_helper( CudaKernelBuilder::get_device_helper(
writer, op, kernel, {{dtypes[0], dtypes[0], dtypes[1]}}); writer, op, kernel, {{dtypes[0], dtypes[0], dtypes[1]}});
} }
runtime::gpu::CudaKernelBuilder::get_reduce_op( runtime::gpu::CudaKernelBuilder::get_reduce_to_nd_op(
writer, kernel_name, args, dtypes, op, out_rank, reduce_rank); writer, kernel_name, args, dtypes, op, out_rank, reduce_rank);
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code()); compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
} }
...@@ -1422,8 +1421,248 @@ size_t runtime::gpu::CUDAEmitter::build_reduce(const std::vector<std::string>& d ...@@ -1422,8 +1421,248 @@ size_t runtime::gpu::CUDAEmitter::build_reduce(const std::vector<std::string>& d
0)); 0));
debug_sync(); debug_sync();
}}); }});
primitive_index = this->m_primitive_emitter->insert(std::move(reduce));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
}
// Builds (or returns the cached index of) a primitive that reduces every element of
// `input_shape` to one scalar using a single CUDA block.
//
// dtypes      - dtypes[0] is the input element type, dtypes[1] the output element type.
// data_bytes  - size in bytes of one element; used to size the dynamic shared memory.
// input_shape - shape of the tensor to reduce; only its element count is used.
// op          - name of the binary reduction function called by the generated kernel.
// kernel      - optional math kernel wrapped by `op`; when null no device helper is emitted.
//
// Returns the index of the primitive inside the primitive emitter.
size_t runtime::gpu::CUDAEmitter::build_reduce_to_scalar(const std::vector<std::string>& dtypes,
                                                         const size_t data_bytes,
                                                         NVShape input_shape,
                                                         const char* op,
                                                         const char* kernel)
{
    // The reduction op is part of the name: kernels generated for different ops have
    // different bodies, so they must not share a compiled-kernel or primitive cache entry.
    std::string kernel_name = "reduce_scalar_" + std::string(op) + "_" + join(dtypes, "_");
    std::replace(kernel_name.begin(), kernel_name.end(), ' ', '_');

    std::stringstream ss;
    ss << kernel_name << "_s_" << join(input_shape, "_");
    auto hash = ss.str();

    // check if the requested kernel is already an inserted primitive
    size_t primitive_index = m_primitive_emitter->lookup(hash);
    if (primitive_index != std::numeric_limits<size_t>::max())
    {
        return primitive_index;
    }

    uint32_t nthreads = static_cast<uint32_t>(shape_size(input_shape));

    // Block size is the largest power of two not exceeding the element count, capped at 512.
    const uint32_t max_block_size_x = 512;
    uint32_t block_size_x = 1;
    for (uint32_t n = nthreads; n > 1; n >>= 1)
    {
        block_size_x <<= 1;
    }
    block_size_x = std::min<uint32_t>(max_block_size_x, block_size_x);

    uint32_t shared_data_bytes = block_size_x * static_cast<uint32_t>(data_bytes);
    // The generated code is specialised on the block size, so it is part of the kernel name.
    kernel_name += "_b_" + std::to_string(block_size_x);

    auto args = m_primitive_emitter->add_kernel_args();
    args.add_placeholder(dtypes[0], "in")
        .add_placeholder(dtypes[1], "out")
        .add("nthreads", nthreads);

    // if the kernel has not been compiled, build it
    auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name);
    if (compiled_kernel == nullptr)
    {
        codegen::CodeWriter writer;
        CudaKernelBuilder::add_pod_typedefs(writer);
        writer << include_helpers();
        if (kernel)
        {
            CudaKernelBuilder::get_device_helper(
                writer, op, kernel, {{dtypes[0], dtypes[0], dtypes[1]}});
        }
        runtime::gpu::CudaKernelBuilder::get_reduce_to_scalar_op(
            writer, kernel_name, args, dtypes, op, block_size_x);
        compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
    }

    std::unique_ptr<gpu::primitive> reduce(
        new gpu::primitive{[=](void** inputs, void** outputs) mutable {
            void** args_list = args.resolve_placeholder(0, &inputs[0])
                                   .resolve_placeholder(1, &outputs[0])
                                   .get_argument_list();

            // One block of block_size_x threads; dynamic shared memory sized per thread.
            CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
                                          1,
                                          1,
                                          1,
                                          block_size_x,
                                          1,
                                          1,
                                          shared_data_bytes,
                                          nullptr,
                                          args_list,
                                          0));
            debug_sync();
        }});

    primitive_index = this->m_primitive_emitter->insert(std::move(reduce));
    m_primitive_emitter->cache(hash, primitive_index);
    return primitive_index;
}
// Builds (or returns the cached index of) the pre-pass primitive used for large
// reductions: every launched thread accumulates a strided subset of the input and
// writes its partial result to `out[tid]`.
//
// dtypes       - dtypes[0] is the input element type, dtypes[1] the output element type.
// input_shape  - shape of the tensor to reduce; only its element count is used.
// output_shape - shape of the partial-result buffer; its element count divided by
//                block_size_x gives the grid size (integer division).
// block_size_x - threads per block; must be non-zero.
// op           - name of the binary reduction function called by the generated kernel.
// kernel       - optional math kernel wrapped by `op`; when null no device helper is emitted.
//
// Returns the index of the primitive inside the primitive emitter.
size_t runtime::gpu::CUDAEmitter::build_reduce_to_scalar_acc(const std::vector<std::string>& dtypes,
                                                             NVShape input_shape,
                                                             NVShape output_shape,
                                                             uint32_t block_size_x,
                                                             const char* op,
                                                             const char* kernel)
{
    // The reduction op is part of the name: kernels generated for different ops have
    // different bodies, so they must not share a compiled-kernel cache entry.
    std::string kernel_name = "reduce_acc_" + std::string(op) + "_" + join(dtypes, "_");
    std::replace(kernel_name.begin(), kernel_name.end(), ' ', '_');

    // The primitive captures the launch geometry, so the output size and block size
    // belong in the cache key alongside the input shape.
    std::stringstream ss;
    ss << kernel_name << "_s_" << join(input_shape, "_") << "_o_" << join(output_shape, "_")
       << "_b_" << block_size_x;
    auto hash = ss.str();

    // check if the requested kernel is already an inserted primitive
    size_t primitive_index = m_primitive_emitter->lookup(hash);
    if (primitive_index != std::numeric_limits<size_t>::max())
    {
        return primitive_index;
    }

    uint32_t nthreads = static_cast<uint32_t>(shape_size(input_shape));
    auto args = m_primitive_emitter->add_kernel_args();
    args.add_placeholder(dtypes[0], "in")
        .add_placeholder(dtypes[1], "out")
        .add("nthreads", nthreads);

    uint32_t aligned_grid_size_x = static_cast<uint32_t>(shape_size(output_shape)) / block_size_x;

    // if the kernel has not been compiled, build it
    auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name);
    if (compiled_kernel == nullptr)
    {
        codegen::CodeWriter writer;
        CudaKernelBuilder::add_pod_typedefs(writer);
        writer << include_helpers();
        if (kernel)
        {
            CudaKernelBuilder::get_device_helper(
                writer, op, kernel, {{dtypes[0], dtypes[0], dtypes[1]}});
        }
        runtime::gpu::CudaKernelBuilder::get_reduce_to_scalar_acc_op(
            writer, kernel_name, args, dtypes, op);
        compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
    }

    std::unique_ptr<gpu::primitive> reduce_acc(
        new gpu::primitive{[=](void** inputs, void** outputs) mutable {
            void** args_list = args.resolve_placeholder(0, &inputs[0])
                                   .resolve_placeholder(1, &outputs[0])
                                   .get_argument_list();

            CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
                                          aligned_grid_size_x,
                                          1,
                                          1,
                                          block_size_x,
                                          1,
                                          1,
                                          0,
                                          nullptr,
                                          args_list,
                                          0));
            // Same post-launch hook as the other reduce primitives.
            debug_sync();
        }});

    primitive_index = this->m_primitive_emitter->insert(std::move(reduce_acc));
    m_primitive_emitter->cache(hash, primitive_index);
    return primitive_index;
}
// Entry point for reductions. Dispatches on the rank of the result:
//   * non-scalar result          -> build_reduce_to_nd
//   * scalar result, large input -> build_reduce_to_scalar_acc into a workspace buffer,
//                                   followed by build_reduce_to_scalar on that buffer
//   * scalar result, small input -> build_reduce_to_scalar directly
//
// dtypes      - dtypes[0] is the input element type, dtypes[1] the output element type.
// data_bytes  - size in bytes of one element (sizes shared memory and the workspace).
// input_shape - shape of the tensor to reduce.
// reduce_axis - axes being reduced.
// op          - name of the binary reduction function called by the generated kernels.
// kernel      - optional math kernel wrapped by `op`.
//
// Returns the index of the wrapping primitive inside the primitive emitter.
size_t runtime::gpu::CUDAEmitter::build_reduce(const std::vector<std::string>& dtypes,
                                               const size_t data_bytes,
                                               NVShape input_shape,
                                               NVShape reduce_axis,
                                               const char* op,
                                               const char* kernel)
{
    const size_t rank = input_shape.size();
    const size_t reduce_rank = reduce_axis.size();
    const size_t out_rank = rank - reduce_rank;

    // The reduction op is part of the cache key so that different ops applied to the
    // same shape and dtypes never resolve to the same primitive.
    std::string kernel_name = "reduce_" + std::string(op) + "_" + join(dtypes, "_");
    if (out_rank != 0)
    {
        kernel_name += "_ri_" + std::to_string(rank) + "_rr_" + std::to_string(reduce_rank);
    }
    std::replace(kernel_name.begin(), kernel_name.end(), ' ', '_');

    std::stringstream ss;
    ss << kernel_name << "_s_" << join(input_shape, "_") << "_axis_" << join(reduce_axis, "_");
    auto hash = ss.str();

    // check if the requested kernel is already an inserted primitive
    size_t primitive_index = m_primitive_emitter->lookup(hash);
    if (primitive_index != std::numeric_limits<size_t>::max())
    {
        return primitive_index;
    }

    if (out_rank != 0)
    {
        // Result still has dimensions: delegate to the n-d reduction.
        size_t reduce_idx = build_reduce_to_nd(dtypes, input_shape, reduce_axis, op, kernel);

        std::unique_ptr<gpu::primitive> reduce(
            new gpu::primitive{[=](void** inputs, void** outputs) mutable {
                gpu::invoke_primitive(m_ctx, reduce_idx, inputs, outputs);
            }});
        primitive_index = this->m_primitive_emitter->insert(std::move(reduce));
    }
    else
    {
        // The device is only queried on the scalar path, which is the only user of it.
        int num_SMs = 0;
        CUDA_RT_SAFE_CALL(cudaDeviceGetAttribute(&num_SMs, cudaDevAttrMultiProcessorCount, 0));
        const uint32_t block_size_x_acc = 256;
        const uint32_t nthreads_acc = static_cast<uint32_t>(num_SMs) * block_size_x_acc;

        const uint32_t nthreads = static_cast<uint32_t>(shape_size(input_shape));
        // If the input is large, first accumulate into nthreads_acc partial results and
        // then reduce those to the scalar; otherwise reduce to the scalar directly.
        const uint32_t unroll_size = 8;
        if (nthreads > nthreads_acc * (unroll_size + 1))
        {
            NVShape acc_output_shape{nthreads_acc};
            size_t reduce_scalar_acc_idx = build_reduce_to_scalar_acc(
                dtypes, input_shape, acc_output_shape, block_size_x_acc, op, kernel);
            size_t reduce_scalar_idx =
                build_reduce_to_scalar(dtypes, data_bytes, acc_output_shape, op, kernel);

            // get an allocator for transient per kernel gpu memory
            GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
            size_t idx_workspace = allocator.reserve_workspace(nthreads_acc * data_bytes);

            std::unique_ptr<gpu::primitive> reduce_scalar_acc(
                new gpu::primitive{[=](void** inputs, void** outputs) mutable {
                    // The workspace holds the partial results between the two passes.
                    void* buffer = runtime::gpu::invoke_memory_primitive(m_ctx, idx_workspace);
                    gpu::invoke_primitive(m_ctx, reduce_scalar_acc_idx, inputs, &buffer);
                    gpu::invoke_primitive(m_ctx, reduce_scalar_idx, &buffer, outputs);
                }});
            primitive_index = this->m_primitive_emitter->insert(std::move(reduce_scalar_acc));
        }
        else
        {
            size_t reduce_scalar_idx =
                build_reduce_to_scalar(dtypes, data_bytes, input_shape, op, kernel);

            std::unique_ptr<gpu::primitive> reduce_scalar(
                new gpu::primitive{[=](void** inputs, void** outputs) mutable {
                    gpu::invoke_primitive(m_ctx, reduce_scalar_idx, inputs, outputs);
                }});
            primitive_index = this->m_primitive_emitter->insert(std::move(reduce_scalar));
        }
    }

    m_primitive_emitter->cache(hash, primitive_index);
    return primitive_index;
}
......
...@@ -117,11 +117,13 @@ namespace ngraph ...@@ -117,11 +117,13 @@ namespace ngraph
template <typename T> template <typename T>
size_t build_reduce(const std::vector<std::string>& dtypes, size_t build_reduce(const std::vector<std::string>& dtypes,
NVShape tensor_shape, const size_t data_bytes,
NVShape input_shape,
NVShape reduce_axis) NVShape reduce_axis)
{ {
return build_reduce(dtypes, return build_reduce(dtypes,
tensor_shape, data_bytes,
input_shape,
reduce_axis, reduce_axis,
CudaOpMap<T>::op, CudaOpMap<T>::op,
CudaOpMap<T>::math_kernel); CudaOpMap<T>::math_kernel);
...@@ -194,10 +196,31 @@ namespace ngraph ...@@ -194,10 +196,31 @@ namespace ngraph
const char* reduce_op, const char* reduce_op,
bool save_elementwise); bool save_elementwise);
size_t build_reduce(const std::vector<std::string>& dtypes, size_t build_reduce(const std::vector<std::string>& dtypes,
NVShape tensor_shape, const size_t data_bytes,
NVShape input_shape,
NVShape reduce_axis,
const char* op,
const char* kernel);
size_t build_reduce_to_nd(const std::vector<std::string>& dtypes,
NVShape input_shape,
NVShape reduce_axis, NVShape reduce_axis,
const char* op, const char* op,
const char* kernel); const char* kernel);
//Builds the primitive that reduces the whole input tensor to one scalar with a single block.
//data_bytes is the size in bytes of one element; it sizes the dynamic shared memory.
size_t build_reduce_to_scalar(const std::vector<std::string>& dtypes,
const size_t data_bytes,
NVShape input_shape,
const char* op,
const char* kernel);
//Pre-processing step for reduce-to-scalar, used when the input is larger than a threshold.
//The threshold can be tuned based on the hardware.
//The CUDA kernel accumulates the reduction into a number of bins that depends on the hardware.
size_t build_reduce_to_scalar_acc(const std::vector<std::string>& dtypes,
NVShape input_shape,
NVShape output_shape,
uint32_t block_size_x,
const char* op,
const char* kernel);
GPUPrimitiveEmitter* m_primitive_emitter; GPUPrimitiveEmitter* m_primitive_emitter;
GPURuntimeContext* m_ctx; GPURuntimeContext* m_ctx;
}; };
......
...@@ -182,7 +182,8 @@ void runtime::gpu::CudaKernelBuilder::get_ew_collective_op( ...@@ -182,7 +182,8 @@ void runtime::gpu::CudaKernelBuilder::get_ew_collective_op(
} }
//each thread calculate the whole reduction of one output //each thread calculate the whole reduction of one output
void runtime::gpu::CudaKernelBuilder::get_reduce_op(codegen::CodeWriter& writer, void runtime::gpu::CudaKernelBuilder::get_reduce_to_nd_op(
codegen::CodeWriter& writer,
const std::string& name, const std::string& name,
runtime::gpu::GPUKernelArgs& args, runtime::gpu::GPUKernelArgs& args,
const std::vector<std::string>& data_types, const std::vector<std::string>& data_types,
...@@ -227,18 +228,20 @@ void runtime::gpu::CudaKernelBuilder::get_reduce_op(codegen::CodeWriter& writer, ...@@ -227,18 +228,20 @@ void runtime::gpu::CudaKernelBuilder::get_reduce_op(codegen::CodeWriter& writer,
writer << "int idx" << last_r_idx << " = 0;\n"; writer << "int idx" << last_r_idx << " = 0;\n";
writer << "uint32_t step = reduce_strides" << last_r_idx << ";\n"; writer << "uint32_t step = reduce_strides" << last_r_idx << ";\n";
// unroll last reduction axis // unroll last reduction axis
writer << "for(; idx" << last_r_idx << " < (reduce_shape" << last_r_idx uint32_t unroll_num = 8;
<< " >> 3); idx" << last_r_idx << "++)\n"; uint32_t unroll_shift = 3;
writer << "for(; idx" << last_r_idx << " < (reduce_shape" << last_r_idx << " >> "
<< unroll_shift << "); idx" << last_r_idx << "++)\n";
writer.block_begin(); writer.block_begin();
{ {
for (int k = 0; k < 8; k++) for (int k = 0; k < unroll_num; k++)
{ {
writer << "r = " << reduce_op << "(r , in[reduce_idx]);\n"; writer << "r = " << reduce_op << "(r , in[reduce_idx]);\n";
writer << "reduce_idx += step;\n"; writer << "reduce_idx += step;\n";
} }
} }
writer.block_end(); writer.block_end();
writer << "idx" << last_r_idx << " <<= 3;\n"; writer << "idx" << last_r_idx << " <<= " << unroll_shift << ";\n";
writer << "for(; idx" << last_r_idx << " < reduce_shape" << last_r_idx << "; idx" writer << "for(; idx" << last_r_idx << " < reduce_shape" << last_r_idx << "; idx"
<< last_r_idx << "++)\n"; << last_r_idx << "++)\n";
writer.block_begin(); writer.block_begin();
...@@ -260,6 +263,144 @@ void runtime::gpu::CudaKernelBuilder::get_reduce_op(codegen::CodeWriter& writer, ...@@ -260,6 +263,144 @@ void runtime::gpu::CudaKernelBuilder::get_reduce_op(codegen::CodeWriter& writer,
return; return;
} }
// Emits a CUDA kernel that reduces `nthreads` input elements to out[0] within one block.
//
// writer       - receives the generated source.
// name         - kernel name; the emitted symbol is "cuda_" + name.
// args         - supplies the kernel's parameter signature.
// data_types   - data_types[1] is used for the accumulator and the shared-memory array.
// reduce_op    - name of the binary device function combining two partial results.
// block_size_x - threads per block the kernel will be launched with; the code is
//                specialised on it (NOTE(review): the caller passes a power of two and
//                the warp-count arithmetic below appears to rely on that - confirm).
//
// Generated phases:
//   1. each thread accumulates a block-strided subset of the input (unrolled by 8);
//   2. partial results are combined inside each warp with __shfl_down_sync;
//   3. if there is more than one warp, lane 0 of each warp publishes its result to
//      shared memory and the first warp combines those with a second shuffle pass.
void runtime::gpu::CudaKernelBuilder::get_reduce_to_scalar_op(
    codegen::CodeWriter& writer,
    const std::string& name,
    runtime::gpu::GPUKernelArgs& args,
    const std::vector<std::string>& data_types,
    const std::string& reduce_op,
    uint32_t block_size_x)
{
    // Emits the shuffle-down steps needed to combine `active_lanes` partial results.
    auto emit_shuffle_reduce = [&](uint32_t active_lanes) {
        for (uint32_t offset = 16; offset >= 1; offset >>= 1)
        {
            if (active_lanes > offset)
            {
                writer << "r = " << reduce_op << "(r, __shfl_down_sync(0xffffffff, r, " << offset
                       << ", 32));\n";
            }
        }
    };

    writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature();
    writer.block_begin();
    {
        writer << "extern __shared__ " << data_types[1] << " sdata[];\n";
        writer << "uint32_t tid = threadIdx.x; \n";
        writer << "uint32_t step = blockDim.x; \n";
        // Shared memory is deliberately not zero-initialised here: every slot that is
        // read below is written by lane 0 of its warp before the __syncthreads(), and an
        // unsynchronised "sdata[tid] = 0" could overwrite a result already published by
        // another warp.
        writer << "uint32_t in_idx = tid;\n";
        writer << data_types[1] << " r = 0;\n";
        writer << "if(in_idx < nthreads)\n";
        writer.block_begin();
        writer << "r = in[in_idx];\n";
        writer << "in_idx += step;\n";
        writer.block_end();

        //accumulate reduction to blockDim.x threads
        const uint32_t unroll_num = 8;
        writer << "while(in_idx + (step * " << unroll_num - 1 << ") < nthreads)\n";
        writer.block_begin();
        {
            for (uint32_t i = 0; i < unroll_num; i++)
            {
                writer << "r = " << reduce_op << "(r , in[in_idx]);\n";
                writer << "in_idx += step;\n";
            }
        }
        writer.block_end();
        writer << "while(in_idx < nthreads)\n";
        writer.block_begin();
        {
            writer << "r = " << reduce_op << "(r , in[in_idx]);\n";
            writer << "in_idx += step;\n";
        }
        writer.block_end();

        //accumulate 32 threads for each warp
        emit_shuffle_reduce(block_size_x);

        if (block_size_x > 32)
        {
            writer << "uint32_t lane_idx = tid & 0x1f; \n";
            writer << "uint32_t warp_idx = tid >> 5; \n";
            writer << "if(lane_idx == 0)\n";
            writer.block_begin();
            {
                writer << "sdata[warp_idx] = r;\n";
            }
            writer.block_end();
            writer << "__syncthreads();\n";

            // Number of warps in the block, i.e. number of partial results in sdata.
            const uint32_t warp_count = block_size_x >> 5;
            writer << "if(tid < " << warp_count << ")\n";
            writer.block_begin();
            {
                writer << "r = sdata[tid];\n";
            }
            writer.block_end();

            //accumulate the per-warp results
            emit_shuffle_reduce(warp_count);
        }

        writer << "if(tid == 0)\n";
        writer.block_begin();
        {
            writer << "out[0] = r;\n";
        }
        writer.block_end();
    }
    writer.block_end();

    return;
}
// Emits the CUDA kernel for the accumulation pre-pass: each thread of the grid walks the
// input with a stride of gridDim.x * blockDim.x, folds the visited elements with
// `reduce_op`, and stores its partial result in out[tid].
//
// writer     - receives the generated source.
// name       - kernel name; the emitted symbol is "cuda_" + name.
// args       - supplies the kernel's parameter signature.
// data_types - data_types[1] is used as the accumulator type.
// reduce_op  - name of the binary device function combining two partial results.
void runtime::gpu::CudaKernelBuilder::get_reduce_to_scalar_acc_op(
    codegen::CodeWriter& writer,
    const std::string& name,
    runtime::gpu::GPUKernelArgs& args,
    const std::vector<std::string>& data_types,
    const std::string& reduce_op)
{
    // Emits one "fold the current element, advance by one stride" step.
    auto emit_accumulate_step = [&]() {
        writer << "r = " << reduce_op << "(r , in[in_idx]);\n";
        writer << "in_idx += step;\n";
    };

    const std::string& accumulator_type = data_types[1];
    const uint32_t unroll_factor = 8;
    const uint32_t lookahead_strides = unroll_factor - 1;

    writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature();
    writer.block_begin();
    {
        // Per-thread setup.
        writer << "uint32_t tid = blockDim.x*blockIdx.x + threadIdx.x;\n";
        writer << "uint32_t step = gridDim.x * blockDim.x; \n";
        writer << "uint32_t in_idx = tid;\n";
        writer << accumulator_type << " r = 0;\n";

        // Seed the accumulator with the thread's first element, if it has one.
        writer << "if(in_idx < nthreads)\n";
        writer.block_begin();
        writer << "r = in[in_idx];\n";
        writer << "in_idx += step;\n";
        writer.block_end();

        // Unrolled main loop: runs while a full group of strides is still in range.
        writer << "while(in_idx + (step * " << lookahead_strides << ") < nthreads)\n";
        writer.block_begin();
        {
            uint32_t emitted = 0;
            while (emitted < unroll_factor)
            {
                emit_accumulate_step();
                ++emitted;
            }
        }
        writer.block_end();

        // Remainder loop for the elements the unrolled loop did not cover.
        writer << "while(in_idx < nthreads)\n";
        writer.block_begin();
        {
            emit_accumulate_step();
        }
        writer.block_end();

        writer << "out[tid] = r;\n";
    }
    writer.block_end();

    return;
}
void runtime::gpu::CudaKernelBuilder::get_broadcast_op(codegen::CodeWriter& writer, void runtime::gpu::CudaKernelBuilder::get_broadcast_op(codegen::CodeWriter& writer,
const std::string& name, const std::string& name,
runtime::gpu::GPUKernelArgs& args, runtime::gpu::GPUKernelArgs& args,
......
...@@ -59,7 +59,7 @@ namespace ngraph ...@@ -59,7 +59,7 @@ namespace ngraph
const std::array<std::string, 2>& data_types, const std::array<std::string, 2>& data_types,
size_t rank); size_t rank);
static void get_reduce_op(codegen::CodeWriter& writer, static void get_reduce_to_nd_op(codegen::CodeWriter& writer,
const std::string& name, const std::string& name,
runtime::gpu::GPUKernelArgs& args, runtime::gpu::GPUKernelArgs& args,
const std::vector<std::string>& data_types, const std::vector<std::string>& data_types,
...@@ -67,6 +67,23 @@ namespace ngraph ...@@ -67,6 +67,23 @@ namespace ngraph
size_t out_rank, size_t out_rank,
size_t reduce_rank); size_t reduce_rank);
//Emits a kernel that reduces the input to a scalar using a single block.
//block_size_x is the number of threads in that block (the emitter caps it at 512).
static void get_reduce_to_scalar_op(codegen::CodeWriter& writer,
const std::string& name,
runtime::gpu::GPUKernelArgs& args,
const std::vector<std::string>& data_types,
const std::string& reduce_op,
uint32_t block_size_x);
//Pre-processing step for reduce-to-scalar, used when the input is larger than a threshold.
//The threshold can be tuned based on the hardware.
//The CUDA kernel accumulates the reduction into a number of bins that depends on the hardware.
static void get_reduce_to_scalar_acc_op(codegen::CodeWriter& writer,
const std::string& name,
runtime::gpu::GPUKernelArgs& args,
const std::vector<std::string>& data_types,
const std::string& reduce_op);
static void get_slice_op(codegen::CodeWriter& writer, static void get_slice_op(codegen::CodeWriter& writer,
const std::string& name, const std::string& name,
const std::array<std::string, 2>& data_types, const std::array<std::string, 2>& data_types,
......
...@@ -851,21 +851,6 @@ namespace ngraph ...@@ -851,21 +851,6 @@ namespace ngraph
{ {
kernel::emit_memcpyDtD(writer, out[0], args[0]); kernel::emit_memcpyDtD(writer, out[0], args[0]);
} }
else if (out[0].get_shape().size() == 0)
{
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
auto sum_index =
cudnn_emitter->build_reduce_forward(CUDNN_REDUCE_TENSOR_ADD,
out[0].get_type(),
args[0].get_shape(),
sum->get_reduction_axes());
writer << "gpu::invoke_primitive(ctx, " << sum_index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
writer << ");\n";
}
else else
{ {
auto axes_set = sum->get_reduction_axes(); auto axes_set = sum->get_reduction_axes();
...@@ -880,7 +865,10 @@ namespace ngraph ...@@ -880,7 +865,10 @@ namespace ngraph
auto& cuda_emitter = auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter(); external_function->get_primitive_emitter()->get_cuda_emitter();
auto sum_index = cuda_emitter->build_reduce<ngraph::op::Add>( auto sum_index = cuda_emitter->build_reduce<ngraph::op::Add>(
dtypes, args[0].get_shape(), axes_vec); dtypes,
out[0].get_element_type().size(),
args[0].get_shape(),
axes_vec);
writer << "gpu::invoke_primitive(ctx, " << sum_index << ", "; writer << "gpu::invoke_primitive(ctx, " << sum_index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), "; writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
......
...@@ -18,8 +18,8 @@ ...@@ -18,8 +18,8 @@
#include <cinttypes> #include <cinttypes>
#include <cmath> #include <cmath>
#include <cstdlib> #include <cstdlib>
#include <random>
#include <string> #include <string>
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "ngraph/autodiff/adjoints.hpp" #include "ngraph/autodiff/adjoints.hpp"
...@@ -37,6 +37,8 @@ ...@@ -37,6 +37,8 @@
#include "util/test_control.hpp" #include "util/test_control.hpp"
#include "util/test_tools.hpp" #include "util/test_tools.hpp"
static std::mt19937_64 random_generator;
using namespace std; using namespace std;
using namespace ngraph; using namespace ngraph;
...@@ -3502,6 +3504,33 @@ NGRAPH_TEST(${BACKEND_NAME}, sum_to_scalar) ...@@ -3502,6 +3504,33 @@ NGRAPH_TEST(${BACKEND_NAME}, sum_to_scalar)
EXPECT_EQ((vector<float>{1, 2, 3, 4}), read_vector<float>(a)); EXPECT_EQ((vector<float>{1, 2, 3, 4}), read_vector<float>(a));
} }
// Sums a 1-D tensor of one million pseudo-random values (seeded, each in [0, 254]) down
// to a scalar and compares against a double-precision reference accumulated on the host.
NGRAPH_TEST(${BACKEND_NAME}, sum_large_1d_to_scalar)
{
    const size_t element_count = 1000000;
    Shape shape{element_count};

    auto A = make_shared<op::Parameter>(element::f32, shape);
    auto sum_over_axis0 = make_shared<op::Sum>(A, AxisSet{0});
    auto f = make_shared<Function>(sum_over_axis0, op::ParameterVector{A});

    auto backend = runtime::Backend::create("${BACKEND_NAME}");

    // Deterministic input data plus its reference sum.
    random_generator.seed(2);
    vector<float> v_a(element_count, 0);
    double expected_sum = 0;
    for (float& value : v_a)
    {
        value = static_cast<float>(random_generator() % 255);
        expected_sum += static_cast<double>(value);
    }

    // Create some tensors for input/output
    auto a = backend->create_tensor(element::f32, shape);
    copy_data(a, v_a);
    auto result = backend->create_tensor(element::f32, Shape{});

    backend->call_with_validate(f, {result}, {a});

    const vector<float> expected{static_cast<float>(expected_sum)};
    EXPECT_TRUE(test::all_close_f(expected, read_vector<float>(result)));
}
NGRAPH_TEST(${BACKEND_NAME}, sum_matrix_columns) NGRAPH_TEST(${BACKEND_NAME}, sum_matrix_columns)
{ {
Shape shape_a{3, 2}; Shape shape_a{3, 2};
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment