Commit 6679c233 authored by Fenglei, committed by Scott Cyphers

nvgpu reduction optimization (#1455)

* add cuda reduce

* clang format

* fix bugs

* fix bug

* add 1d reduce

* clang format

* fix bugs

* unroll loop

* remove debug info

* revert tests

* unroll 1D reduce op

* add comments

* using cudnn for nd to scalar reduction

* remove cuda 1d reduction since cudnn version is faster

* remove 1D kernel

* fix variable name

* resolve Chris's comments

* non_reduce_in_strides to non_reduce_strides
parent 2655f5e0
......@@ -1323,6 +1323,111 @@ size_t runtime::gpu::CUDAEmitter::build_softmax_divide(const std::vector<std::st
return primitive_index;
}
size_t runtime::gpu::CUDAEmitter::build_reduce(const std::vector<std::string>& dtypes,
NVShape input_shape,
NVShape reduce_axis,
const char* op,
const char* kernel)
{
// assumes NC{d1,...,dn} format
std::string kernel_name = "reduce_" + join(dtypes, "_") + "_ri_" +
std::to_string(input_shape.size()) + "_rr_" +
std::to_string(reduce_axis.size());
std::replace(kernel_name.begin(), kernel_name.end(), ' ', '_');
std::stringstream ss;
ss << kernel_name << "_s_" << join(input_shape, "_") << "_axis_" << join(reduce_axis, "_");
auto hash = ss.str();
// check if the requested kernel is already an inserted primitive
size_t primitive_index = m_primitive_emitter->lookup(hash);
if (primitive_index != std::numeric_limits<size_t>::max())
{
return primitive_index;
}
size_t rank = input_shape.size();
size_t reduce_rank = reduce_axis.size();
size_t out_rank = rank - reduce_rank;
NVShape reduce_flag(rank, 0);
for (auto a : reduce_axis)
{
reduce_flag[a] = 1;
}
NVShape output_shape;
NVShape non_reduce_strides;
NVShape reduce_shape;
NVShape reduce_strides;
NVShape input_strides = row_major_strides(input_shape);
for (int i = 0; i < rank; i++)
{
if (reduce_flag[i] != 0)
{
reduce_shape.push_back(input_shape[i]);
reduce_strides.push_back(input_strides[i]);
}
else
{
non_reduce_strides.push_back(input_strides[i]);
output_shape.push_back(input_shape[i]);
}
}
NVShape output_strides = row_major_strides(output_shape);
uint32_t nthreads = static_cast<uint32_t>(shape_size(output_shape));
// TODO: the block size is currently fixed at 64; a tuning method will be added later
uint32_t block_size_x = 64;
uint32_t aligned_grid_size_x = align_to_block_size(nthreads, block_size_x);
auto args = m_primitive_emitter->add_kernel_args();
args.add_placeholder(dtypes[0], "in")
.add_placeholder(dtypes[1], "out")
.add("out_strides", output_strides)
.add("non_reduce_strides", non_reduce_strides)
.add("reduce_shape", reduce_shape)
.add("reduce_strides", reduce_strides)
.add("nthreads", nthreads);
// if the kernel has not been compiled, build it
auto compiled_kernel = m_ctx->compiled_kernel_pool->get(kernel_name);
if (compiled_kernel == nullptr)
{
codegen::CodeWriter writer;
CudaKernelBuilder::add_pod_typedefs(writer);
writer << include_helpers();
if (kernel)
{
CudaKernelBuilder::get_device_helper(
writer, op, kernel, {{dtypes[0], dtypes[0], dtypes[1]}});
}
runtime::gpu::CudaKernelBuilder::get_reduce_op(
writer, kernel_name, args, dtypes, op, out_rank, reduce_rank);
compiled_kernel = m_ctx->compiled_kernel_pool->set(kernel_name, writer.get_code());
}
std::unique_ptr<gpu::primitive> reduce(
new gpu::primitive{[=](void** inputs, void** outputs) mutable {
void** args_list = args.resolve_placeholder(0, &inputs[0])
.resolve_placeholder(1, &outputs[0])
.get_argument_list();
CUDA_SAFE_CALL(cuLaunchKernel(*compiled_kernel.get(),
aligned_grid_size_x,
1,
1,
block_size_x,
1,
1,
0,
NULL,
args_list,
0));
debug_sync();
}});
primitive_index = this->m_primitive_emitter->insert(std::move(reduce));
m_primitive_emitter->cache(hash, primitive_index);
return primitive_index;
}
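For illustration, here is a standalone sketch (not part of the diff) that mirrors the shape/stride decomposition build_reduce performs, for input_shape {2, 3, 4} reduced over axis 1; the local row_major_strides is a stand-in for ngraph's helper of the same name.
// Standalone sketch mirroring build_reduce's shape/stride decomposition; not part of this commit.
#include <cstddef>
#include <iostream>
#include <vector>
static std::vector<size_t> row_major_strides(const std::vector<size_t>& shape)
{
    std::vector<size_t> strides(shape.size(), 1);
    for (size_t i = shape.size(); i-- > 1;)
    {
        strides[i - 1] = strides[i] * shape[i];
    }
    return strides;
}
int main()
{
    std::vector<size_t> input_shape{2, 3, 4};
    std::vector<size_t> reduce_axis{1};
    std::vector<size_t> reduce_flag(input_shape.size(), 0);
    for (auto a : reduce_axis)
    {
        reduce_flag[a] = 1;
    }
    auto input_strides = row_major_strides(input_shape); // {12, 4, 1}
    std::vector<size_t> output_shape, non_reduce_strides, reduce_shape, reduce_strides;
    for (size_t i = 0; i < input_shape.size(); i++)
    {
        if (reduce_flag[i] != 0)
        {
            reduce_shape.push_back(input_shape[i]);     // {3}
            reduce_strides.push_back(input_strides[i]); // {4}
        }
        else
        {
            non_reduce_strides.push_back(input_strides[i]); // {12, 1}
            output_shape.push_back(input_shape[i]);         // {2, 4}
        }
    }
    // One kernel thread per output element: 2 * 4 = 8 threads, rounded up to
    // one block of 64 by align_to_block_size in the emitter above.
    std::cout << "nthreads = " << output_shape[0] * output_shape[1] << std::endl;
    return 0;
}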
size_t runtime::gpu::CUDAEmitter::build_primitive(const op::Softmax* node)
{
auto& args = node->get_inputs();
......
......@@ -115,6 +115,18 @@ namespace ngraph
dtypes, tensor_shape, CudaOpMap<T>::op, CudaOpMap<T>::math_kernel);
}
template <typename T>
size_t build_reduce(const std::vector<std::string>& dtypes,
NVShape tensor_shape,
NVShape reduce_axis)
{
return build_reduce(dtypes,
tensor_shape,
reduce_axis,
CudaOpMap<T>::op,
CudaOpMap<T>::math_kernel);
}
template <typename ELEMENTWISE_OP_TYPE, typename REDUCE_OP_TYPE = ngraph::op::Nop>
size_t build_elementwise_collective(const std::vector<std::string>& dtypes,
NVShape tensor_shape,
......@@ -181,7 +193,11 @@ namespace ngraph
const char* kernel,
const char* reduce_op,
bool save_elementwise);
size_t build_reduce(const std::vector<std::string>& dtypes,
NVShape tensor_shape,
NVShape reduce_axis,
const char* op,
const char* kernel);
GPUPrimitiveEmitter* m_primitive_emitter;
GPURuntimeContext* m_ctx;
};
......
......@@ -181,6 +181,85 @@ void runtime::gpu::CudaKernelBuilder::get_ew_collective_op(
return;
}
// each thread computes the whole reduction for one output element
void runtime::gpu::CudaKernelBuilder::get_reduce_op(codegen::CodeWriter& writer,
const std::string& name,
runtime::gpu::GPUKernelArgs& args,
const std::vector<std::string>& data_types,
const std::string& reduce_op,
size_t out_rank,
size_t reduce_rank)
{
writer << "extern \"C\" __global__ void cuda_" << name << args.get_input_signature();
writer.block_begin();
{
writer << "uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x; \n";
writer << "if (tid < nthreads)\n";
writer.block_begin();
{
if (out_rank > 0)
{
writer << "uint32_t dim_idx_generator = tid;\n";
}
writer << "uint32_t in_idx = 0;\n";
writer << data_types[1] << " r = 0;\n";
// loop through all reduction axes
for (int64_t i = 0; i < static_cast<int64_t>(out_rank); i++)
{
writer << "in_idx += (dim_idx_generator / out_strides" << i
<< ") * non_reduce_strides" << i << ";\n";
writer << "dim_idx_generator %= out_strides" << i << ";\n";
}
int64_t last_r_idx = static_cast<int64_t>(reduce_rank) - 1;
for (int64_t j = 0; j < last_r_idx; j++)
{
writer << "for(int idx" << j << " = 0; idx" << j << "< reduce_shape" << j << "; idx"
<< j << "++)\n";
writer.block_begin();
}
{
writer << "uint32_t reduce_idx = in_idx;\n";
for (int64_t j = 0; j < last_r_idx; j++)
{
writer << "reduce_idx += idx" << j << " * reduce_strides" << j << ";\n";
}
writer << "int idx" << last_r_idx << " = 0;\n";
writer << "uint32_t step = reduce_strides" << last_r_idx << ";\n";
// unroll the last reduction axis by a factor of 8
writer << "for(; idx" << last_r_idx << " < (reduce_shape" << last_r_idx
<< " >> 3); idx" << last_r_idx << "++)\n";
writer.block_begin();
{
for (int k = 0; k < 8; k++)
{
writer << "r = " << reduce_op << "(r , in[reduce_idx]);\n";
writer << "reduce_idx += step;\n";
}
}
writer.block_end();
writer << "idx" << last_r_idx << " <<= 3;\n";
writer << "for(; idx" << last_r_idx << " < reduce_shape" << last_r_idx << "; idx"
<< last_r_idx << "++)\n";
writer.block_begin();
{
writer << "r = " << reduce_op << "(r , in[reduce_idx]);\n";
writer << "reduce_idx += step;\n";
}
writer.block_end();
}
for (int64_t j = 0; j < last_r_idx; j++)
{
writer.block_end();
}
writer << "out[tid] = r;\n";
}
writer.block_end();
}
writer.block_end();
return;
}
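To make the generated source concrete, this is roughly the kernel get_reduce_op emits for out_rank == 1, reduce_rank == 1, float data and an add reduction. The kernel name, the flattened scalar parameters (out_strides0, ...), and the add device helper are assumptions about how GPUKernelArgs and CudaOpMap expand, not text taken verbatim from this commit; the emitter also writes the eight unrolled statements out explicitly rather than in an inner loop.
// Illustrative reconstruction of the emitted kernel; not verbatim emitter output.
// The emitter's add_pod_typedefs plays this role; repeated here so the sketch is self-contained.
typedef unsigned int uint32_t;
// Device helper assumed to be generated for ngraph::op::Add via get_device_helper.
__device__ __forceinline__ float add(float x0, float x1) { return x0 + x1; }
extern "C" __global__ void cuda_reduce_float_float_ri_2_rr_1(
    float* in, float* out,
    uint32_t out_strides0, uint32_t non_reduce_strides0,
    uint32_t reduce_shape0, uint32_t reduce_strides0, uint32_t nthreads)
{
    uint32_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < nthreads)
    {
        // Map this thread's output element back to the start of its input slice.
        uint32_t dim_idx_generator = tid;
        uint32_t in_idx = 0;
        float r = 0;
        in_idx += (dim_idx_generator / out_strides0) * non_reduce_strides0;
        dim_idx_generator %= out_strides0;
        uint32_t reduce_idx = in_idx;
        int idx0 = 0;
        uint32_t step = reduce_strides0;
        // Main loop over the innermost reduction axis, unrolled by a factor of 8.
        for (; idx0 < (reduce_shape0 >> 3); idx0++)
        {
            for (int k = 0; k < 8; k++) // emitted as eight explicit statements
            {
                r = add(r, in[reduce_idx]);
                reduce_idx += step;
            }
        }
        idx0 <<= 3;
        // Remainder loop for the trailing reduce_shape0 % 8 elements.
        for (; idx0 < reduce_shape0; idx0++)
        {
            r = add(r, in[reduce_idx]);
            reduce_idx += step;
        }
        out[tid] = r;
    }
}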
void runtime::gpu::CudaKernelBuilder::get_broadcast_op(codegen::CodeWriter& writer,
const std::string& name,
runtime::gpu::GPUKernelArgs& args,
......
......@@ -59,6 +59,14 @@ namespace ngraph
const std::array<std::string, 2>& data_types,
size_t rank);
static void get_reduce_op(codegen::CodeWriter& writer,
const std::string& name,
runtime::gpu::GPUKernelArgs& args,
const std::vector<std::string>& data_types,
const std::string& reduce_op,
size_t out_rank,
size_t reduce_rank);
static void get_slice_op(codegen::CodeWriter& writer,
const std::string& name,
const std::array<std::string, 2>& data_types,
......
......@@ -851,8 +851,7 @@ namespace ngraph
{
kernel::emit_memcpyDtD(writer, out[0], args[0]);
}
// descriptors for tensors with <= 4 dimensions
else
else if (out[0].get_shape().size() == 0)
{
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
......@@ -867,6 +866,27 @@ namespace ngraph
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
writer << ");\n";
}
else
{
auto axes_set = sum->get_reduction_axes();
ngraph::AxisVector axes_vec;
for (auto a : axes_set)
{
axes_vec.push_back(a);
}
std::vector<std::string> dtypes;
dtypes.push_back(args[0].get_type());
dtypes.push_back(out[0].get_type());
auto& cuda_emitter =
external_function->get_primitive_emitter()->get_cuda_emitter();
auto sum_index = cuda_emitter->build_reduce<ngraph::op::Add>(
dtypes, args[0].get_shape(), axes_vec);
writer << "gpu::invoke_primitive(ctx, " << sum_index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
writer << ");\n";
}
}
}
writer.block_end();
......