Commit 32398641 authored by Fenglei, committed by Robert Kimball

nvgpu cuda reduce (#1988)

* change reduce to use CUDA; add support for And, Or

* fix bug and format

* remove unused code

* style

* change reduce_op to reduce_func to avoid shadowing, thanks Ayan.

* using dynamic_pointer_cast
parent 77899668
@@ -1651,7 +1651,7 @@ size_t runtime::gpu::CUDAEmitter::build_reduce_to_nd(const std::vector<std::stri
     size_t reduce_rank = reduce_axis.size();
     size_t out_rank = rank - reduce_rank;
     // assumes NC{d1,...,dn} format
-    std::string kernel_name = "reduce_nd_" + join(dtypes, "_");
+    std::string kernel_name = "reduce_nd_" + join(dtypes, "_") + "_" + op;
     kernel_name +=
         "_ri_" + std::to_string(input_shape.size()) + "_rr_" + std::to_string(reduce_axis.size());
     std::replace(kernel_name.begin(), kernel_name.end(), ' ', '_');
@@ -1749,7 +1749,7 @@ size_t runtime::gpu::CUDAEmitter::build_reduce_to_scalar(const std::vector<std::
                                                          const char* kernel)
 {
     // assumes NC{d1,...,dn} format
-    std::string kernel_name = "reduce_scalar_" + join(dtypes, "_");
+    std::string kernel_name = "reduce_scalar_" + join(dtypes, "_") + "_" + op;
     std::replace(kernel_name.begin(), kernel_name.end(), ' ', '_');
     std::stringstream ss;
@@ -1825,7 +1825,7 @@ size_t runtime::gpu::CUDAEmitter::build_reduce_to_scalar_acc(const std::vector<s
                                                              const char* kernel)
 {
     // assumes NC{d1,...,dn} format
-    std::string kernel_name = "reduce_acc_" + join(dtypes, "_");
+    std::string kernel_name = "reduce_acc_" + join(dtypes, "_") + "_" + op;
     std::replace(kernel_name.begin(), kernel_name.end(), ' ', '_');
     std::stringstream ss;
@@ -1894,7 +1894,7 @@ size_t runtime::gpu::CUDAEmitter::build_reduce(const std::vector<std::string>& d
     size_t reduce_rank = reduce_axis.size();
     size_t out_rank = rank - reduce_rank;
     // assumes NC{d1,...,dn} format
-    std::string kernel_name = "reduce_" + join(dtypes, "_");
+    std::string kernel_name = "reduce_" + join(dtypes, "_") + "_" + op;
     if (out_rank != 0)
     {
         kernel_name += "_ri_" + std::to_string(input_shape.size()) + "_rr_" +
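The four hunks above make the same one-line change in each build_* entry point: the reduction op is now baked into the cached kernel's name. This matters because the primitive emitter reuses compiled kernels by name, so without the op suffix a max reduction could be handed a previously compiled sum kernel with identical dtypes and ranks. A minimal sketch of the resulting naming scheme for the n-d variant (the helper name and the concrete op string "add" are illustrative assumptions, not code from this commit):

#include <algorithm>
#include <iostream>
#include <string>

// Mirrors the naming logic above: dtypes joined with '_', then the op,
// then input rank (_ri_) and reduction rank (_rr_).
std::string reduce_nd_kernel_name(const std::string& joined_dtypes,
                                  const std::string& op,
                                  size_t input_rank,
                                  size_t reduce_rank)
{
    std::string name = "reduce_nd_" + joined_dtypes + "_" + op;
    name += "_ri_" + std::to_string(input_rank) + "_rr_" + std::to_string(reduce_rank);
    std::replace(name.begin(), name.end(), ' ', '_');
    return name;
}

int main()
{
    // Prints "reduce_nd_float_float_add_ri_4_rr_2": the op suffix keeps this
    // cache key distinct from, say, the max kernel of the same signature.
    std::cout << reduce_nd_kernel_name("float_float", "add", 4, 2) << "\n";
    return 0;
}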
@@ -352,7 +352,6 @@ void runtime::gpu::CudaKernelBuilder::get_reduce_to_nd_op(
         writer << "uint32_t dim_idx_generator = tid;\n";
     }
     writer << "uint32_t in_idx = 0;\n";
-    writer << data_types[1] << " r = 0;\n";
     // loop through all reduction axis
     for (int64_t i = 0; i < static_cast<int64_t>(out_rank); i++)
@@ -361,6 +360,8 @@ void runtime::gpu::CudaKernelBuilder::get_reduce_to_nd_op(
                << ") * non_reduce_strides" << i << ";\n";
         writer << "dim_idx_generator %= out_strides" << i << ";\n";
     }
+    writer << "uint32_t init_in_idx = in_idx;\n";
+    writer << data_types[1] << " r = in[init_in_idx];\n";
     int64_t last_r_idx = static_cast<int64_t>(reduce_rank) - 1;
     for (int64_t j = 0; j < last_r_idx; j++)
     {
@@ -374,13 +375,19 @@ void runtime::gpu::CudaKernelBuilder::get_reduce_to_nd_op(
     {
         writer << "reduce_idx += idx" << j << " * reduce_strides" << j << ";\n";
     }
-    writer << "int idx" << last_r_idx << " = 0;\n";
+    writer << "uint32_t step = reduce_strides" << last_r_idx << ";\n";
+    writer << "if(reduce_idx != init_in_idx)\n";
+    writer.block_begin();
+    {
+        writer << "r = " << reduce_op << "(r , in[reduce_idx]);\n";
+    }
+    writer.block_end();
+    writer << "reduce_idx += step;\n";
+    writer << "int idx" << last_r_idx << " = 1;\n";
     // unroll last reduction axis
     uint32_t unroll_num = 8;
-    uint32_t unroll_shift = 3;
-    writer << "for(; idx" << last_r_idx << " < (reduce_shape" << last_r_idx << " >> "
-           << unroll_shift << "); idx" << last_r_idx << "++)\n";
+    writer << "for(; idx" << last_r_idx << " + " << unroll_num << " - 1 < reduce_shape"
+           << last_r_idx << "; idx" << last_r_idx << " += " << unroll_num << ")\n";
     writer.block_begin();
     {
         for (int k = 0; k < unroll_num; k++)
@@ -390,7 +397,6 @@ void runtime::gpu::CudaKernelBuilder::get_reduce_to_nd_op(
         }
     }
     writer.block_end();
-    writer << "idx" << last_r_idx << " <<= " << unroll_shift << ";\n";
     writer << "for(; idx" << last_r_idx << " < reduce_shape" << last_r_idx << "; idx"
            << last_r_idx << "++)\n";
     writer.block_begin();
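Taken together, the kernel-builder hunks above change the emitted reduction loop in two ways. First, the accumulator is now seeded from the thread's first input element (r = in[init_in_idx]) rather than 0, which is what makes the kernel correct for Max, Min, And, and Or, where 0 is not the identity element. Second, the 8x unrolled loop now guards with idx + unroll_num - 1 < reduce_shape instead of a shift-derived trip count, so unroll_shift and the idx <<= unroll_shift fix-up disappear. Below is a hedged sketch of what the emitted device code works out to for one output element with a single reduction axis and an "add" reduction function; the helper name and signature are illustrative, not the literal generated source:

#include <cstdint>

__device__ float add(float a, float b) { return a + b; }

// One thread reduces reduce_shape0 elements spaced `step` apart starting at
// in_idx; variable names follow the writer strings emitted above.
__device__ float reduce_one_output(const float* in, uint32_t in_idx,
                                   uint32_t step, uint32_t reduce_shape0)
{
    uint32_t init_in_idx = in_idx;
    float r = in[init_in_idx];     // seed with a real element, not 0
    uint32_t reduce_idx = in_idx;
    if (reduce_idx != init_in_idx) // never fires with one axis; with outer
    {                              // axes it guards against double-counting
        r = add(r, in[reduce_idx]);
    }
    reduce_idx += step;
    int idx0 = 1;                  // element 0 was consumed by the seed
    // the builder emits the 8 updates literally; a loop stands in here
    for (; idx0 + 8 - 1 < reduce_shape0; idx0 += 8)
    {
        for (int k = 0; k < 8; k++)
        {
            r = add(r, in[reduce_idx]);
            reduce_idx += step;
        }
    }
    for (; idx0 < reduce_shape0; idx0++) // remainder after the unrolled loop
    {
        r = add(r, in[reduce_idx]);
        reduce_idx += step;
    }
    return r;
}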
@@ -1022,12 +1022,22 @@ void runtime::gpu::GPU_Emitter::emit_Reduce(EMIT_ARGS)
 }
 else
 {
-    // in current implementation:
-    // 1. reduction function should only have one op
-    // 2. the op should be in the op_map
-    // otherwise, throw an error message
-    cudnnReduceTensorOp_t reduce_tensor_op;
+    auto axes_set = reduce_op->get_reduction_axes();
+    ngraph::AxisVector axes_vec;
+    for (auto a : axes_set)
+    {
+        axes_vec.push_back(a);
+    }
+    std::vector<string> dtypes;
+    dtypes.push_back(args[0].get_type());
+    dtypes.push_back(out[0].get_type());
+    auto& cuda_emitter = external_function->get_primitive_emitter()->get_cuda_emitter();
     auto reduction_function_ops = reduce_op->get_functions()[0]->get_ops();
+    size_t emitter_index;
+    // Reduction function should only have one op
+    std::shared_ptr<Node> reduce_func;
+    std::string op_name;
     int op_count = 0;
     for (auto op : reduction_function_ops)
     {
@@ -1036,38 +1046,52 @@ void runtime::gpu::GPU_Emitter::emit_Reduce(EMIT_ARGS)
             continue;
         }
         op_count++;
-        // Work around a compiler warning (*node inside typeid may have effects
-        // with shared pointers, which is fine here but clang doesn't like it.)
-        auto& fn = *op;
-        auto f_ptr = reduce_map.find(type_index(typeid(fn)));
-        if (f_ptr == reduce_map.end())
-        {
-            throw runtime_error("reduce with function " + fn.get_name() +
-                                " is not implement yet.");
-        }
-        else if (op_count != 1)
+        op_name = op->get_name();
+        reduce_func = op;
+        if (op_count != 1)
         {
             throw runtime_error("reduce with more than one op is not implemented yet.");
         }
-        else
-        {
-            reduce_tensor_op = f_ptr->second;
-        }
     }
-    std::vector<element::Type> dtypes{args[0].get_element_type(),
-                                      out[0].get_element_type()};
-    auto& cudnn_emitter =
-        external_function->get_primitive_emitter()->get_cudnn_emitter();
-    auto reduce_index =
-        cudnn_emitter->build_reduce_forward(reduce_tensor_op,
-                                            dtypes,
-                                            args[0].get_shape(),
-                                            reduce_op->get_reduction_axes(),
-                                            CUDNNEmitter::ReductionMode::Reduce);
+    if (dynamic_pointer_cast<ngraph::op::Add>(reduce_func))
+    {
+        emitter_index = cuda_emitter->build_reduce<ngraph::op::Add>(
+            dtypes, out[0].get_element_type().size(), args[0].get_shape(), axes_vec);
+    }
+    else if (dynamic_pointer_cast<ngraph::op::Multiply>(reduce_func))
+    {
+        emitter_index = cuda_emitter->build_reduce<ngraph::op::Multiply>(
+            dtypes, out[0].get_element_type().size(), args[0].get_shape(), axes_vec);
+    }
+    else if (dynamic_pointer_cast<ngraph::op::Maximum>(reduce_func))
+    {
+        emitter_index = cuda_emitter->build_reduce<ngraph::op::Maximum>(
+            dtypes, out[0].get_element_type().size(), args[0].get_shape(), axes_vec);
+    }
+    else if (dynamic_pointer_cast<ngraph::op::Minimum>(reduce_func))
+    {
+        emitter_index = cuda_emitter->build_reduce<ngraph::op::Minimum>(
+            dtypes, out[0].get_element_type().size(), args[0].get_shape(), axes_vec);
+    }
+    else if (dynamic_pointer_cast<ngraph::op::And>(reduce_func))
+    {
+        emitter_index = cuda_emitter->build_reduce<ngraph::op::And>(
+            dtypes, out[0].get_element_type().size(), args[0].get_shape(), axes_vec);
+    }
+    else if (dynamic_pointer_cast<ngraph::op::Or>(reduce_func))
+    {
+        emitter_index = cuda_emitter->build_reduce<ngraph::op::Or>(
+            dtypes, out[0].get_element_type().size(), args[0].get_shape(), axes_vec);
+    }
+    else
+    {
+        throw runtime_error("reduce with function " + op_name +
+                            " is not implemented yet.");
+    }
     writer << "void* input[] = {" << node_names(args) << "};\n";
     writer << "void* output[] = {" << node_names(out) << "};\n";
-    writer << "gpu::invoke_primitive(ctx, " << reduce_index << ", input, output);\n";
+    writer << "gpu::invoke_primitive(ctx, " << emitter_index << ", input, output);\n";
 }
 }
 }
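In emit_Reduce, the cuDNN reduce_map/typeid lookup is replaced by a dynamic_pointer_cast chain that selects a template instantiation of the CUDA emitter's build_reduce; casting the shared_ptr directly also removes the need for the old clang workaround around typeid(*op). The chain is verbose but mechanical. A condensed sketch of the same dispatch shape, where Node, Add, Multiply, and build_reduce are simplified stand-ins rather than the ngraph declarations:

#include <memory>
#include <stdexcept>
#include <string>

// Simplified stand-ins for the ngraph op hierarchy and the CUDA emitter.
struct Node { virtual ~Node() = default; std::string name = "Unknown"; };
struct Add : Node {};
struct Multiply : Node {};

struct CudaEmitter
{
    template <typename Op>
    size_t build_reduce() { return 0; /* compile + cache a kernel for Op */ }
};

// Same shape as the chain above: try each supported op in turn and
// instantiate the matching template; report unsupported ops by name.
size_t dispatch_reduce(CudaEmitter& emitter, const std::shared_ptr<Node>& reduce_func)
{
    if (std::dynamic_pointer_cast<Add>(reduce_func))
    {
        return emitter.build_reduce<Add>();
    }
    if (std::dynamic_pointer_cast<Multiply>(reduce_func))
    {
        return emitter.build_reduce<Multiply>();
    }
    // Maximum, Minimum, And, and Or follow the same pattern.
    throw std::runtime_error("reduce with function " + reduce_func->name +
                             " is not implemented yet.");
}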