Commit b74742ec authored by Fenglei's avatar Fenglei Committed by Robert Kimball

add gpu reduction (#985)

* add reduce op

* fix bug

* fix bug and enable tests

* hack solution to get reduction function in reduct op

* hack version working on all tests

* fixed the reduction checking process
parent c2b0b066
...@@ -100,6 +100,41 @@ ...@@ -100,6 +100,41 @@
#include "ngraph/util.hpp" #include "ngraph/util.hpp"
using namespace std; using namespace std;
// reduction function supported by GPU
// CUDNN_REDUCE_TENSOR_ADD
// CUDNN_REDUCE_TENSOR_MUL
// CUDNN_REDUCE_TENSOR_MIN
// CUDNN_REDUCE_TENSOR_MAX
// CUDNN_REDUCE_TENSOR_AMAX
// CUDNN_REDUCE_TENSOR_AVG
// CUDNN_REDUCE_TENSOR_NORM1
// CUDNN_REDUCE_TENSOR_NORM2
// CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS
#define TI(x) type_index(typeid(x))
static const std::unordered_map<std::type_index, cudnnReduceTensorOp_t> reduce_map{
{TI(ngraph::op::Add), CUDNN_REDUCE_TENSOR_ADD},
{TI(ngraph::op::Multiply), CUDNN_REDUCE_TENSOR_MUL},
{TI(ngraph::op::Maximum), CUDNN_REDUCE_TENSOR_MAX},
{TI(ngraph::op::Minimum), CUDNN_REDUCE_TENSOR_MIN},
};
// cudnn support elementwised op
// CUDNN_OP_TENSOR_ADD
// CUDNN_OP_TENSOR_MUL
// CUDNN_OP_TENSOR_MIN
// CUDNN_OP_TENSOR_MAX
// CUDNN_OP_TENSOR_SQRT
// CUDNN_OP_TENSOR_NOT
static const std::unordered_map<std::type_index, cudnnOpTensorOp_t> element_op_map{
{TI(ngraph::op::Add), CUDNN_OP_TENSOR_ADD},
{TI(ngraph::op::Multiply), CUDNN_OP_TENSOR_MUL},
{TI(ngraph::op::Maximum), CUDNN_OP_TENSOR_MAX},
{TI(ngraph::op::Minimum), CUDNN_OP_TENSOR_MIN},
{TI(ngraph::op::Sqrt), CUDNN_OP_TENSOR_SQRT}};
namespace ngraph namespace ngraph
{ {
namespace runtime namespace runtime
...@@ -1281,6 +1316,87 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc, ...@@ -1281,6 +1316,87 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
return; return;
} }
template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::Reduce)
{
const ngraph::op::Reduce* reduce_op = static_cast<const ngraph::op::Reduce*>(node);
writer.block_begin(" // " + node->get_name());
{
if (out[0].get_size() != 0)
{
// one of args0 axes has zero size, zero output, use args1 value
if (args[0].get_size() == 0)
{
writer << "float init_value;\n";
writer << "runtime::gpu::cuda_memcpyDtH(&init_value, "
<< args[1].get_name() << " ,"
<< args[1].get_element_type().size() << ");\n";
writer << "std::vector<float> temp(" << out[0].get_size()
<< ", init_value);\n";
writer << "runtime::gpu::cuda_memcpyHtD(" << out[0].get_name()
<< ", (void*)temp.data(), " << out[0].get_size() << " * "
<< out[0].get_element_type().size() << ");\n";
}
else if (args[0].get_shape().size() == out[0].get_shape().size())
{
kernel::emit_memcpyDtD(writer, out[0], args[0]);
}
else
{
// in current implementation:
// 1. reduction function should only have one op
// 2. the op should be in the op_map
// otherwise, throw an error message
auto reduction_function_ops = reduce_op->get_functions()[0]->get_ops();
cudnnReduceTensorOp_t reduce_tensor_op;
int op_count = 0;
for (auto op : reduction_function_ops)
{
if (op->is_constant() || op->is_parameter() || op->is_output())
{
continue;
}
op_count++;
// Work around a compiler warning (*node inside typeid may have effects
// with shared pointers, which is fine here but clang doesn't like it.)
auto& fn = *op;
auto f_ptr = reduce_map.find(type_index(typeid(fn)));
if (f_ptr == reduce_map.end())
{
throw std::runtime_error("reduce with function " +
fn.get_name() +
" is not implement yet.");
}
else if (op_count != 1)
{
throw std::runtime_error(
"reduce with more than one op is not implement yet.");
}
else
{
reduce_tensor_op = f_ptr->second;
}
}
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
auto reduce_index = cudnn_emitter->build_reduce_forward(
external_function->ctx().get(),
reduce_tensor_op,
args[0].get_shape(),
reduce_op->get_reduction_axes());
writer << "gpu::invoke_primitive(ctx, " << reduce_index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
writer << ");\n";
}
}
}
writer.block_end();
return;
}
template <> template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::Pad) void GPU_Emitter::EMITTER_DECL(ngraph::op::Pad)
{ {
......
...@@ -118,15 +118,6 @@ product_to_scalar ...@@ -118,15 +118,6 @@ product_to_scalar
product_trivial product_trivial
product_trivial_5d product_trivial_5d
product_vector_zero product_vector_zero
reduce_3d_to_vector
reduce_matrix_cols_zero
reduce_matrix_columns
reduce_matrix_rows
reduce_matrix_rows_zero
reduce_matrix_to_scalar_zero_by_zero
reduce_to_scalar
reduce_trivial
reduce_vector_zero
reduce_window_emulating_max_pool_1d_1channel_1image reduce_window_emulating_max_pool_1d_1channel_1image
reduce_window_emulating_max_pool_1d_1channel_2image reduce_window_emulating_max_pool_1d_1channel_2image
reduce_window_emulating_max_pool_1d_2channel_2image reduce_window_emulating_max_pool_1d_2channel_2image
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment