Commit b74742ec authored by Fenglei's avatar Fenglei Committed by Robert Kimball

add gpu reduction (#985)

* add reduce op

* fix bug

* fix bug and enable tests

* hack solution to get reduction function in reduce op

* hack version working on all tests

* fixed the reduction checking process
parent c2b0b066
......@@ -100,6 +100,41 @@
#include "ngraph/util.hpp"
using namespace std;
// reduction function supported by GPU
// CUDNN_REDUCE_TENSOR_ADD
// CUDNN_REDUCE_TENSOR_MUL
// CUDNN_REDUCE_TENSOR_MIN
// CUDNN_REDUCE_TENSOR_MAX
// CUDNN_REDUCE_TENSOR_AMAX
// CUDNN_REDUCE_TENSOR_AVG
// CUDNN_REDUCE_TENSOR_NORM1
// CUDNN_REDUCE_TENSOR_NORM2
// CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS
// Shorthand: std::type_index for a statically-known type (used as a map key below).
#define TI(x) type_index(typeid(x))
// Maps an ngraph elementwise op type (the single op allowed inside a Reduce's
// reduction function) to the matching cuDNN reduction descriptor. Ops absent
// from this table are rejected by the Reduce emitter with a runtime_error.
static const std::unordered_map<std::type_index, cudnnReduceTensorOp_t> reduce_map{
{TI(ngraph::op::Add), CUDNN_REDUCE_TENSOR_ADD},
{TI(ngraph::op::Multiply), CUDNN_REDUCE_TENSOR_MUL},
{TI(ngraph::op::Maximum), CUDNN_REDUCE_TENSOR_MAX},
{TI(ngraph::op::Minimum), CUDNN_REDUCE_TENSOR_MIN},
};
// cudnn support elementwised op
// CUDNN_OP_TENSOR_ADD
// CUDNN_OP_TENSOR_MUL
// CUDNN_OP_TENSOR_MIN
// CUDNN_OP_TENSOR_MAX
// CUDNN_OP_TENSOR_SQRT
// CUDNN_OP_TENSOR_NOT
// Maps an ngraph elementwise op type to the matching cuDNN tensor-op
// descriptor (cudnnOpTensor). Only the op types listed here are supported;
// lookups for other types will miss (callers must handle reduce_map/element
// misses themselves — TI yields the dynamic type's type_index).
static const std::unordered_map<std::type_index, cudnnOpTensorOp_t> element_op_map{
{TI(ngraph::op::Add), CUDNN_OP_TENSOR_ADD},
{TI(ngraph::op::Multiply), CUDNN_OP_TENSOR_MUL},
{TI(ngraph::op::Maximum), CUDNN_OP_TENSOR_MAX},
{TI(ngraph::op::Minimum), CUDNN_OP_TENSOR_MIN},
{TI(ngraph::op::Sqrt), CUDNN_OP_TENSOR_SQRT}};
namespace ngraph
{
namespace runtime
......@@ -1281,6 +1316,87 @@ CUDNN_SAFE_CALL(cudnnSetOpTensorDescriptor(opTensorDesc,
return;
}
template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::Reduce)
{
    // Emits GPU code for ngraph::op::Reduce via cuDNN.
    // Current limitations (enforced below):
    //   1. the reduction function must contain exactly one non-trivial op
    //   2. that op must appear in reduce_map (Add/Multiply/Maximum/Minimum)
    const ngraph::op::Reduce* reduce_op = static_cast<const ngraph::op::Reduce*>(node);
    writer.block_begin(" // " + node->get_name());
    {
        if (out[0].get_size() != 0)
        {
            if (args[0].get_size() == 0)
            {
                // One of args0's axes has zero size, so the reduction is empty:
                // fill the output with the scalar init value held in args1.
                writer << "float init_value;\n";
                writer << "runtime::gpu::cuda_memcpyDtH(&init_value, " << args[1].get_name()
                       << " ," << args[1].get_element_type().size() << ");\n";
                writer << "std::vector<float> temp(" << out[0].get_size()
                       << ", init_value);\n";
                writer << "runtime::gpu::cuda_memcpyHtD(" << out[0].get_name()
                       << ", (void*)temp.data(), " << out[0].get_size() << " * "
                       << out[0].get_element_type().size() << ");\n";
            }
            else if (args[0].get_shape().size() == out[0].get_shape().size())
            {
                // Input and output ranks match: no axis is actually reduced,
                // so a device-to-device copy suffices.
                kernel::emit_memcpyDtD(writer, out[0], args[0]);
            }
            else
            {
                // Inspect the user-supplied reduction function and map its
                // single real op onto a cuDNN reduction descriptor.
                auto reduction_function_ops = reduce_op->get_functions()[0]->get_ops();
                // FIX: initialize — the original left this uninitialized, which is UB
                // if the loop below finds no mappable op (op_count stays 0).
                cudnnReduceTensorOp_t reduce_tensor_op = CUDNN_REDUCE_TENSOR_ADD;
                int op_count = 0;
                for (auto op : reduction_function_ops)
                {
                    // Parameters, constants and outputs are function plumbing,
                    // not the reduction op itself.
                    if (op->is_constant() || op->is_parameter() || op->is_output())
                    {
                        continue;
                    }
                    op_count++;
                    // Work around a compiler warning (*node inside typeid may have effects
                    // with shared pointers, which is fine here but clang doesn't like it.)
                    auto& fn = *op;
                    auto f_ptr = reduce_map.find(type_index(typeid(fn)));
                    if (f_ptr == reduce_map.end())
                    {
                        throw std::runtime_error("reduce with function " + fn.get_name() +
                                                 " is not implemented yet.");
                    }
                    else if (op_count != 1)
                    {
                        throw std::runtime_error(
                            "reduce with more than one op is not implemented yet.");
                    }
                    else
                    {
                        reduce_tensor_op = f_ptr->second;
                    }
                }
                // FIX: reject a reduction function with no mappable op at all;
                // previously this fell through with reduce_tensor_op uninitialized.
                if (op_count == 0)
                {
                    throw std::runtime_error(
                        "reduce with an empty reduction function is not implemented yet.");
                }
                auto& cudnn_emitter =
                    external_function->get_primitive_emitter()->get_cudnn_emitter();
                auto reduce_index =
                    cudnn_emitter->build_reduce_forward(external_function->ctx().get(),
                                                        reduce_tensor_op,
                                                        args[0].get_shape(),
                                                        reduce_op->get_reduction_axes());
                writer << "gpu::invoke_primitive(ctx, " << reduce_index << ", ";
                writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
                writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
                writer << ");\n";
            }
        }
    }
    writer.block_end();
    return;
}
template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::Pad)
{
......
......@@ -118,15 +118,6 @@ product_to_scalar
product_trivial
product_trivial_5d
product_vector_zero
reduce_3d_to_vector
reduce_matrix_cols_zero
reduce_matrix_columns
reduce_matrix_rows
reduce_matrix_rows_zero
reduce_matrix_to_scalar_zero_by_zero
reduce_to_scalar
reduce_trivial
reduce_vector_zero
reduce_window_emulating_max_pool_1d_1channel_1image
reduce_window_emulating_max_pool_1d_1channel_2image
reduce_window_emulating_max_pool_1d_2channel_2image
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment