Commit 6f30b32b authored by Ayan Moitra, committed by Scott Cyphers

Support ArgMin and ArgMax for NVGPU Backend (#1737)

* Project initialization commit

* Added unit tests for 3D tensors for argmax

* Refactored reduce to be shared by argmax/argmin; argmax/argmin still has some issues. WIP

* [WIP] First working version of ArgMax/ArgMin

* added reduce buffer for the cudnn api calls

* added reduce buffer for the cudnn api calls

* Further modifications. Using rvalues to pass enums to build reduce method

* more unit tests added

* Incorporate Fenglei's comments

* Incorporating Chris's first set of comments

* small change to test file

* Resolving clang issue that was causing argmin test to fail

* Incorporate Chris's comments

* Fix clang-format issue
parent 2f49032f
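For context before the diff: the new ArgReduce path leans on cuDNN's reduction API, which can return the indices of max/min elements rather than only the reduced values; the "reduce buffer" bullets above presumably refer to the workspace and indices buffers this API requires. Below is a minimal standalone sketch of that mechanism, assuming cuDNN 7+. It is a hedged illustration, not this commit's code; error checking is omitted and buffer names are ours, not nGraph's.

// Sketch: cuDNN index-returning reduction (the mechanism behind ReductionMode::ArgReduce).
#include <cudnn.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main()
{
    cudnnHandle_t handle;
    cudnnCreate(&handle);

    // Reduce a 1x1x4x3 tensor over H (the rows), i.e. an argmax per column.
    cudnnTensorDescriptor_t in_desc, out_desc;
    cudnnCreateTensorDescriptor(&in_desc);
    cudnnCreateTensorDescriptor(&out_desc);
    cudnnSetTensor4dDescriptor(in_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 1, 4, 3);
    cudnnSetTensor4dDescriptor(out_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, 1, 1, 3);

    // Requesting FLATTENED_INDICES is what turns a plain reduction into an
    // arg-reduction; cuDNN computes indices only for MAX/MIN, only as 32-bit values.
    cudnnReduceTensorDescriptor_t reduce_desc;
    cudnnCreateReduceTensorDescriptor(&reduce_desc);
    cudnnSetReduceTensorDescriptor(reduce_desc,
                                   CUDNN_REDUCE_TENSOR_MAX,
                                   CUDNN_DATA_FLOAT,
                                   CUDNN_NOT_PROPAGATE_NAN,
                                   CUDNN_REDUCE_TENSOR_FLATTENED_INDICES,
                                   CUDNN_32BIT_INDICES);

    size_t workspace_size = 0, indices_size = 0;
    cudnnGetReductionWorkspaceSize(handle, reduce_desc, in_desc, out_desc, &workspace_size);
    cudnnGetReductionIndicesSize(handle, reduce_desc, in_desc, out_desc, &indices_size);

    std::vector<float> host_in = {8, 4, 1, 12, 10, 2, 2, 9, 5, 1, 5, 11};
    float *d_in, *d_out;
    void *d_workspace, *d_indices;
    cudaMalloc(&d_in, host_in.size() * sizeof(float));
    cudaMalloc(&d_out, 3 * sizeof(float));
    cudaMalloc(&d_workspace, workspace_size);
    cudaMalloc(&d_indices, indices_size);
    cudaMemcpy(d_in, host_in.data(), host_in.size() * sizeof(float), cudaMemcpyHostToDevice);

    const float alpha = 1.0f, beta = 0.0f;
    cudnnReduceTensor(handle, reduce_desc, d_indices, indices_size,
                      d_workspace, workspace_size,
                      &alpha, in_desc, d_in, &beta, out_desc, d_out);

    std::vector<int> host_idx(3);
    cudaMemcpy(host_idx.data(), d_indices, host_idx.size() * sizeof(int),
               cudaMemcpyDeviceToHost);
    for (int i = 0; i < 3; ++i)
        printf("col %d: index %d\n", i, host_idx[i]); // indices as cuDNN encodes them

    cudaFree(d_in); cudaFree(d_out); cudaFree(d_workspace); cudaFree(d_indices);
    cudnnDestroyReduceTensorDescriptor(reduce_desc);
    cudnnDestroyTensorDescriptor(in_desc);
    cudnnDestroyTensorDescriptor(out_desc);
    cudnnDestroy(handle);
}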
@@ -1783,8 +1783,9 @@ size_t runtime::gpu::CUDAEmitter::build_primitive(const op::Softmax* node)
     auto output_type = out[0].get_element_type().c_type_string();
     auto exp_index = build_elementwise<ngraph::op::Exp>({input_type, output_type}, input_shape);
+    std::vector<element::Type> dtypes{args[0].get_element_type(), out[0].get_element_type()};
     auto reduce_index = cudnn_emitter->build_reduce_forward(
-        CUDNN_REDUCE_TENSOR_ADD, output_type, input_shape, axes);
+        CUDNN_REDUCE_TENSOR_ADD, dtypes, input_shape, axes, CUDNNEmitter::ReductionMode::Reduce);
     size_t divide_index = build_softmax_divide(
         std::vector<std::string>(3, output_type), input_shape, reduced_shape, axes_flag);
......
This diff is collapsed.
@@ -72,6 +72,12 @@
                 Backward
             };

+            enum class ReductionMode
+            {
+                Reduce,
+                ArgReduce
+            };
+
             enum class algo_search
             {
                 HEURISTIC,
@@ -109,9 +115,10 @@
                                      const algo_search find_algo = algo_search::NONE);

             size_t build_reduce_forward(const cudnnReduceTensorOp_t& reduce_op,
-                                        const std::string& dtype,
+                                        const std::vector<element::Type>& dtypes,
                                         const Shape& input_shape,
-                                        const AxisSet& reduction_axes);
+                                        const AxisSet& reduction_axes,
+                                        const ReductionMode& reduction_mode);

             size_t build_tensor_op(const cudnnOpTensorOp_t& tensor_op,
                                    const std::string& dtype,
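The new reduction_mode parameter plausibly selects whether cuDNN is asked to compute indices. A hedged sketch of how such a switch can map onto the reduce descriptor follows; configure_reduce_descriptor and its parameters are our illustrative names, not necessarily the commit's internals.

#include <cudnn.h>

enum class ReductionMode { Reduce, ArgReduce };

// Configure a cuDNN reduce descriptor for either a plain reduction (values only)
// or an arg-reduction (indices requested).
void configure_reduce_descriptor(cudnnReduceTensorDescriptor_t desc,
                                 cudnnReduceTensorOp_t reduce_op,
                                 cudnnDataType_t comp_type,
                                 ReductionMode mode)
{
    const cudnnReduceTensorIndices_t indices =
        (mode == ReductionMode::ArgReduce) ? CUDNN_REDUCE_TENSOR_FLATTENED_INDICES
                                           : CUDNN_REDUCE_TENSOR_NO_INDICES;
    // Indices are only defined for MAX/MIN reductions and only as 32-bit values,
    // which is why ArgMax/ArgMin map to CUDNN_REDUCE_TENSOR_MAX/MIN below.
    cudnnSetReduceTensorDescriptor(
        desc, reduce_op, comp_type, CUDNN_NOT_PROPAGATE_NAN, indices, CUDNN_32BIT_INDICES);
}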
@@ -163,6 +170,7 @@
             void* get_data_by_type(cudnnDataType_t data_type, double value);

             cudnnDataType_t get_cudnn_datatype(std::string dtype);
+            cudnnDataType_t get_cudnn_datatype(const element::Type& dtype);

             cudnnTensorDescriptor_t&
                 tensor_descriptor_from_shape(const Shape& shape,
......
@@ -164,12 +164,45 @@ void runtime::gpu::GPU_Emitter::emit_And(EMIT_ARGS)

 void runtime::gpu::GPU_Emitter::emit_ArgMax(EMIT_ARGS)
 {
-    throw unsupported_op("Unsupported op '" + node->description() + "'");
+    cudnnReduceTensorOp_t reduce_op = CUDNN_REDUCE_TENSOR_MAX;
+    runtime::gpu::GPU_Emitter::emit_ArgReduce(
+        external_function, writer, node, args, out, reduce_op);
 }

 void runtime::gpu::GPU_Emitter::emit_ArgMin(EMIT_ARGS)
 {
-    throw unsupported_op("Unsupported op '" + node->description() + "'");
+    cudnnReduceTensorOp_t reduce_op = CUDNN_REDUCE_TENSOR_MIN;
+    runtime::gpu::GPU_Emitter::emit_ArgReduce(
+        external_function, writer, node, args, out, reduce_op);
+}
+
+void runtime::gpu::GPU_Emitter::emit_ArgReduce(EMIT_ARGS, cudnnReduceTensorOp_t reduce_op)
+{
+    if (out[0].get_size() == 0)
+    {
+        return;
+    }
+
+    // ArgMax and ArgMin both derive from op::util::IndexReduction; casting to the
+    // common base keeps this helper valid for either op.
+    auto arg_reduce = static_cast<const ngraph::op::util::IndexReduction*>(node);
+    auto axis_set = AxisSet{arg_reduce->get_reduction_axis()};
+    std::vector<element::Type> dtypes{args[0].get_element_type(), out[0].get_element_type()};
+
+    writer.block_begin();
+    {
+        auto& cudnn_emitter = external_function->get_primitive_emitter()->get_cudnn_emitter();
+        auto index = cudnn_emitter->build_reduce_forward(reduce_op,
+                                                         dtypes,
+                                                         args[0].get_shape(),
+                                                         axis_set,
+                                                         CUDNNEmitter::ReductionMode::ArgReduce);
+
+        writer << "void* input[] = {" << node_names(args) << "};\n";
+        writer << "void* output[] = {" << node_names(out) << "};\n";
+        writer << "gpu::invoke_primitive(ctx, " << index << ", input, output);\n";
+    }
+    writer.block_end();
 }

 void runtime::gpu::GPU_Emitter::emit_Asin(EMIT_ARGS)
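For readers new to this backend: build_reduce_forward registers a primitive (a callable wrapping the cuDNN invocation) and returns its index, and the emitted source carries only that index, calling gpu::invoke_primitive at runtime. A toy sketch of that pattern, with illustrative names rather than nGraph's actual types:

#include <cstddef>
#include <functional>
#include <iostream>
#include <vector>

using Primitive = std::function<void(void** inputs, void** outputs)>;

struct PrimitiveTable
{
    std::vector<Primitive> primitives;

    size_t register_primitive(Primitive p)
    {
        primitives.push_back(std::move(p));
        return primitives.size() - 1; // index baked into the generated source
    }

    void invoke(size_t index, void** inputs, void** outputs)
    {
        primitives[index](inputs, outputs);
    }
};

int main()
{
    PrimitiveTable table;
    // Stand-in for build_reduce_forward(..., ReductionMode::ArgReduce):
    size_t idx = table.register_primitive([](void** in, void** out) {
        auto src = static_cast<const float*>(in[0]);
        auto dst = static_cast<int*>(out[0]);
        // Argmax over a fixed 4-element vector, as a placeholder for the cuDNN call.
        int best = 0;
        for (int i = 1; i < 4; ++i)
            if (src[i] > src[best]) best = i;
        dst[0] = best;
    });

    float data[] = {1.0f, 5.0f, 3.0f, 2.0f};
    int result = -1;
    void* inputs[] = {data};
    void* outputs[] = {&result};
    table.invoke(idx, inputs, outputs); // mirrors gpu::invoke_primitive(ctx, idx, ...)
    std::cout << "argmax index: " << result << "\n"; // prints 1
}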
@@ -856,6 +889,7 @@ void runtime::gpu::GPU_Emitter::emit_Power(EMIT_ARGS)
 void runtime::gpu::GPU_Emitter::emit_Product(EMIT_ARGS)
 {
     const ngraph::op::Product* product = static_cast<const ngraph::op::Product*>(node);
+
     writer.block_begin();
     {
         if (out[0].get_size() != 0)
@@ -877,12 +911,16 @@ void runtime::gpu::GPU_Emitter::emit_Product(EMIT_ARGS)
                 // descriptors for tensors with <= 4 dimensions
                 else
                 {
+                    std::vector<element::Type> dtypes{args[0].get_element_type(),
+                                                      out[0].get_element_type()};
                     auto& cudnn_emitter =
                         external_function->get_primitive_emitter()->get_cudnn_emitter();
-                    auto index = cudnn_emitter->build_reduce_forward(CUDNN_REDUCE_TENSOR_MUL,
-                                                                     out[0].get_type(),
-                                                                     args[0].get_shape(),
-                                                                     product->get_reduction_axes());
+                    auto index =
+                        cudnn_emitter->build_reduce_forward(CUDNN_REDUCE_TENSOR_MUL,
+                                                            dtypes,
+                                                            args[0].get_shape(),
+                                                            product->get_reduction_axes(),
+                                                            CUDNNEmitter::ReductionMode::Reduce);

                     writer << "void* input[] = {" << node_names(args) << "};\n";
                     writer << "void* output[] = {" << node_names(out) << "};\n";
@@ -971,14 +1009,16 @@ void runtime::gpu::GPU_Emitter::emit_Reduce(EMIT_ARGS)
                         reduce_tensor_op = f_ptr->second;
                     }
                 }
+                std::vector<element::Type> dtypes{args[0].get_element_type(),
+                                                  out[0].get_element_type()};
                 auto& cudnn_emitter =
                     external_function->get_primitive_emitter()->get_cudnn_emitter();
                 auto reduce_index =
                     cudnn_emitter->build_reduce_forward(reduce_tensor_op,
-                                                        out[0].get_type(),
+                                                        dtypes,
                                                         args[0].get_shape(),
-                                                        reduce_op->get_reduction_axes());
+                                                        reduce_op->get_reduction_axes(),
+                                                        CUDNNEmitter::ReductionMode::Reduce);

                 writer << "void* input[] = {" << node_names(args) << "};\n";
                 writer << "void* output[] = {" << node_names(out) << "};\n";
......
@@ -75,6 +75,8 @@
                     writer.block_end();
                 }

+                static void emit_ArgReduce(EMIT_ARGS, cudnnReduceTensorOp_t);
+
             private:
                 /// \brief Create a list of node names for each arg in args
                 /// \param args list of tensor arguments
......
@@ -31,8 +31,6 @@ backwards_avgpool_n1_c1_hw4x4
 backwards_avgpool_n2_c2_hw4x4
 max_pool_3d
 avg_pool_3d
-argmin_trivial
-argmax_trivial
 topk_1d_max_all
 topk_1d_max_partial
 topk_1d_max_one
......
@@ -9483,9 +9483,51 @@ NGRAPH_TEST(${BACKEND_NAME}, argmin_trivial)
     EXPECT_EQ((vector<int>{3, 2, 1}), read_vector<int>(result));
 }

+NGRAPH_TEST(${BACKEND_NAME}, argmin_4D_axis_3)
+{
+    Shape shape{2, 2, 5, 5}; // NCHW -> (0,1,2,3)
+    Shape rshape{2, 2, 5};
+    auto A = make_shared<op::Parameter>(element::f32, shape);
+    auto f =
+        make_shared<Function>(make_shared<op::ArgMin>(A, 3, element::i32), op::ParameterVector{A});
+    auto backend = runtime::Backend::create("${BACKEND_NAME}");
+
+    // Create some tensors for input/output
+    auto a = backend->create_tensor(element::f32, shape);
+    copy_data(a,
+              test::NDArray<float, 4>({{{{0.5f, 1.5f, 0.8f, 2.9f, 1.05f}, // img 0 ch 0
+                                         {0.5f, 3.5f, 2.0f, 1.0f, 0.2f},
+                                         {2.0f, 0.0f, 2.2f, 0.2f, 1.4f},
+                                         {2.9f, 0.0f, 1.52f, 1.2f, 2.22f},
+                                         {5.0f, 2.0f, 1.0f, 0.5f, 0.85f}},
+                                        {{0.25f, 0.02f, 0.02f, 2.2f, 0.001f}, // img 0 ch 1
+                                         {1.0f, 0.2f, 3.0f, 0.25f, 1.14f},
+                                         {2.25f, 10.1f, 1.0f, 0.02f, 2.22f},
+                                         {3.2f, 1.002f, 0.001f, 0.2f, 6.0f},
+                                         {2.0f, 0.0f, 0.0f, 0.0f, 0.0f}}},
+                                       {{{0.0f, 2.2f, 1.2f, 1.6f, 0.2f}, // img 1 ch 0
+                                         {0.01f, 0.0f, 0.22f, 0.02f, 1.1f},
+                                         {0.01f, 0.5f, 1.6f, 0.2f, 3.2f},
+                                         {2.4f, 0.5f, 0.0f, 3.0f, 0.1f},
+                                         {0.0f, 0.5f, 0.4f, 0.8f, 1.0f}},
+                                        {{2.0f, 1.0f, 0.0f, 0.0f, 1.0f}, // img 1 ch 1
+                                         {0.0f, 2.0f, 0.0f, 0.0f, 0.0f},
+                                         {1.0f, 1.0f, 2.0f, 0.0f, 2.0f},
+                                         {1.0f, 1.0f, 1.0f, 0.0f, 1.0f},
+                                         {1.0f, 0.0f, 0.0f, 0.0f, 2.0f}}}})
+                  .get_vector());
+    auto result = backend->create_tensor(element::i32, rshape);
+
+    backend->call_with_validate(f, {result}, {a});
+    EXPECT_EQ((test::NDArray<int, 3>({{{0, 4, 1, 1, 3}, // img 0 ch 0
+                                       {4, 1, 3, 2, 1}}, // img 0 ch 1
+                                      {{0, 1, 0, 2, 0}, // img 1 ch 0
+                                       {2, 0, 3, 3, 1}}}) // img 1 ch 1
+                  .get_vector()),
+              read_vector<int>(result));
+}
+
 NGRAPH_TEST(${BACKEND_NAME}, argmax_trivial)
 {
-    Shape shape{4, 3};
+    Shape shape{4, 3}; // HW -> (0,1)
     Shape rshape{3};
     auto A = make_shared<op::Parameter>(element::f32, shape);
     auto f =
@@ -9502,6 +9544,168 @@ NGRAPH_TEST(${BACKEND_NAME}, argmax_trivial)
     EXPECT_EQ((vector<int>{1, 3, 0}), read_vector<int>(result));
 }

+NGRAPH_TEST(${BACKEND_NAME}, argmax_3D_axis_0) // Along Channels
+{
+    Shape shape{3, 4, 2}; // CHW -> (0,1,2)
+    Shape rshape{4, 2};
+    auto A = make_shared<op::Parameter>(element::f32, shape);
+    auto f =
+        make_shared<Function>(make_shared<op::ArgMax>(A, 0, element::i32), op::ParameterVector{A});
+    auto backend = runtime::Backend::create("${BACKEND_NAME}");
+
+    // Create some tensors for input/output
+    auto a = backend->create_tensor(element::f32, shape);
+    copy_data(a,
+              test::NDArray<float, 3>({{{8, 4}, // ch0
+                                        {12, 10},
+                                        {2, 9},
+                                        {1, 5}},
+                                       {{6, 7}, // ch1
+                                        {11, 3},
+                                        {9, 2},
+                                        {10, 12}},
+                                       {{8, 4}, // ch2
+                                        {6, 1},
+                                        {5, 3},
+                                        {11, 7}}})
+                  .get_vector());
+    auto result = backend->create_tensor(element::i32, rshape);
+
+    backend->call_with_validate(f, {result}, {a});
+    EXPECT_EQ((test::NDArray<int, 2>({{0, 1}, // r0
+                                      {0, 0}, // r1
+                                      {1, 0}, // r2
+                                      {2, 1}}) // r3
+                  .get_vector()),
+              read_vector<int>(result));
+}
+
+NGRAPH_TEST(${BACKEND_NAME}, argmax_3D_axis_1) // Along Height
+{
+    Shape shape{3, 4, 2}; // CHW -> (0,1,2)
+    Shape rshape{3, 2};
+    auto A = make_shared<op::Parameter>(element::f32, shape);
+    auto f =
+        make_shared<Function>(make_shared<op::ArgMax>(A, 1, element::i32), op::ParameterVector{A});
+    auto backend = runtime::Backend::create("${BACKEND_NAME}");
+
+    // Create some tensors for input/output
+    auto a = backend->create_tensor(element::f32, shape);
+    copy_data(a,
+              test::NDArray<float, 3>({{{8, 4}, // ch0
+                                        {12, 10},
+                                        {2, 9},
+                                        {1, 5}},
+                                       {{6, 7}, // ch1
+                                        {11, 3},
+                                        {9, 2},
+                                        {10, 12}},
+                                       {{8, 4}, // ch2
+                                        {6, 1},
+                                        {5, 3},
+                                        {11, 7}}})
+                  .get_vector());
+    auto result = backend->create_tensor(element::i32, rshape);
+
+    backend->call_with_validate(f, {result}, {a});
+    EXPECT_EQ((test::NDArray<int, 2>({{1, 1}, // ch0
+                                      {1, 3}, // ch1
+                                      {3, 3}}) // ch2
+                  .get_vector()),
+              read_vector<int>(result));
+}
+
+NGRAPH_TEST(${BACKEND_NAME}, argmax_3D_axis_2) // Along Width
+{
+    Shape shape{3, 4, 2}; // CHW -> (0,1,2)
+    Shape rshape{3, 4};
+    auto A = make_shared<op::Parameter>(element::f32, shape);
+    auto f =
+        make_shared<Function>(make_shared<op::ArgMax>(A, 2, element::i32), op::ParameterVector{A});
+    auto backend = runtime::Backend::create("${BACKEND_NAME}");
+
+    // Create some tensors for input/output
+    auto a = backend->create_tensor(element::f32, shape);
+    copy_data(a,
+              test::NDArray<float, 3>({{{8, 4}, // ch0
+                                        {12, 10},
+                                        {2, 9},
+                                        {1, 5}},
+                                       {{6, 7}, // ch1
+                                        {11, 3},
+                                        {9, 2},
+                                        {10, 12}},
+                                       {{8, 4}, // ch2
+                                        {6, 1},
+                                        {5, 3},
+                                        {11, 7}}})
+                  .get_vector());
+    auto result = backend->create_tensor(element::i32, rshape);
+
+    backend->call_with_validate(f, {result}, {a});
+    EXPECT_EQ((test::NDArray<int, 2>({{0, 0, 1, 1}, // ch0
+                                      {1, 0, 0, 1}, // ch1
+                                      {0, 0, 0, 0}}) // ch2
+                  .get_vector()),
+              read_vector<int>(result));
+}
+
+NGRAPH_TEST(${BACKEND_NAME}, argmax_4D_axis_3)
+{
+    Shape shape{2, 2, 5, 5}; // NCHW -> (0,1,2,3)
+    Shape rshape{2, 2, 5};
+    auto A = make_shared<op::Parameter>(element::f32, shape);
+    auto f =
+        make_shared<Function>(make_shared<op::ArgMax>(A, 3, element::i32), op::ParameterVector{A});
+    auto backend = runtime::Backend::create("${BACKEND_NAME}");
+
+    // Create some tensors for input/output
+    auto a = backend->create_tensor(element::f32, shape);
+    copy_data(a,
+              test::NDArray<float, 4>({{{{0, 1, 0, 2, 1}, // img 0 ch 0
+                                         {0, 3, 2, 0, 0},
+                                         {2, 0, 0, 0, 1},
+                                         {2, 0, 1, 1, 2},
+                                         {0, 2, 1, 0, 0}},
+                                        {{0, 0, 0, 2, 0}, // img 0 ch 1
+                                         {0, 2, 3, 0, 1},
+                                         {2, 0, 1, 0, 2},
+                                         {3, 1, 0, 0, 0},
+                                         {2, 0, 0, 0, 0}}},
+                                       {{{0, 2, 1, 1, 0}, // img 1 ch 0
+                                         {0, 0, 2, 0, 1},
+                                         {0, 0, 1, 2, 3},
+                                         {2, 0, 0, 3, 0},
+                                         {0, 0, 0, 0, 0}},
+                                        {{2, 1, 0, 0, 1}, // img 1 ch 1
+                                         {0, 2, 0, 0, 0},
+                                         {1, 1, 2, 0, 2},
+                                         {1, 1, 1, 0, 1},
+                                         {1, 0, 0, 0, 2}}}})
+                  .get_vector());
+    auto result = backend->create_tensor(element::i32, rshape);
+
+    backend->call_with_validate(f, {result}, {a});
+    EXPECT_EQ((test::NDArray<int, 3>({{{3, 1, 0, 0, 1}, {3, 2, 0, 0, 0}}, // img 0
+                                      {{1, 2, 4, 3, 0}, {0, 1, 2, 0, 4}}}) // img 1
+                  .get_vector()),
+              read_vector<int>(result));
+}
+
 NGRAPH_TEST(${BACKEND_NAME}, topk_1d_max_all)
 {
     Shape shape{6};
......