Commit cccdc304 authored by Ayan Moitra's avatar Ayan Moitra Committed by Scott Cyphers

Add cudnn sum and test that exposes tf test failure for nvGPU (#2033)

* Add cudnn sum and test that exposes failure

* Add test to interpreter manifest and move test to sum file

* change test name

* Fails on INTELGPU; no need to run on CPU

* Comment incorporation + Kahan summation for interpreter

* small oversight correction

* resolve conflict

* minor variable name edit
parent 76b8b4d4
@@ -1563,6 +1563,15 @@ void runtime::gpu::GPU_Emitter::emit_Subtract(EMIT_ARGS)
}
void runtime::gpu::GPU_Emitter::emit_Sum(EMIT_ARGS)
{
runtime::gpu::GPU_Emitter::emit_Sum_1(external_function, writer, node, args, out);
}
void runtime::gpu::GPU_Emitter::emit_Sum_0(EMIT_ARGS)
/* emit_Sum_0 uses native CUDA kernels to perform the Sum reduction. This method
is faster than the cuDNN implementation, but in its current state it is less
precise than cuDNN reduce, which causes the TensorFlow tests aimed at testing
stability to fail. */
{
const ngraph::op::Sum* sum = static_cast<const ngraph::op::Sum*>(node);
writer.block_begin();
@@ -1602,6 +1611,49 @@ void runtime::gpu::GPU_Emitter::emit_Sum(EMIT_ARGS)
writer.block_end();
}
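As context for the precision note above: straight sequential fp32 accumulation of many same-magnitude values, as the native kernels effectively do, drifts measurably from a double-precision reference. A minimal standalone C++ sketch, not part of this commit, using magnitudes that mirror the sum_stable_acc test below:

    #include <iostream>
    #include <vector>

    int main()
    {
        // 3000 values near 1000.0f, the same magnitudes the test below uses.
        std::vector<float> data(3000, 1000.1f);

        float naive = 0.0f;     // sequential fp32 accumulation
        double reference = 0.0; // double-precision reference
        for (float v : data)
        {
            naive += v;
            reference += v;
        }

        // Once the running total reaches ~3e6, each 1000.1f addend loses the
        // bits below 3e6 * 2^-24 ~= 0.18, and the error compounds per add.
        std::cout << "fp32: " << naive << "  f64: " << reference
                  << "  error: " << reference - naive << "\n";
    }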
void runtime::gpu::GPU_Emitter::emit_Sum_1(EMIT_ARGS)
/* emit_Sum_1 uses cuDNN to perform the Sum reduction. This method, although
slower than the native CUDA implementation, is more precise and fixes the
TensorFlow test failures. */
{
const ngraph::op::Sum* sum = static_cast<const ngraph::op::Sum*>(node);
std::vector<element::Type> dtypes{args[0].get_element_type(), out[0].get_element_type()};
cudnnReduceTensorOp_t reduce_op = CUDNN_REDUCE_TENSOR_ADD;
writer.block_begin();
{
if (out[0].get_size() != 0)
{
// one of the input axes has zero size; fill the output with zeros
if (args[0].get_size() == 0)
{
kernel::emit_memset(writer, out[0], 0);
}
else if (args[0].get_size() == out[0].get_size())
{
kernel::emit_memcpyDtD(writer, out[0], args[0]);
}
else
{
auto& cudnn_emitter =
external_function->get_primitive_emitter()->get_cudnn_emitter();
auto sum_index =
cudnn_emitter->build_reduce_forward(reduce_op,
dtypes,
args[0].get_shape(),
sum->get_reduction_axes(),
CUDNNEmitter::ReductionMode::Reduce);
writer << "gpu::invoke_primitive(ctx, " << sum_index << ", ";
writer << "std::vector<void*>{" << args[0].get_name() << "}.data(), ";
writer << "std::vector<void*>{" << out[0].get_name() << "}.data()";
writer << ");\n";
}
}
}
writer.block_end();
}
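For context, the cuDNN call that build_reduce_forward ultimately emits an invocation of looks roughly like the standalone sketch below. This is an assumption-laden illustration (a plain cudnnReduceTensor over a 4-D NCHW tensor shaped like the sum_stable_acc test, error checking omitted), not the CUDNNEmitter code itself:

    #include <cuda_runtime.h>
    #include <cudnn.h>
    #include <vector>

    // Sum-reduce a (10,10,10,30) NCHW tensor over axes {1,2,3} -> (10,1,1,1).
    int main()
    {
        cudnnHandle_t handle;
        cudnnCreate(&handle);

        cudnnTensorDescriptor_t in_desc, out_desc;
        cudnnCreateTensorDescriptor(&in_desc);
        cudnnCreateTensorDescriptor(&out_desc);
        cudnnSetTensor4dDescriptor(in_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 10, 10, 10, 30);
        // Reduced axes are expressed as size-1 dimensions in the output.
        cudnnSetTensor4dDescriptor(out_desc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 10, 1, 1, 1);

        cudnnReduceTensorDescriptor_t reduce_desc;
        cudnnCreateReduceTensorDescriptor(&reduce_desc);
        cudnnSetReduceTensorDescriptor(reduce_desc,
                                       CUDNN_REDUCE_TENSOR_ADD,
                                       CUDNN_DATA_FLOAT,
                                       CUDNN_PROPAGATE_NAN,
                                       CUDNN_REDUCE_TENSOR_NO_INDICES,
                                       CUDNN_32BIT_INDICES);

        size_t workspace_bytes = 0;
        cudnnGetReductionWorkspaceSize(handle, reduce_desc, in_desc, out_desc, &workspace_bytes);

        float *d_in, *d_out;
        void* d_workspace;
        cudaMalloc(&d_in, 10 * 10 * 10 * 30 * sizeof(float));
        cudaMalloc(&d_out, 10 * sizeof(float));
        cudaMalloc(&d_workspace, workspace_bytes);

        std::vector<float> host_in(10 * 10 * 10 * 30, 1000.1f);
        cudaMemcpy(d_in, host_in.data(), host_in.size() * sizeof(float), cudaMemcpyHostToDevice);

        const float alpha = 1.0f, beta = 0.0f;
        cudnnReduceTensor(handle, reduce_desc,
                          nullptr, 0, // no indices needed for ADD
                          d_workspace, workspace_bytes,
                          &alpha, in_desc, d_in,
                          &beta, out_desc, d_out);

        cudaFree(d_workspace);
        cudaFree(d_out);
        cudaFree(d_in);
        cudnnDestroyReduceTensorDescriptor(reduce_desc);
        cudnnDestroyTensorDescriptor(out_desc);
        cudnnDestroyTensorDescriptor(in_desc);
        cudnnDestroy(handle);
    }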
void runtime::gpu::GPU_Emitter::emit_Tan(EMIT_ARGS)
{
emit_elementwise<ngraph::op::Tan>(external_function, writer, node, args, out);
@@ -76,6 +76,8 @@ namespace ngraph
}
static void emit_ArgReduce(EMIT_ARGS, cudnnReduceTensorOp_t);
static void emit_Sum_0(EMIT_ARGS);
static void emit_Sum_1(EMIT_ARGS);
/// \brief Create a list of node names for each arg in args
/// \param args list of tensor arguments
@@ -119,3 +119,5 @@ shape_of_scalar
shape_of_vector
shape_of_matrix
shape_of_5d
sum_stable_acc
@@ -43,12 +43,14 @@ namespace ngraph
CoordinateTransform input_transform(in_shape);
// Kahan (compensated) summation: keep one compensation term per output
// coordinate, so rounding error does not leak between different outputs
// of the reduction.
std::vector<T> c(shape_size(out_shape), 0);
for (const Coordinate& input_coord : input_transform)
{
    Coordinate output_coord = reduce(input_coord, reduction_axes);
    size_t out_index = output_transform.index(output_coord);
    T y = arg[input_transform.index(input_coord)] - c[out_index];
    T t = out[out_index] + y;
    c[out_index] = (t - out[out_index]) - y;
    out[out_index] = t;
}
}
}
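To see why the compensated loop above helps, here is a minimal standalone C++ sketch of Kahan summation, not part of this commit; compare its error against the naive fp32 drift shown earlier:

    #include <iostream>
    #include <vector>

    int main()
    {
        std::vector<float> data(30000, 1000.1f);

        float sum = 0.0f, c = 0.0f; // c carries the low-order bits lost by sum
        for (float v : data)
        {
            float y = v - c;   // re-inject the previously lost bits
            float t = sum + y; // big + small: low bits of y are dropped here
            c = (t - sum) - y; // recover exactly what was dropped
            sum = t;
        }

        double reference = 0.0; // double-precision reference
        for (float v : data)
        {
            reference += v;
        }

        std::cout << "kahan error vs f64: " << reference - sum << "\n";
    }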
@@ -427,3 +427,38 @@ NGRAPH_TEST(${BACKEND_NAME}, sum_5d_to_scalar)
backend->call_with_validate(f, {result}, {a});
EXPECT_EQ(std::vector<float>{243.}, read_vector<float>(result));
}
#if NGRAPH_INTERPRETER_ENABLE
NGRAPH_TEST(${BACKEND_NAME}, sum_stable_acc)
{
std::string backend_name = "${BACKEND_NAME}";
if (backend_name == "INTERPRETER")
{
    // The test compares ${BACKEND_NAME} against the INTERPRETER reference,
    // so running it on INTERPRETER itself proves nothing; skip this test
    // rather than terminating the whole test binary.
    return;
}
Shape shape_a{10, 10, 10, 30};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_rt{10};
auto f =
make_shared<Function>(make_shared<op::Sum>(A, AxisSet{1, 2, 3}), op::ParameterVector{A});
test::Uniform<float> rng(1000.0f, 1000.1f, 2112);
vector<vector<float>> args;
for (shared_ptr<op::Parameter> param : f->get_parameters())
{
vector<float> tensor_val(shape_size(param->get_shape()));
rng.initialize(tensor_val);
args.push_back(tensor_val);
}
auto ref_func = clone_function(*f);
auto bk_func = clone_function(*f);
auto ref_results = execute(ref_func, args, "INTERPRETER");
auto bk_results = execute(bk_func, args, "${BACKEND_NAME}");
EXPECT_TRUE(test::all_close_f(ref_results.at(0), bk_results.at(0), 24, 3));
}
#endif
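The all_close_f check above compares the two results at a given mantissa-bit granularity rather than with an absolute epsilon. As a rough illustration of that style of comparison, here is a hypothetical helper, not nGraph's actual test::all_close_f, that measures how many representable floats (ULPs) apart two values are:

    #include <cmath>
    #include <cstdint>
    #include <cstdlib>
    #include <cstring>
    #include <iostream>
    #include <limits>

    // Remap sign-magnitude float bits onto a monotonically increasing
    // integer scale, so adjacent floats map to adjacent integers.
    static int32_t to_ordered(float f)
    {
        int32_t i;
        std::memcpy(&i, &f, sizeof i);
        return i >= 0 ? i : std::numeric_limits<int32_t>::min() - i;
    }

    // Distance between two floats in units in the last place (ULPs).
    static int64_t ulp_distance(float a, float b)
    {
        return std::llabs(static_cast<int64_t>(to_ordered(a)) - to_ordered(b));
    }

    int main()
    {
        float x = 3000300.0f;               // magnitude of the sums above
        float y = std::nextafterf(x, 4e6f); // one representable float higher
        std::cout << "distance: " << ulp_distance(x, y) << " ULP\n"; // 1
    }

Roughly speaking, the mantissa-bit and tolerance-bit parameters passed to all_close_f express the same idea at a coarser granularity: how many low-order mantissa bits the two results are allowed to disagree on.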