Commit ed22bf6c authored by shssf, committed by Robert Kimball

IntelGPU backend: Sum operation optimization (#1545)

* IntelGPU backend: Sum operation optimization

* PR1545. Comments addressed. Test added. Helper function refactored.
parent 75a18827
@@ -59,43 +59,16 @@ void runtime::intelgpu::do_bcast_sum_operation(cldnn::topology& topology,
         }
         else
         {
-            gws = {1}; // non parallel version
-            // Initialize destination output by zeroes
-            size_t var_idx = 0;
-            for (auto const& i : output_shape)
-            {
-                writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
-                       << var_idx << ")\n";
-                writer.block_begin();
-                ++var_idx;
-            }
+            const string reduction_str =
+                "output" + access_dims(input_shape, "i", axis) + " = result;\n";
-            writer << "output" << access_dims(output_shape) << " = 0.0f;\n";
-            // Closing brackets for Sum initialization loop
-            for (auto const& i : output_shape)
-            {
-                writer.block_end();
-            }
-            // Now do the Sum operation
-            var_idx = 0;
-            for (auto const& i : input_shape)
-            {
-                writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
-                       << var_idx << ")\n";
-                writer.block_begin();
-                ++var_idx;
-            }
+            // Generate loops related to input order with GWS
+            gws = generate_loops_w_axes(writer, input_shape, true, axis, "float result = 0.0f;\n");
-            writer << "output" << access_dims(input_shape, "i", axis) << " += input0"
-                   << access_dims(input_shape) << ";\n";
+            writer << "result += input0" << access_dims(input_shape) << ";\n";
-            // Closing brackets for Sum loop
-            for (auto const& i : input_shape)
-            {
-                writer.block_end();
-            }
+            // Close brackets related to input order with reduction
+            generate_loops_w_axes(writer, input_shape, false, axis, reduction_str);
         }
     } // End of function bracket
     writer.block_end();
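For illustration, here is a hand-written sketch of the OpenCL C kernel the new Sum path would roughly emit for a small hypothetical case (input shape {2, 3, 4}, AxisSet{1}); the kernel name and argument declarations are assumptions, only the loop structure mirrors the strings generated above:

/* Sketch only, not repository output: assumed input shape {2, 3, 4}, axis {1}. */
__kernel void sum_example(const __global float input0[2][3][4], __global float output[2][4])
{
    /* Non-reduced dimensions are mapped onto the global work size (gws = {2, 4}). */
    const unsigned i0 = get_global_id(0); /* trip count 2 */
    const unsigned i2 = get_global_id(1); /* trip count 4 */

    float result = 0.0f;            /* injected via the begin-call "expression" */
    for (uint i1 = 0; i1 < 3; ++i1) /* the reduced axis stays a serial loop */
    {
        result += input0[i0][i1][i2];
    }
    output[i0][i2] = result;        /* reduction_str written by the closing call */
}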
@@ -159,6 +159,99 @@ vector<size_t> runtime::intelgpu::generate_loops(codegen::CodeWriter& writer,
return gws;
}
vector<size_t> runtime::intelgpu::generate_loops_w_axes(codegen::CodeWriter& writer,
const Shape& shape,
bool is_begin,
const AxisSet& axis,
const string& expression)
{
const size_t cldnn_gws_lim = 3;
vector<size_t> gws;
size_t var_idx = 0;
size_t dim_idx = 0;
if (is_begin)
{
for (auto const& i : shape)
{
if (axis.find(var_idx) == axis.end())
{
if (dim_idx < cldnn_gws_lim)
{
writer << "const unsigned i" << var_idx << " = get_global_id(" << dim_idx
<< "); /* trip count " << i << "*/\n";
gws.push_back(i);
++dim_idx;
}
else
{
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i
<< "; ++i" << var_idx << ")\n";
writer.block_begin();
}
}
++var_idx;
}
if (!expression.empty())
{
writer << expression;
}
var_idx = 0;
for (auto const& i : shape)
{
if (axis.find(var_idx) != axis.end())
{
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
<< var_idx << ")\n";
writer.block_begin();
}
++var_idx;
}
}
else
{ // is_begin == false
for (auto const& i : shape)
{
if (axis.find(var_idx) != axis.end())
{
writer.block_end();
}
++var_idx;
}
if (!expression.empty())
{
writer << expression;
}
var_idx = 0;
for (auto const& i : shape)
{
if (axis.find(var_idx) == axis.end())
{
if (dim_idx < cldnn_gws_lim)
{
++dim_idx;
}
else
{
writer.block_end();
}
}
++var_idx;
}
}
if (gws.empty())
{
gws.push_back(1);
}
return gws;
}
static string access_dims_strided(const Shape& dimentions,
const Shape& pad_below,
const Shape& pad_interior,
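To make the new helper's control flow concrete, the following hand-written sketch shows the loop nest it would emit for the 6D case exercised by the test added below (input shape {2, 6, 4, 5, 7, 3}, AxisSet{1, 4}); only the structure follows the code above, the surrounding kernel boilerplate is omitted. The first three non-reduced dimensions map to get_global_id(0..2) (gws = {2, 4, 5}), the fourth one exceeds cldnn_gws_lim and falls back to a serial loop, and the reduced axes become the innermost loops around the caller-supplied body:

/* Sketch only, not repository output: assumed shape {2, 6, 4, 5, 7, 3}, axis {1, 4}. */
const unsigned i0 = get_global_id(0); /* trip count 2 */
const unsigned i2 = get_global_id(1); /* trip count 4 */
const unsigned i3 = get_global_id(2); /* trip count 5 */
for (uint i5 = 0; i5 < 3; ++i5)       /* beyond cldnn_gws_lim, so a serial loop */
{
    float result = 0.0f;              /* "expression" passed to the begin call */
    for (uint i1 = 0; i1 < 6; ++i1)   /* reduced axes become the inner loops */
    {
        for (uint i4 = 0; i4 < 7; ++i4)
        {
            result += input0[i0][i1][i2][i3][i4][i5]; /* body written by the caller */
        }
    }
    output[i0][i2][i3][i5] = result;  /* "expression" passed to the end call */
}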
@@ -157,6 +157,12 @@ namespace ngraph
bool is_reversed = false);
std::vector<size_t>
generate_loops(codegen::CodeWriter& writer, const Shape& shape, bool is_begin);
std::vector<size_t>
generate_loops_w_axes(codegen::CodeWriter& writer,
const Shape& shape,
bool is_begin,
const AxisSet& axis = {},
const std::string& expression = std::string());
void gen_func_def(codegen::CodeWriter& writer,
const std::string& entry_point_name,
const std::vector<std::string>& input_types,
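Since the axis set and expression default to empty, the helper can also open and close plain loops over a shape. A minimal host-side sketch, assuming writer, shape and the access_dims helper are in scope as in the kernels above (the copied statement is illustrative only):

// Sketch only: with an empty AxisSet and no expression, every dimension takes the
// non-reduced path - the first three map to get_global_id(), the rest become for loops.
std::vector<size_t> gws = generate_loops_w_axes(writer, shape, true);
writer << "output" << access_dims(shape) << " = input0" << access_dims(shape) << ";\n";
generate_loops_w_axes(writer, shape, false);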
@@ -45,80 +45,6 @@ static Shape shape_dims(const Shape& dimentions, const AxisSet& axis = {})
return output_shape;
}
static vector<size_t> generate_loops_w_axes(codegen::CodeWriter& writer,
const Shape& shape,
bool is_begin,
const AxisSet& axis,
const string& expression)
{
const size_t cldnn_gws_lim = 3;
vector<size_t> gws;
size_t var_idx = 0;
size_t dim_idx = 0;
for (auto const& i : shape)
{
if (axis.find(var_idx) == axis.end())
{
if (dim_idx < cldnn_gws_lim)
{
if (is_begin)
{
writer << "const unsigned i" << var_idx << " = get_global_id(" << dim_idx
<< ");\n";
gws.push_back(i);
}
++dim_idx;
}
else
{
if (is_begin)
{
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i
<< "; ++i" << var_idx << ")\n";
writer.block_begin();
}
else
{
writer.block_end();
}
}
}
++var_idx;
}
if (is_begin)
{
writer << expression;
}
var_idx = 0;
for (auto const& i : shape)
{
if (axis.find(var_idx) != axis.end())
{
if (is_begin)
{
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
<< var_idx << ")\n";
writer.block_begin();
}
else
{
writer.block_end();
}
}
++var_idx;
}
if (gws.empty())
{
gws.push_back(1);
}
return gws;
}
void runtime::intelgpu::do_softmax_operation(cldnn::topology& topology,
const string& input_name,
const Shape& input_shape,
@@ -3524,6 +3524,33 @@ NGRAPH_TEST(${BACKEND_NAME}, sum_matrix_columns)
EXPECT_EQ((vector<float>{1, 2, 3, 4, 5, 6}), read_vector<float>(a));
}
NGRAPH_TEST(${BACKEND_NAME}, sum_matrix_6d)
{
Shape shape_a{2, 6, 4, 5, 7, 3};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_rt{2, 4, 5, 3};
auto f = make_shared<Function>(make_shared<op::Sum>(A, AxisSet{1, 4}), op::ParameterVector{A});
auto backend_wrk = runtime::Backend::create("${BACKEND_NAME}");
auto backend_ref = runtime::Backend::create("INTERPRETER");
// Create some tensors for input/output
auto a_wrk = backend_wrk->create_tensor(element::f32, shape_a);
auto a_ref = backend_ref->create_tensor(element::f32, shape_a);
auto result_wrk = backend_wrk->create_tensor(element::f32, shape_rt);
auto result_ref = backend_ref->create_tensor(element::f32, shape_rt);
vector<float> inp_data(shape_size<const Shape>(shape_a));
iota(inp_data.begin(), inp_data.end(), 1);
copy_data(a_wrk, inp_data);
copy_data(a_ref, inp_data);
backend_wrk->call_with_validate(f, {result_wrk}, {a_wrk});
backend_ref->call_with_validate(f, {result_ref}, {a_ref});
EXPECT_EQ(read_vector<float>(result_ref), read_vector<float>(result_wrk));
}
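As a cross-check of the shapes used in this test, the expected result shape simply drops the reduced axes from the input shape. A small illustrative helper, not part of the repository (shape_dims in the softmax kernel file above does the equivalent):

// Illustrative sketch only: erase the reduced axes from the input shape.
// For {2, 6, 4, 5, 7, 3} with AxisSet{1, 4} this yields {2, 4, 5, 3} == shape_rt.
static Shape reduced_shape(const Shape& in, const AxisSet& axis)
{
    Shape out;
    for (size_t i = 0; i < in.size(); ++i)
    {
        if (axis.find(i) == axis.end()) // keep only non-reduced dimensions
        {
            out.push_back(in.at(i));
        }
    }
    return out;
}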
NGRAPH_TEST(${BACKEND_NAME}, sum_matrix_rows)
{
Shape shape_a{3, 2};