Commit 176e105b authored by shssf, committed by Scott Cyphers

IntelGPU backend: Broadcast operation optimization (#1450)

* IntelGPU backend: Broadcast operation optimization

* PR1450 style check fixed
parent 0593746d
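
What this change does, in short: the dedicated scalar kernels emitted by `do_bcast_sum_operation_scalar` are deleted, `do_bcast_sum_operation` now covers the empty-shape (scalar) cases too, and the broadcast path switches from hand-written loop emission to the shared `gen_func_def`/`generate_loops` helpers. `generate_loops` also returns the global work size (`gws`) handed to `cldnn::custom_gpu_primitive`, so broadcast now launches one work item per output element instead of the previous fixed `{1}`.

A minimal sketch of the `generate_loops` contract as inferred from the call sites in this diff; the actual helper lives elsewhere in the IntelGPU backend, and the split between OpenCL work dimensions and plain loops shown here is an assumption:

```cpp
#include <cstddef>
#include <sstream>
#include <vector>

// Hypothetical stand-in for runtime::intelgpu::generate_loops. With
// is_begin == true it emits one index definition per output dimension and
// returns the global work size; with is_begin == false it emits the matching
// closing braces. OpenCL exposes at most three work dimensions, so any
// further dimensions are assumed to fall back to ordinary serial loops.
std::vector<size_t> generate_loops_sketch(std::ostringstream& writer,
                                          const std::vector<size_t>& shape,
                                          bool is_begin)
{
    const size_t max_work_dims = 3; // get_global_id() limit in OpenCL
    std::vector<size_t> gws;
    for (size_t var_idx = 0; var_idx < shape.size(); ++var_idx)
    {
        if (is_begin)
        {
            if (var_idx < max_work_dims)
            {
                // Parallel dimension: one work item per element.
                writer << "const uint i" << var_idx << " = get_global_id("
                       << var_idx << ");\n";
                gws.push_back(shape.at(var_idx));
            }
            else
            {
                // Leftover dimension: serial loop inside each work item.
                writer << "for (uint i" << var_idx << " = 0; i" << var_idx
                       << " < " << shape.at(var_idx) << "; ++i" << var_idx
                       << ")\n{\n";
            }
        }
        else if (var_idx >= max_work_dims)
        {
            writer << "}\n"; // close only the serial loops
        }
    }
    return gws;
}
```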
@@ -525,16 +525,6 @@ bool runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
 {
     do_equal_propagation(topology, get_input_name(op), get_output_name(op));
 }
-else if (get_input_shape(op).empty())
-{
-    do_bcast_sum_operation_scalar(topology,
-                                  get_input_name(op),
-                                  get_input_shape(op),
-                                  get_output_name(op),
-                                  get_output_shape(op),
-                                  get_output_type(op),
-                                  true);
-}
 else
 {
     do_bcast_sum_operation(topology,
@@ -558,16 +548,6 @@ bool runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
 {
     do_equal_propagation(topology, get_input_name(op), get_output_name(op));
 }
-else if (get_output_shape(op).empty())
-{
-    do_bcast_sum_operation_scalar(topology,
-                                  get_input_name(op),
-                                  get_input_shape(op),
-                                  get_output_name(op),
-                                  get_output_shape(op),
-                                  get_output_type(op),
-                                  false);
-}
 else
 {
     do_bcast_sum_operation(topology,
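Both hunks above make the same simplification: the `compile()` dispatch for Broadcast and for Sum no longer special-cases empty shapes. Roughly (the guard condition and the full argument list sit outside the visible hunks, so they are elided here):

```cpp
// Sketch of the dispatch after this commit; not verbatim backend code.
if (/* condition elided in the diff */)
{
    // Presumably reuses the input tensor for the output when no work is needed.
    do_equal_propagation(topology, get_input_name(op), get_output_name(op));
}
else
{
    // Now also handles empty (scalar) shapes, which previously went
    // through do_bcast_sum_operation_scalar().
    do_bcast_sum_operation(topology /* , ...arguments elided... */);
}
```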
@@ -28,54 +28,6 @@
 using namespace std;
 using namespace ngraph;
 
-void runtime::intelgpu::do_bcast_sum_operation_scalar(cldnn::topology& topology,
-                                                      const string& input_name,
-                                                      const Shape& input_shape,
-                                                      const string& output_name,
-                                                      const Shape& output_shape,
-                                                      const element::Type& output_type,
-                                                      bool is_bcast)
-{
-    string function_name = is_bcast ? "broadcast_scalar" : "sum_scalar";
-    function_name += output_name;
-    const size_t input_count =
-        is_bcast ? shape_size<Shape>(output_shape) : shape_size<Shape>(input_shape);
-
-    codegen::CodeWriter writer;
-
-    writer << "__kernel void " << function_name
-           << "(const __global float* input, __global float* output)\n";
-    writer.block_begin();
-    {
-        writer << "float sum = 0.f;\n"
-               << "for (uint i = 0; i < COUNT; ++i)\n";
-        writer.block_begin();
-        if (is_bcast)
-        {
-            writer << "output[i] = input[0];\n";
-            writer.block_end();
-        }
-        else
-        {
-            writer << "sum += input[i];\n";
-            writer.block_end();
-            writer << "output[0] = sum;\n";
-        }
-    } // End of function bracket
-    writer.block_end();
-
-    const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
-    const cldnn::custom_gpu_primitive op_scalar(output_name,
-                                                {input_name},
-                                                {writer.get_code()},
-                                                function_name,
-                                                get_kernel_args(1, 1),
-                                                string("-DCOUNT=" + to_string(input_count)),
-                                                layout,
-                                                {1});
-
-    topology.add(op_scalar);
-}
-
 void runtime::intelgpu::do_bcast_sum_operation(cldnn::topology& topology,
                                                const string& input_name,
                                                const Shape& input_shape,
@@ -85,37 +37,29 @@ void runtime::intelgpu::do_bcast_sum_operation(cldnn::topology& topology,
                                                const AxisSet& axis,
                                                bool is_bcast)
 {
-    string function_name = is_bcast ? "broadcast" : "sum";
+    string function_name = is_bcast ? "broadcast_" : "sum_";
     function_name += output_name;
     codegen::CodeWriter writer;
+    vector<size_t> gws;
 
-    writer << "__kernel void " << function_name << "(const __global float input"
-           << array_dims(input_shape) << ", __global float output" << array_dims(output_shape)
-           << ")\n";
+    runtime::intelgpu::gen_func_def(
+        writer, function_name, {"float"}, {input_shape}, "float", output_shape);
     writer.block_begin();
     {
         if (is_bcast)
         {
-            size_t var_idx = 0;
-            for (auto const& i : output_shape)
-            {
-                writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
-                       << var_idx << ")\n";
-                writer.block_begin();
-                ++var_idx;
-            }
-            writer << "output" << access_dims(output_shape) << " = input"
+            // Broadcast loops
+            gws = runtime::intelgpu::generate_loops(writer, output_shape, true);
+            writer << "output" << access_dims(output_shape) << " = input0"
                    << access_dims(output_shape, axis) << ";\n";
 
             // Closing brackets for Broadcast loop
-            for (auto const& i : output_shape)
-            {
-                writer.block_end();
-            }
+            runtime::intelgpu::generate_loops(writer, output_shape, false);
         }
         else
         {
+            gws = {1}; // non parallel version
            // Initialize destination output by zeroes
            size_t var_idx = 0;
            for (auto const& i : output_shape)
@@ -144,7 +88,7 @@ void runtime::intelgpu::do_bcast_sum_operation(cldnn::topology& topology,
                ++var_idx;
            }
 
-           writer << "output" << access_dims(input_shape, axis) << " += input"
+           writer << "output" << access_dims(input_shape, axis) << " += input0"
                   << access_dims(input_shape) << ";\n";
 
            // Closing brackets for Sum loop
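
The sum branch stays serial (`gws = {1}`) while broadcast goes parallel. The reason is visible in the line just changed: `access_dims(input_shape, axis)` drops the reduced axes, so many input elements accumulate into the same output element, and concurrent work items would race on the `+=`. A hand-written approximation of the emitted sum kernel, assuming input `Shape{2,3}`, output `Shape{3}`, axis `{0}` (the kernel name stands in for `"sum_" + output_name`):

```cpp
// Assumed generator output, kept as a C++ raw string to mirror how the
// backend builds kernels as text; not captured from a real run.
const char* const sum_kernel_sketch = R"(
__kernel void sum_out(const __global float input0[2][3], __global float output[3])
{
    // Initialize destination output by zeroes
    for (uint i0 = 0; i0 < 3; ++i0)
    {
        output[i0] = 0.0f;
    }
    // i0 runs over the reduced axis, so several iterations update the
    // same output element; that dependence is why gws stays {1}.
    for (uint i0 = 0; i0 < 2; ++i0)
    {
        for (uint i1 = 0; i1 < 3; ++i1)
        {
            output[i1] += input0[i0][i1];
        }
    }
}
)";
```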
@@ -164,7 +108,7 @@ void runtime::intelgpu::do_bcast_sum_operation(cldnn::topology& topology,
                                                 get_kernel_args(1, 1),
                                                 "",
                                                 layout,
-                                                {1});
+                                                gws);
 
     topology.add(op_bcast_sum);
 }
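For contrast, the broadcast path after this commit launches with `gws` equal to the output shape. An approximation of what `gen_func_def` plus `generate_loops` would emit for input `Shape{3}`, output `Shape{2,3}`, axis `{0}` (again assumed, not captured output; the kernel name stands in for `"broadcast_" + output_name`):

```cpp
const char* const broadcast_kernel_sketch = R"(
__kernel void broadcast_out(const __global float input0[3],
                            __global float output[2][3])
{
    const uint i0 = get_global_id(0); // output axis 0, gws[0] == 2
    const uint i1 = get_global_id(1); // output axis 1, gws[1] == 3
    // access_dims(output_shape, axis) drops the broadcast axis {0},
    // so every row reads the same input element: no write conflicts.
    output[i0][i1] = input0[i1];
}
)";
```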
@@ -27,8 +27,9 @@
 {
     namespace intelgpu
     {
-        // This implements Broadcast and Sum nGraph operations
-        // in case of input_shape is not empty
+        // This implements Broadcast and Sum nGraph operations.
+        // input_shape (bcast) or output_shape (sum) can be empty.
+        // If the shape is empty it means scalar
         void do_bcast_sum_operation(cldnn::topology& topology,
                                     const std::string& input_name,
                                     const Shape& input_shape,
@@ -38,16 +39,6 @@ namespace ngraph
                                     const AxisSet& axis,
                                     bool is_bcast);
 
-        // This implements Broadcast and Sum nGraph operations
-        // in case of input_shape is empty
-        void do_bcast_sum_operation_scalar(cldnn::topology& topology,
-                                           const std::string& input_name,
-                                           const Shape& input_shape,
-                                           const std::string& output_name,
-                                           const Shape& output_shape,
-                                           const element::Type& output_type,
-                                           bool is_bcast);
-
         // This implements Min and Max operations depends on is_min parameter
         void do_max_min_operation(cldnn::topology& topology,
                                   const std::string& input_name,
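The updated header comment states that an empty shape denotes a scalar. A quick illustration of that convention under nGraph's shape types (variable names are just for the example):

```cpp
#include "ngraph/axis_set.hpp"
#include "ngraph/shape.hpp"

using namespace ngraph;

int main()
{
    // Broadcast: a rank-0 (empty) input shape expands into every output
    // axis, so all output axes appear in the broadcast AxisSet.
    const Shape scalar_input{};     // empty shape == one element
    const Shape output_shape{2, 3};
    const AxisSet broadcast_axes{0, 1};

    // Sum flips the roles: reducing Shape{2,3} over both axes yields a
    // rank-0 output, the empty output_shape case mentioned in the comment.
    const Shape sum_output{};
    return 0;
}
```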