Commit 92adea38 authored by shssf, committed by Scott Cyphers

IntelGPU backend: Sum and redeveloped Broadcast operation (#1276)

parent cb84305e
@@ -289,6 +289,8 @@ bool runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
const string& output_name = op->get_outputs().begin()->get_tensor().get_name();
const Shape& output_shape = op->get_outputs().begin()->get_shape();
const element::Type& output_type =
op->get_outputs().begin()->get_tensor().get_element_type();
const shared_ptr<op::Broadcast> broadcast = static_pointer_cast<op::Broadcast>(op);
const AxisSet& axis = broadcast->get_broadcast_axes();
@@ -297,10 +299,67 @@ bool runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
{
do_equal_propagation(topology, input_name, output_name);
}
else if (input_shape.empty())
{
do_bcast_sum_operation_scalar(topology,
input_name,
input_shape,
output_name,
output_shape,
output_type,
true);
}
else
{
do_bcast_sum_operation(topology,
input_name,
input_shape,
output_name,
output_shape,
output_type,
axis,
true);
}
}
else if ("Sum" == op->description())
{
arguments_check(op, 1, 1);
const string& input_name = op->get_inputs().begin()->get_tensor().get_name();
const Shape& input_shape = op->get_inputs().begin()->get_shape();
const string& output_name = op->get_outputs().begin()->get_tensor().get_name();
const Shape& output_shape = op->get_outputs().begin()->get_shape();
const element::Type& output_type =
op->get_outputs().begin()->get_tensor().get_element_type();
const shared_ptr<op::Sum> sum = static_pointer_cast<op::Sum>(op);
const AxisSet& axis = sum->get_reduction_axes();
if (axis.empty())
{
do_equal_propagation(topology, input_name, output_name);
}
else if (output_shape.empty())
{
do_bcast_sum_operation_scalar(topology,
input_name,
input_shape,
output_name,
output_shape,
output_type,
false);
}
else
{
do_broadcast_operation(
topology, input_name, input_shape, output_name, output_shape, axis);
do_bcast_sum_operation(topology,
input_name,
input_shape,
output_name,
output_shape,
output_type,
axis,
false);
}
}
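// Illustrative dispatch examples for the two branches above, using assumed shapes
// (not taken from the commit itself):
//   Broadcast: Shape{}    -> Shape{2,3}, axes {0,1} => do_bcast_sum_operation_scalar(..., is_bcast=true)
//   Broadcast: Shape{3}   -> Shape{2,3}, axes {0}   => do_bcast_sum_operation(..., axis={0}, is_bcast=true)
//   Sum:       Shape{2,3} -> Shape{},    axes {0,1} => do_bcast_sum_operation_scalar(..., is_bcast=false)
//   Sum:       Shape{2,3} -> Shape{3},   axes {0}   => do_bcast_sum_operation(..., axis={0}, is_bcast=false)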
else if ("Reshape" == op->description())
......
@@ -15,8 +15,10 @@
*******************************************************************************/
#include <CPP/concatenation.hpp>
#include <CPP/custom_gpu_primitive.hpp>
#include <CPP/reshape.hpp>
#include "ngraph/runtime/intelgpu/code_writer.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_layout.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_broadcast.hpp"
@@ -25,160 +27,150 @@
using namespace std;
using namespace ngraph;
static const string reshape_suf("_reshape");
static vector<cldnn_arg> parameters_1inp_1out = {{arg_input, 0}, {arg_output, 0}};
static Shape propagate_backward(const Shape& input)
static string array_dims(const Shape& dimentions)
{
Shape result({0, 0, 0, 0});
size_t idx = result.size() - 1;
string buffer;
for (auto i = input.crbegin(); i != input.crend(); ++i, --idx)
for (auto const& dim : dimentions)
{
result.at(idx) = *i;
buffer += "[" + to_string(dim) + "]";
}
return result;
return buffer;
}
static Shape propagate_forward(const Shape& input)
static string access_dims(const Shape& dimentions, const AxisSet& axis = {})
{
Shape result({0, 0, 0, 0});
size_t idx = 0;
size_t var_idx = 0;
string buffer;
for (auto i = input.cbegin(); i != input.cend(); ++i, ++idx)
for (auto const& i : dimentions)
{
result.at(idx) = *i;
}
return result;
}
static Shape apply_axis(const Shape& input, const AxisSet& axis)
{
Shape result = input;
for (auto const& i : axis)
if (axis.find(var_idx) == axis.end())
{
result.at(i) = 0;
buffer += "[i" + to_string(var_idx) + "]";
}
++var_idx;
}
return result;
return buffer;
}
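// Illustrative expected output of the two helpers above, for assumed example shapes:
//   array_dims(Shape{2, 3, 4})              -> "[2][3][4]"    (C-style array declarator)
//   access_dims(Shape{2, 3, 4})             -> "[i0][i1][i2]" (indexing with loop variables)
//   access_dims(Shape{2, 3, 4}, AxisSet{1}) -> "[i0][i2]"     (axes in the set are skipped)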
// This function broadcasts the input data to all other dimensions of the output.
// It operates in only two modes (controlled by the is_forward flag):
// [forward]: propagate data from left to right in Shape array terms
// in[2], out[2,3,4,5], axis[1,2,3]
// [backward]: propagate data from right to left in Shape array terms
// in[5], out[2,3,4,5], axis[0,1,2]
// Input and output shapes can have up to 4 dimensions.
// Other variants, like in[4] out[2,3,4,5] axis[0,1,3], are not supported yet.
static void do_propagation(cldnn::topology& topology,
void runtime::intelgpu::do_bcast_sum_operation_scalar(cldnn::topology& topology,
const string& input_name,
const Shape& input_shape,
const string& output_name,
const Shape& output_shape,
const AxisSet& axis,
bool is_forward)
const element::Type& output_type,
bool is_bcast)
{
//default value used in "forward" mode
cldnn::concatenation::concatenation_axis direction =
runtime::intelgpu::IntelGPULayout::get_cldnn_axis(3);
string input_name_it = input_name;
string output_name_it = output_name;
Shape input_shape_it = input_shape;
for (auto axis_id = axis.crbegin(); axis_id != axis.crend();)
{
const size_t input_count = output_shape.at(*axis_id);
if (is_forward)
const string function_name = is_bcast ? "broadcast_scalar" : "sum_scalar";
const size_t input_count =
is_bcast ? shape_size<Shape>(output_shape) : shape_size<Shape>(input_shape);
codegen::CodeWriter writer;
writer << "__kernel void " << function_name
<< "(const __global float* input, __global float* output)\n";
writer.block_begin();
{
input_shape_it.push_back(1);
const cldnn::tensor my_tensor =
runtime::intelgpu::IntelGPULayout::create_cldnn_tensor(input_shape_it);
writer << "float sum = 0.f;\n"
<< "for (uint i = 0; i < COUNT; ++i)\n";
writer.block_begin();
const cldnn::reshape op_reshape(input_name_it + reshape_suf, input_name_it, my_tensor);
topology.add(op_reshape);
input_shape_it.back() = input_count;
input_name_it += reshape_suf;
}
else
{
direction = runtime::intelgpu::IntelGPULayout::get_cldnn_axis(*axis_id);
}
const vector<cldnn::primitive_id> input_names(input_count, input_name_it);
++axis_id;
if (axis_id == axis.crend())
if (is_bcast)
{
output_name_it = output_name;
writer << "output[i] = input[0];\n";
writer.block_end();
}
else
{
output_name_it += ":_";
input_name_it = output_name_it;
writer << "sum += input[i];\n";
writer.block_end();
writer << "output[0] = sum;\n";
}
const cldnn::concatenation op_concat(output_name_it, input_names, direction);
topology.add(op_concat);
}
}
// Assumes the input is a scalar. All output data will be populated with that scalar.
// This function is extremely suboptimal from a performance perspective.
static void do_scalar_propagation(cldnn::topology& topology,
const string& input_name,
const string& output_name,
const Shape& output_shape)
{
const size_t input_count = shape_size<const Shape>(output_shape);
const vector<cldnn::primitive_id> input_names(input_count, input_name);
const cldnn::concatenation op_concat(output_name, input_names, cldnn::concatenation::along_x);
topology.add(op_concat);
} // End of function bracket
writer.block_end();
const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
const cldnn::custom_gpu_primitive op_scalar(output_name,
{input_name},
{writer.get_code()},
function_name,
parameters_1inp_1out,
string("-DCOUNT=" + to_string(input_count)),
layout);
topology.add(op_scalar);
}
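// Illustrative example (assumed shapes, reconstructed from the writer calls above):
// for a scalar input broadcast to output_shape {2, 3}, the generated OpenCL source is
// expected to look like the following, built with the extra option "-DCOUNT=6":
//
//     __kernel void broadcast_scalar(const __global float* input, __global float* output)
//     {
//         float sum = 0.f;
//         for (uint i = 0; i < COUNT; ++i)
//         {
//             output[i] = input[0];
//         }
//     }
//
// With is_bcast == false, the same skeleton instead accumulates "sum += input[i];" in the
// loop and writes "output[0] = sum;" after it, with COUNT set to shape_size(input_shape).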
void runtime::intelgpu::do_broadcast_operation(cldnn::topology& topology,
void runtime::intelgpu::do_bcast_sum_operation(cldnn::topology& topology,
const string& input_name,
const Shape& input_shape,
const string& output_name,
const Shape& output_shape,
const AxisSet& axis)
const element::Type& output_type,
const AxisSet& axis,
bool is_bcast)
{
if (input_shape.size() > 4 || output_shape.size() > 4)
{
throw invalid_argument("IntelGPU::Broadcast supports 4D shapes maximum.");
}
const string function_name = is_bcast ? "broadcast" : "sum";
codegen::CodeWriter writer;
if (input_shape.empty())
{
do_scalar_propagation(topology, input_name, output_name, output_shape);
writer << "__kernel void " << function_name << "(const __global float input"
<< array_dims(input_shape) << ", __global float output" << array_dims(output_shape)
<< ")\n";
return;
writer.block_begin();
{
if (is_bcast)
{
size_t var_idx = 0;
for (auto const& i : output_shape)
{
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
<< var_idx << ")\n";
writer.block_begin();
++var_idx;
}
writer << "output" << access_dims(output_shape) << " = input"
<< access_dims(output_shape, axis) << ";\n";
const Shape output_shape_axis = apply_axis(output_shape, axis);
const Shape input_shape_forward = propagate_forward(input_shape);
const Shape output_shape_forward = propagate_forward(output_shape_axis);
const Shape input_shape_backward = propagate_backward(input_shape);
const Shape output_shape_backward = propagate_backward(output_shape_axis);
if (input_shape_forward == output_shape_forward)
// Closing brackets for Broadcast loop
for (auto const& i : output_shape)
{
do_propagation(topology, input_name, input_shape, output_name, output_shape, axis, true);
writer.block_end();
}
else if (input_shape_backward == output_shape_backward)
{
do_propagation(topology, input_name, input_shape, output_name, output_shape, axis, false);
}
else
{
ostringstream os;
os << "IntelGPU::Broadcast unsupported mode. input" << vector_to_string(input_shape)
<< " output" << vector_to_string(output_shape) << " axis" << vector_to_string(axis);
throw invalid_argument(os.str());
size_t var_idx = 0;
for (auto const& i : input_shape)
{
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
<< var_idx << ")\n";
writer.block_begin();
++var_idx;
}
writer << "output" << access_dims(input_shape, axis) << " += input"
<< access_dims(input_shape) << ";\n";
// Closing brackets for Sum loop
for (auto const& i : input_shape)
{
writer.block_end();
}
}
} // End of function bracket
writer.block_end();
const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
const cldnn::custom_gpu_primitive op_bcast_sum(output_name,
{input_name},
{writer.get_code()},
function_name,
parameters_1inp_1out,
"",
layout);
topology.add(op_bcast_sum);
}
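// Illustrative example (assumed shapes, reconstructed from the writer calls above):
// for a Broadcast of input_shape {3} into output_shape {2, 3} with axis {0}, the
// generated OpenCL source is expected to look roughly like:
//
//     __kernel void broadcast(const __global float input[3], __global float output[2][3])
//     {
//         for (uint i0 = 0; i0 < 2; ++i0)
//         {
//             for (uint i1 = 0; i1 < 3; ++i1)
//             {
//                 output[i0][i1] = input[i1];
//             }
//         }
//     }
//
// For a Sum over axis {0} of input_shape {2, 3} (output_shape {3}), the loops instead run
// over input_shape and the body becomes "output[i1] += input[i0][i1];".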
@@ -27,13 +27,26 @@ namespace ngraph
{
namespace intelgpu
{
// This implements Broadcast nGraph operation
void do_broadcast_operation(cldnn::topology& topology,
// This implements Broadcast and Sum nGraph operations
// in the case where input_shape is not empty
void do_bcast_sum_operation(cldnn::topology& topology,
const std::string& input_name,
const Shape& input_shape,
const std::string& output_name,
const Shape& output_shape,
const AxisSet& axis);
const element::Type& output_type,
const AxisSet& axis,
bool is_bcast);
// This implements Broadcast and Sum nGraph operations
// in the case where input_shape is empty
void do_bcast_sum_operation_scalar(cldnn::topology& topology,
const std::string& input_name,
const Shape& input_shape,
const std::string& output_name,
const Shape& output_shape,
const element::Type& output_type,
bool is_bcast);
}
}
}