Commit cae66197 authored by shssf, committed by Robert Kimball

IntelGPU backend: Operation Reduce implemented (#1736)

* IntelGPU backend: Operation Reduce implemented

* PR1736. Style fixed
parent 7ac35345
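
For context, the nGraph Reduce op that this commit lowers takes the tensor to
reduce, a scalar initial value, a one-node reduction Function, and the set of
axes to reduce over. A minimal construction sketch (the exact v0-era
constructor signature is assumed here, for illustration only):

    // Reduction function f(a, b) = a + b
    auto f_a = std::make_shared<op::Parameter>(element::f32, Shape{});
    auto f_b = std::make_shared<op::Parameter>(element::f32, Shape{});
    auto f = std::make_shared<Function>(std::make_shared<op::Add>(f_a, f_b),
                                        op::ParameterVector{f_a, f_b});

    // Sum a 2x3 tensor over axis 1, seeded with `init`, yielding Shape{2}
    auto arg = std::make_shared<op::Parameter>(element::f32, Shape{2, 3});
    auto init = std::make_shared<op::Parameter>(element::f32, Shape{});
    auto reduce = std::make_shared<op::Reduce>(arg, init, f, AxisSet{1});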
@@ -23,6 +23,7 @@ set(SRC
intelgpu_op_custom_kernels.cpp
intelgpu_op_convolution.cpp
intelgpu_op_softmax.cpp
intelgpu_op_custom_func_call.cpp
code_writer.cpp
)
......
@@ -45,6 +45,7 @@
#include "ngraph/runtime/intelgpu/intelgpu_op_batchnorm.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_broadcast.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_convolution.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_custom_func_call.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_softmax.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_tensor_view.hpp"
@@ -69,6 +70,7 @@
#include "ngraph/op/pad.hpp"
#include "ngraph/op/parameter_vector.hpp"
#include "ngraph/op/product.hpp"
#include "ngraph/op/reduce.hpp"
#include "ngraph/op/reshape.hpp"
#include "ngraph/op/reverse.hpp"
#include "ngraph/op/slice.hpp"
@@ -788,6 +790,27 @@ bool runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
topology.add(cldnn_activ_grad);
break;
}
case OP_TYPEID::Reduce:
{
arguments_check(op, 2, 1);
const shared_ptr<op::Reduce> red_op = static_pointer_cast<op::Reduce>(op);
const AxisSet& axis = red_op->get_reduction_axes();
vector<shared_ptr<Function>> func = red_op->get_functions();
// Input 0 is the tensor to reduce; input 1 holds the scalar initial value.
// Even an empty axis set is not a case for do_equal_propagation()
do_reduce_func_call(topology,
get_input_name(op, 0),
get_input_shape(op, 0),
get_input_name(op, 1),
get_input_shape(op, 1),
get_output_name(op),
get_output_shape(op),
get_output_type(op),
axis,
func);
break;
}
case OP_TYPEID::Abs:
{
do_unary_operation(topology, op, activation_abs);
@@ -1350,7 +1373,6 @@ bool runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
case OP_TYPEID::FunctionCall:
case OP_TYPEID::Dequantize:
case OP_TYPEID::Quantize:
case OP_TYPEID::Reduce:
case OP_TYPEID::ReduceWindow:
case OP_TYPEID::ReplaceSlice:
case OP_TYPEID::ReverseSequence:
......
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <CPP/custom_gpu_primitive.hpp>
#include "ngraph/runtime/intelgpu/code_writer.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_layout.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_custom_func_call.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp"
using namespace std;
using namespace ngraph;
// Writes an OpenCL helper function that applies the single-node reduction
// function (Add, Multiply, Maximum, ...) to two scalar operands
static void get_custom_func_name(codegen::CodeWriter& writer,
vector<shared_ptr<Function>>& func,
const string& func_name,
const string& type_name)
{
if (func.size() != 1)
{
throw invalid_argument("IntelGPU Custom_Call operation: expected exactly 1 custom function but got " +
                       to_string(func.size()));
}
writer << type_name << " " << func_name << "(const " << type_name << " input0, const "
<< type_name << " input1)\n";
writer.block_begin();
{
for (const shared_ptr<Node>& op : func.at(0)->get_ordered_ops())
{
if ((op->description() != "Parameter") && (op->description() != "Result"))
{
if (op->description() == "Multiply")
{
writer << "return input0 * input1;\n";
}
else if (op->description() == "Add")
{
writer << "return input0 + input1;\n";
}
else if (op->description() == "Maximum")
{
writer << "return max(input0, input1);\n";
}
else if (op->description() == "Minimum")
{
writer << "return min(input0, input1);\n";
}
else if (op->description() == "And")
{
writer << "return input0 && input1;\n";
}
else if (op->description() == "Or")
{
writer << "return input0 || input1;\n";
}
else if (op->description() == "Equal")
{
writer << "return input0 == input1;\n";
}
else if (op->description() == "NotEqual")
{
writer << "return input0 != input1;\n";
}
else
{
// Emit invalid source so that unsupported ops fail at kernel compile time
writer << "UNIMPLEMENTED_FUNCTION_INTELGPU: " << op->description() << "\n";
}
}
}
} // End of the generated function body
writer.block_end();
}
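// Illustration only: for a reduction Function containing a single Add node and
// float data, the helper above emits OpenCL source along the lines of
// (assuming the output tensor is named "output"):
//
//   float aux_call_output(const float input0, const float input1)
//   {
//       return input0 + input1;
//   }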
void runtime::intelgpu::do_reduce_func_call(cldnn::topology& topology,
const string& input0_name,
const Shape& input0_shape,
const string& input1_name,
const Shape& input1_shape,
const string& output_name,
const Shape& output_shape,
const element::Type& output_type,
const AxisSet& axis,
vector<shared_ptr<Function>>& func)
{
const string entry_point_name = "reduce_func_call_" + output_name;
const string aux_point_name = "aux_call_" + output_name;
const string kernel_type_name = get_opencl_type_name(output_type);
const size_t input_size = shape_size<Shape>(input0_shape);
codegen::CodeWriter writer;
get_custom_func_name(writer, func, aux_point_name, kernel_type_name);
// The kernel name and parameters
gen_func_def(writer,
entry_point_name,
{2, kernel_type_name},
{input0_shape, {1}},
kernel_type_name,
output_shape);
writer.block_begin();
{
// Initialization loop: seed every output element with the initial value (input1)
size_t var_idx = 0;
for (auto const& i : output_shape)
{
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
<< var_idx << ")\n";
writer.block_begin();
++var_idx;
}
writer << "output" << access_dims(output_shape) << " = input1" << access_dims(input1_shape)
<< ";\n";
// Closing brackets for initialization loop
for (auto const& i : output_shape)
{
writer.block_end();
}
// An empty input tensor leaves the output holding just the initial value
if (input_size && !input0_shape.empty())
{
// Main operation loop: fold each input element into the output via the reduction function
var_idx = 0;
for (auto const& i : input0_shape)
{
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
<< var_idx << ")\n";
writer.block_begin();
++var_idx;
}
writer << "output" << access_dims(input0_shape, "i", axis) << " = " << aux_point_name
<< "(output" << access_dims(input0_shape, "i", axis) << ", input0"
<< access_dims(input0_shape) << ");\n";
// Closing brackets for loop
for (auto const& i : input0_shape)
{
writer.block_end();
}
}
} // End of the generated function body
writer.block_end();
const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
const cldnn::custom_gpu_primitive op_reduce(output_name,
                                            {input0_name, input1_name},
                                            {writer.get_code()},
                                            entry_point_name,
                                            get_kernel_args(2, 1),
                                            "",
                                            layout,
                                            {1});
topology.add(op_reduce);
}
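
For reference, a sketch of the whole kernel do_reduce_func_call() generates for
input0 of Shape{2, 3}, a scalar initial value, an Add reduction function, and
reduction over axis 1 (the exact parameter declarations come from gen_func_def()
in intelgpu_op_custom_kernels.cpp and are assumed here):

    float aux_call_output(const float input0, const float input1)
    {
        return input0 + input1;
    }

    __kernel void reduce_func_call_output(const __global float input0[2][3],
                                          const __global float input1[1],
                                          __global float output[2])
    {
        // Initialization loop: seed the output with the initial value
        for (uint i0 = 0; i0 < 2; ++i0)
        {
            output[i0] = input1[0];
        }

        // Main loop: fold every input element into its output slot
        for (uint i0 = 0; i0 < 2; ++i0)
        {
            for (uint i1 = 0; i1 < 3; ++i1)
            {
                output[i0] = aux_call_output(output[i0], input0[i0][i1]);
            }
        }
    }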
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <CPP/topology.hpp>
#include "ngraph/axis_set.hpp"
#include "ngraph/function.hpp"
#include "ngraph/shape.hpp"
namespace ngraph
{
namespace runtime
{
namespace intelgpu
{
void do_reduce_func_call(cldnn::topology& topology,
const std::string& input0_name,
const Shape& input0_shape,
const std::string& input1_name,
const Shape& input1_shape,
const std::string& output_name,
const Shape& output_shape,
const element::Type& output_type,
const AxisSet& axis,
std::vector<std::shared_ptr<Function>>& func);
}
}
}
@@ -26,15 +26,6 @@ quantize
quantize_axes
quantize_clamp
quantize_int8
reduce_3d_to_vector
reduce_matrix_cols_zero
reduce_matrix_columns
reduce_matrix_rows
reduce_matrix_rows_zero
reduce_matrix_to_scalar_zero_by_zero
reduce_to_scalar
reduce_trivial
reduce_vector_zero
reduce_window_emulating_max_pool_1d_1channel_1image
reduce_window_emulating_max_pool_1d_1channel_2image
reduce_window_emulating_max_pool_1d_2channel_2image
......