IntelGPU backend: Custom kernels refactoring 2 (#2770)

ab9fad24 · Sergey Shalnov · Robert Kimball · 6f0c8190 · ab9fad24 · ab9fad24
Commit ab9fad24 authored Apr 18, 2019 by Sergey Shalnov Committed by Robert Kimball Apr 18, 2019
8 changed files
--- a/src/ngraph/runtime/intelgpu/intelgpu_backend.cpp
+++ b/src/ngraph/runtime/intelgpu/intelgpu_backend.cpp
@@ -53,10 +53,7 @@
 #include "ngraph/runtime/intelgpu/intelgpu_kernels.hpp"
 #include "ngraph/runtime/intelgpu/intelgpu_layout.hpp"
 #include "ngraph/runtime/intelgpu/intelgpu_op_batchnorm.hpp"
-#include "ngraph/runtime/intelgpu/intelgpu_op_broadcast.hpp"
-#include "ngraph/runtime/intelgpu/intelgpu_op_custom_func_call.hpp"
 #include "ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp"
-#include "ngraph/runtime/intelgpu/intelgpu_op_softmax.hpp"
 #include "ngraph/runtime/intelgpu/intelgpu_tensor_view.hpp"
 #include "ngraph/runtime/intelgpu/visualize_tree.hpp"

@@ -671,14 +668,7 @@ shared_ptr<runtime::Executable>
            if ((shape_dim_count > 3) || ((shape_dim_count == 3) && (axes_size == 2)) ||
                (op->get_input_element_type(0) != element::f32))
            {
-                do_softmax_operation(topology,
-                                     op->get_input_tensor_name(0),
-                                     op->get_input_shape(0),
-                                     op->get_input_element_type(0),
-                                     op->get_output_tensor_name(0),
-                                     op->get_output_shape(0),
-                                     op->get_output_element_type(0),
-                                     axes);
+                kern.emit<op::Softmax>(softmax_op);
            }
            else
            {
@@ -979,15 +969,7 @@ shared_ptr<runtime::Executable>
            }
            else
            {
-                do_bcast_sum_operation(topology,
-                                       op->get_input_tensor_name(0),
-                                       op->get_input_shape(0),
-                                       op->get_input_element_type(0),
-                                       op->get_output_tensor_name(0),
-                                       op->get_output_shape(0),
-                                       op->get_output_element_type(0),
-                                       axis,
-                                       true);
+                kern.emit<op::Broadcast>(broadcast);
            }
            break;
        }
@@ -1005,15 +987,7 @@ shared_ptr<runtime::Executable>
            }
            else
            {
-                do_bcast_sum_operation(topology,
-                                       op->get_input_tensor_name(0),
-                                       op->get_input_shape(0),
-                                       op->get_input_element_type(0),
-                                       op->get_output_tensor_name(0),
-                                       op->get_output_shape(0),
-                                       op->get_output_element_type(0),
-                                       axis,
-                                       false);
+                kern.emit<op::Sum>(sum);
            }
            break;
        }
@@ -1031,13 +1005,7 @@ shared_ptr<runtime::Executable>
            }
            else
            {
-                do_product_operation(topology,
-                                     op->get_input_tensor_name(0),
-                                     op->get_input_shape(0),
-                                     op->get_output_tensor_name(0),
-                                     op->get_output_shape(0),
-                                     op->get_output_element_type(0),
-                                     axis);
+                kern.emit<op::Product>(prod);
            }
            break;
        }
@@ -1098,44 +1066,16 @@ shared_ptr<runtime::Executable>
        {
            arguments_check(op, 1, 1);

-            const shared_ptr<op::All> all_op = static_pointer_cast<op::All>(op);
-            const AxisSet& axis = all_op->get_reduction_axes();
-            const shared_ptr<Node> def_val = all_op->get_default_value();
-            const shared_ptr<op::Constant> def_const = static_pointer_cast<op::Constant>(def_val);
-            const vector<std::string>& values = def_const->get_value_strings();
-
            // Empty axis is not a case for do_equal_propagation()
-            do_all_any_op(topology,
-                          op->get_input_tensor_name(0),
-                          op->get_input_shape(0),
-                          op->get_output_tensor_name(0),
-                          op->get_output_shape(0),
-                          op->get_output_element_type(0),
-                          axis,
-                          "lhs && rhs",
-                          values.at(0));
+            kern.emit<op::All>(static_pointer_cast<op::All>(op));
            break;
        }
        case OP_TYPEID::Any:
        {
            arguments_check(op, 1, 1);

-            const shared_ptr<op::Any> any_op = static_pointer_cast<op::Any>(op);
-            const AxisSet& axis = any_op->get_reduction_axes();
-            const shared_ptr<Node> def_val = any_op->get_default_value();
-            const shared_ptr<op::Constant> def_const = static_pointer_cast<op::Constant>(def_val);
-            const vector<std::string>& values = def_const->get_value_strings();
-
            // Empty axis is not a case for do_equal_propagation()
-            do_all_any_op(topology,
-                          op->get_input_tensor_name(0),
-                          op->get_input_shape(0),
-                          op->get_output_tensor_name(0),
-                          op->get_output_shape(0),
-                          op->get_output_element_type(0),
-                          axis,
-                          "lhs || rhs",
-                          values.at(0));
+            kern.emit<op::Any>(static_pointer_cast<op::Any>(op));
            break;
        }
        case OP_TYPEID::ReluBackprop:
@@ -1788,34 +1728,14 @@ shared_ptr<runtime::Executable>
        {
            arguments_check(op, 1, 1);

-            const shared_ptr<op::Min> min_op = static_pointer_cast<op::Min>(op);
-            const AxisSet& axis = min_op->get_reduction_axes();
-
-            do_max_min_operation(topology,
-                                 op->get_input_tensor_name(0),
-                                 op->get_input_shape(0),
-                                 op->get_output_tensor_name(0),
-                                 op->get_output_shape(0),
-                                 op->get_output_element_type(0),
-                                 axis,
-                                 true);
+            kern.emit<op::Min>(static_pointer_cast<op::Min>(op));
            break;
        }
        case OP_TYPEID::Max:
        {
            arguments_check(op, 1, 1);

-            const shared_ptr<op::Max> max_op = static_pointer_cast<op::Max>(op);
-            const AxisSet& axis = max_op->get_reduction_axes();
-
-            do_max_min_operation(topology,
-                                 op->get_input_tensor_name(0),
-                                 op->get_input_shape(0),
-                                 op->get_output_tensor_name(0),
-                                 op->get_output_shape(0),
-                                 op->get_output_element_type(0),
-                                 axis,
-                                 false);
+            kern.emit<op::Max>(static_pointer_cast<op::Max>(op));
            break;
        }
        case OP_TYPEID::OneHot:

--- a/src/ngraph/runtime/intelgpu/intelgpu_kernels.hpp
+++ b/src/ngraph/runtime/intelgpu/intelgpu_kernels.hpp
@@ -23,9 +23,17 @@
 #include <CPP/topology.hpp>

 #include "ngraph/node.hpp"
+#include "ngraph/op/all.hpp"
+#include "ngraph/op/any.hpp"
+#include "ngraph/op/broadcast.hpp"
 #include "ngraph/op/convolution.hpp"
+#include "ngraph/op/max.hpp"
+#include "ngraph/op/min.hpp"
+#include "ngraph/op/product.hpp"
 #include "ngraph/op/select.hpp"
 #include "ngraph/op/slice.hpp"
+#include "ngraph/op/softmax.hpp"
+#include "ngraph/op/sum.hpp"

 namespace ngraph
 {
@@ -98,11 +106,19 @@ public:
 private:
    void queue_krnl(const krnl_info& krn_info, const std::shared_ptr<Node>& op);

+    krnl_info build_krnl(const std::shared_ptr<op::All>& op) const;
+    krnl_info build_krnl(const std::shared_ptr<op::Any>& op) const;
+    krnl_info build_krnl(const std::shared_ptr<op::Broadcast>& op) const;
    krnl_info build_krnl(const std::shared_ptr<op::Convolution>& op) const;
    krnl_info build_krnl(const std::shared_ptr<op::ConvolutionBackpropData>& op) const;
    krnl_info build_krnl(const std::shared_ptr<op::ConvolutionBackpropFilters>& op) const;
+    krnl_info build_krnl(const std::shared_ptr<op::Max>& op) const;
+    krnl_info build_krnl(const std::shared_ptr<op::Min>& op) const;
+    krnl_info build_krnl(const std::shared_ptr<op::Product>& op) const;
    krnl_info build_krnl(const std::shared_ptr<op::Select>& op) const;
    krnl_info build_krnl(const std::shared_ptr<op::Slice>& op) const;
+    krnl_info build_krnl(const std::shared_ptr<op::Softmax>& op) const;
+    krnl_info build_krnl(const std::shared_ptr<op::Sum>& op) const;

    cldnn::topology& stream;
    size_t m_count_krnls;

--- a/src/ngraph/runtime/intelgpu/intelgpu_op_broadcast.cpp
+++ b/src/ngraph/runtime/intelgpu/intelgpu_op_broadcast.cpp
@@ -14,32 +14,27 @@
 // limitations under the License.
 //*****************************************************************************

-#include <CPP/concatenation.hpp>
-#include <CPP/custom_gpu_primitive.hpp>
-#include <CPP/reshape.hpp>
-
 #include "ngraph/code_writer.hpp"
-#include "ngraph/runtime/intelgpu/intelgpu_layout.hpp"
-#include "ngraph/runtime/intelgpu/intelgpu_op_broadcast.hpp"
+#include "ngraph/runtime/intelgpu/intelgpu_kernels.hpp"
 #include "ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp"

 #include "ngraph/util.hpp"

 using namespace std;
 using namespace ngraph;
-
-static void do_sum_to_scalar_operation(cldnn::topology& topology,
-                                       const string& input_name,
-                                       const Shape& input_shape,
-                                       const element::Type& input_type,
-                                       const string& output_name,
-                                       const Shape& output_shape,
-                                       const element::Type& output_type,
-                                       const AxisSet& axis)
+using namespace ngraph::runtime::intelgpu;
+
+static CustomKernels::krnl_info do_sum_to_scalar_operation(const string& input_name,
+                                                           const Shape& input_shape,
+                                                           const element::Type& input_type,
+                                                           const string& output_name,
+                                                           const Shape& output_shape,
+                                                           const element::Type& output_type,
+                                                           const AxisSet& axis)
 {
    const string function_name = "sum_to_scalar_" + output_name;
-    const string input_type_str = runtime::intelgpu::get_opencl_type_name(input_type);
-    const string output_type_str = runtime::intelgpu::get_opencl_type_name(output_type);
+    const string input_type_str = get_opencl_type_name(input_type);
+    const string output_type_str = get_opencl_type_name(output_type);
    const size_t main_loop_count = shape_size(input_shape);
    const size_t vect_channels = 32;
    CodeWriter writer;
@@ -53,10 +48,8 @@ static void do_sum_to_scalar_operation(cldnn::topology& topology,
    writer.block_begin();
    { // Main function body

-        writer << "//  input array dims: input0" << runtime::intelgpu::array_dims(input_shape)
-               << "\n"
-               << "// output array dims: output" << runtime::intelgpu::array_dims(output_shape)
-               << "\n"
+        writer << "//  input array dims: input0" << array_dims(input_shape) << "\n"
+               << "// output array dims: output" << array_dims(output_shape) << "\n"
               << output_type_str << " result = 0.0f;\n"
               << "const uint id = get_sub_group_local_id();\n"
               << "uint element_id = id;\n"
@@ -66,82 +59,80 @@ static void do_sum_to_scalar_operation(cldnn::topology& topology,
        {
            writer << "result += input0[element_id];\n"
                   << "element_id += " << vect_channels << ";\n";
-            writer.block_end();
+        }
+        writer.block_end();

-            writer << "if (element_id < " << main_loop_count << ")\n";
-            writer.block_begin();
-            {
-                writer << "result += input0[element_id];\n";
-            }
-            writer.block_end();
+        writer << "if (element_id < " << main_loop_count << ")\n";
+        writer.block_begin();
+        {
+            writer << "result += input0[element_id];\n";
+        }
+        writer.block_end();

-            writer << output_type_str << " sub_group_result = sub_group_reduce_add(result);\n";
+        writer << output_type_str << " sub_group_result = sub_group_reduce_add(result);\n";

-            writer << "if (id == 0)\n";
-            writer.block_begin();
-            {
-                writer << "*output = sub_group_result;\n";
-            }
-            writer.block_end();
-        } // End of function bracket
+        writer << "if (id == 0)\n";
+        writer.block_begin();
+        {
+            writer << "*output = sub_group_result;\n";
+        }
        writer.block_end();
+    } // End of function bracket
+    writer.block_end();

-        const cldnn::layout layout =
-            runtime::intelgpu::IntelGPULayout::create_cldnn_layout(output_type, output_shape);
-        const cldnn::custom_gpu_primitive op_bcast_sum(output_name,
-                                                       {input_name},
-                                                       {writer.get_code()},
-                                                       function_name,
-                                                       runtime::intelgpu::get_kernel_args(1, 1),
-                                                       "",
-                                                       layout,
-                                                       gws,
-                                                       lws);
-        topology.add(op_bcast_sum);
-    }
+    const CustomKernelInfo op_bcast_sum(output_name,
+                                        output_shape,
+                                        output_type,
+                                        {input_name},
+                                        {writer.get_code()},
+                                        function_name,
+                                        gws,
+                                        lws);
+    return {op_bcast_sum};
 }

-void runtime::intelgpu::do_bcast_sum_operation(cldnn::topology& topology,
-                                               const string& input_name,
-                                               const Shape& input_shape,
-                                               const element::Type& input_type,
-                                               const string& output_name,
-                                               const Shape& output_shape,
-                                               const element::Type& output_type,
-                                               const AxisSet& axis,
-                                               bool is_bcast)
+// This implements Broadcast and Sum nGraph operations.
+// input_shape (bcast) or output_shape (sum) can be empty.
+// An empty shape denotes a scalar.
+static CustomKernels::krnl_info
+    do_bcast_sum_operation(const shared_ptr<Node>& op, const AxisSet& axis, bool is_bcast)
 {
+    const string& input_name = op->get_input_tensor_name(0);
+    const Shape& input_shape = op->get_input_shape(0);
+    const element::Type& input_type = op->get_input_element_type(0);
+    const string& output_name = op->get_output_tensor_name(0);
+    const Shape& output_shape = op->get_output_shape(0);
+    const element::Type& output_type = op->get_output_element_type(0);
    string function_name = is_bcast ? "broadcast_" : "sum_";
    function_name += output_name;
    CodeWriter writer;
    vector<size_t> gws;

-    runtime::intelgpu::gen_func_def(writer,
-                                    function_name,
-                                    {get_opencl_type_name(input_type)},
-                                    {input_shape},
-                                    get_opencl_type_name(output_type),
-                                    output_shape);
+    gen_func_def(writer,
+                 function_name,
+                 {get_opencl_type_name(input_type)},
+                 {input_shape},
+                 get_opencl_type_name(output_type),
+                 output_shape);
    writer.block_begin();
    {
        if (is_bcast)
        {
            // Broadcast loops
-            gws = runtime::intelgpu::generate_loops(writer, output_shape, true);
+            gws = generate_loops(writer, output_shape, true);

            writer << "output" << access_dims(output_shape) << " = input0"
                   << access_dims(output_shape, "i", axis) << ";\n";

            // Closing brackets for Broadcast loop
-            runtime::intelgpu::generate_loops(writer, output_shape, false);
+            generate_loops(writer, output_shape, false);
        }
        else
        {
            // corner case with scalar
            if (output_shape.empty() || (!output_shape.empty() && (shape_size(output_shape) == 1)))
            {
-                return do_sum_to_scalar_operation(topology,
-                                                  input_name,
+                return do_sum_to_scalar_operation(input_name,
                                                  input_shape,
                                                  input_type,
                                                  output_name,
@@ -171,39 +162,38 @@ void runtime::intelgpu::do_bcast_sum_operation(cldnn::topology& topology,
    } // End of function bracket
    writer.block_end();

-    const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
-    const cldnn::custom_gpu_primitive op_bcast_sum(output_name,
-                                                   {input_name},
-                                                   {writer.get_code()},
-                                                   function_name,
-                                                   get_kernel_args(1, 1),
-                                                   "",
-                                                   layout,
-                                                   gws);
-    topology.add(op_bcast_sum);
+    const CustomKernelInfo op_bcast_sum(output_name,
+                                        output_shape,
+                                        output_type,
+                                        {input_name},
+                                        {writer.get_code()},
+                                        function_name,
+                                        gws);
+    return {op_bcast_sum};
 }

-void runtime::intelgpu::do_max_min_operation(cldnn::topology& topology,
-                                             const string& input_name,
-                                             const Shape& input_shape,
-                                             const string& output_name,
-                                             const Shape& output_shape,
-                                             const element::Type& output_type,
-                                             const AxisSet& axis,
-                                             bool is_min)
+// This implements the Min and Max operations, selected by the is_min parameter
+static CustomKernels::krnl_info
+    do_max_min_operation(const shared_ptr<op::util::ArithmeticReduction>& op, bool is_min)
 {
+    const string& input_name = op->get_input_tensor_name(0);
+    const Shape& input_shape = op->get_input_shape(0);
+    const string& output_name = op->get_output_tensor_name(0);
+    const Shape& output_shape = op->get_output_shape(0);
+    const element::Type& output_type = op->get_output_element_type(0);
+    const AxisSet& axis = op->get_reduction_axes();
    const string function_name = "min_max_" + output_name;
    const size_t input_size = shape_size<Shape>(input_shape);
    const string& init_value = get_opencl_type_min_max_value(output_type, !is_min);
    const string& operation = is_min ? " < " : " > ";
    CodeWriter writer;

-    runtime::intelgpu::gen_func_def(writer,
-                                    function_name,
-                                    {get_opencl_type_name(output_type)},
-                                    {input_shape},
-                                    get_opencl_type_name(output_type),
-                                    output_shape);
+    gen_func_def(writer,
+                 function_name,
+                 {get_opencl_type_name(output_type)},
+                 {input_shape},
+                 get_opencl_type_name(output_type),
+                 output_shape);

    writer.block_begin();
    {
@@ -255,36 +245,29 @@ void runtime::intelgpu::do_max_min_operation(cldnn::topology& topology,
    } // End of function bracket
    writer.block_end();

-    const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
-    const cldnn::custom_gpu_primitive op_min_max(output_name,
-                                                 {input_name},
-                                                 {writer.get_code()},
-                                                 function_name,
-                                                 get_kernel_args(1, 1),
-                                                 "",
-                                                 layout,
-                                                 {1});
-    topology.add(op_min_max);
+    const CustomKernelInfo op_min_max(
+        output_name, output_shape, output_type, {input_name}, {writer.get_code()}, function_name);
+    return {op_min_max};
 }

-void runtime::intelgpu::do_product_operation(cldnn::topology& topology,
-                                             const string& input_name,
-                                             const Shape& input_shape,
-                                             const string& output_name,
-                                             const Shape& output_shape,
-                                             const element::Type& output_type,
-                                             const AxisSet& axis)
+CustomKernels::krnl_info CustomKernels::build_krnl(const shared_ptr<op::Product>& op) const
 {
+    const string& input_name = op->get_input_tensor_name(0);
+    const Shape& input_shape = op->get_input_shape(0);
+    const string& output_name = op->get_output_tensor_name(0);
+    const Shape& output_shape = op->get_output_shape(0);
+    const element::Type& output_type = op->get_output_element_type(0);
+    const AxisSet& axis = op->get_reduction_axes();
    const string function_name = "product_" + output_name;
    const size_t input_size = shape_size<Shape>(input_shape);
    CodeWriter writer;

-    runtime::intelgpu::gen_func_def(writer,
-                                    function_name,
-                                    {get_opencl_type_name(output_type)},
-                                    {input_shape},
-                                    get_opencl_type_name(output_type),
-                                    output_shape);
+    gen_func_def(writer,
+                 function_name,
+                 {get_opencl_type_name(output_type)},
+                 {input_shape},
+                 get_opencl_type_name(output_type),
+                 output_shape);

    writer.block_begin();
    {
@@ -330,14 +313,27 @@ void runtime::intelgpu::do_product_operation(cldnn::topology& topology,
    } // End of function bracket
    writer.block_end();

-    const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
-    const cldnn::custom_gpu_primitive op_product(output_name,
-                                                 {input_name},
-                                                 {writer.get_code()},
-                                                 function_name,
-                                                 get_kernel_args(1, 1),
-                                                 "",
-                                                 layout,
-                                                 {1});
-    topology.add(op_product);
+    const CustomKernelInfo op_product(
+        output_name, output_shape, output_type, {input_name}, {writer.get_code()}, function_name);
+    return {op_product};
+}
+
+CustomKernels::krnl_info CustomKernels::build_krnl(const shared_ptr<op::Broadcast>& op) const
+{
+    return do_bcast_sum_operation(op, op->get_broadcast_axes(), true); // is_bcast = true: Broadcast path
+}
+
+CustomKernels::krnl_info CustomKernels::build_krnl(const shared_ptr<op::Sum>& op) const
+{
+    return do_bcast_sum_operation(op, op->get_reduction_axes(), false); // is_bcast = false: Sum path
+}
+
+CustomKernels::krnl_info CustomKernels::build_krnl(const shared_ptr<op::Max>& op) const
+{
+    return do_max_min_operation(op, false); // is_min = false: Max
+}
+
+CustomKernels::krnl_info CustomKernels::build_krnl(const shared_ptr<op::Min>& op) const
+{
+    return do_max_min_operation(op, true); // is_min = true: Min
 }
--- a/src/ngraph/runtime/intelgpu/intelgpu_op_broadcast.hpp
+++ b/src/ngraph/runtime/intelgpu/intelgpu_op_broadcast.hpp
-//*****************************************************************************
-// Copyright 2017-2019 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//*****************************************************************************
-
-#pragma once
-
-#include <CPP/topology.hpp>
-
-#include "ngraph/axis_set.hpp"
-#include "ngraph/shape.hpp"
-
-namespace ngraph
-{
-    namespace runtime
-    {
-        namespace intelgpu
-        {
-            // This implements Broadcast and Sum nGraph operations.
-            // input_shape (bcast) or output_shape (sum) can be empty.
-            // If the shape is empty it means scalar
-            void do_bcast_sum_operation(cldnn::topology& topology,
-                                        const std::string& input_name,
-                                        const Shape& input_shape,
-                                        const element::Type& input_type,
-                                        const std::string& output_name,
-                                        const Shape& output_shape,
-                                        const element::Type& output_type,
-                                        const AxisSet& axis,
-                                        bool is_bcast);
-
-            // This implements Min and Max operations depends on is_min parameter
-            void do_max_min_operation(cldnn::topology& topology,
-                                      const std::string& input_name,
-                                      const Shape& input_shape,
-                                      const std::string& output_name,
-                                      const Shape& output_shape,
-                                      const element::Type& output_type,
-                                      const AxisSet& axis,
-                                      bool is_min);
-
-            // This implements Product operation
-            void do_product_operation(cldnn::topology& topology,
-                                      const std::string& input_name,
-                                      const Shape& input_shape,
-                                      const std::string& output_name,
-                                      const Shape& output_shape,
-                                      const element::Type& output_type,
-                                      const AxisSet& axis);
-        }
-    }
-}
--- a/src/ngraph/runtime/intelgpu/intelgpu_op_custom_func_call.cpp
+++ b/src/ngraph/runtime/intelgpu/intelgpu_op_custom_func_call.cpp
@@ -14,26 +14,29 @@
 // limitations under the License.
 //*****************************************************************************

-#include <CPP/custom_gpu_primitive.hpp>
-
 #include "ngraph/code_writer.hpp"
-#include "ngraph/runtime/intelgpu/intelgpu_layout.hpp"
-#include "ngraph/runtime/intelgpu/intelgpu_op_custom_func_call.hpp"
+#include "ngraph/runtime/intelgpu/intelgpu_kernels.hpp"
 #include "ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp"

+#include "ngraph/op/constant.hpp"
+
 using namespace std;
 using namespace ngraph;
+using namespace ngraph::runtime::intelgpu;

-void runtime::intelgpu::do_all_any_op(cldnn::topology& topology,
-                                      const string& input0_name,
-                                      const Shape& input0_shape,
-                                      const string& output_name,
-                                      const Shape& output_shape,
-                                      const element::Type& output_type,
-                                      const AxisSet& axis,
-                                      const std::string& operation,
-                                      const std::string& init_val)
+static CustomKernels::krnl_info do_all_any_op(const shared_ptr<op::util::LogicalReduction>& op,
+                                              const string& operation)
 {
+    const string& input0_name = op->get_input_tensor_name(0);
+    const Shape& input0_shape = op->get_input_shape(0);
+    const string& output_name = op->get_output_tensor_name(0);
+    const Shape& output_shape = op->get_output_shape(0);
+    const element::Type& output_type = op->get_output_element_type(0);
+    const AxisSet& axis = op->get_reduction_axes();
+    const shared_ptr<Node> def_val = op->get_default_value();
+    const shared_ptr<op::Constant> def_const = static_pointer_cast<op::Constant>(def_val);
+    const vector<string>& values = def_const->get_value_strings();
+    const string& init_val = values.at(0);
    const string entry_point_name = "custom_op_all_any_" + output_name;
    const string kernel_type_name = get_opencl_type_name(output_type);
    const size_t input_size = shape_size<Shape>(input0_shape);
@@ -94,14 +97,21 @@ void runtime::intelgpu::do_all_any_op(cldnn::topology& topology,
    } // End of function bracket
    writer.block_end();

-    const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
-    const cldnn::custom_gpu_primitive op_all_any(output_name,
-                                                 {input0_name},
-                                                 {writer.get_code()},
-                                                 entry_point_name,
-                                                 get_kernel_args(1, 1),
-                                                 "",
-                                                 layout,
-                                                 {1});
-    topology.add(op_all_any);
+    const CustomKernelInfo krn_ret(output_name,
+                                   output_shape,
+                                   output_type,
+                                   {input0_name},
+                                   {writer.get_code()},
+                                   entry_point_name);
+    return {krn_ret};
+}
+
+CustomKernels::krnl_info CustomKernels::build_krnl(const shared_ptr<op::All>& op) const
+{ // Builds the krnl_info for op::All by delegating to the file-local do_all_any_op helper.
+    return do_all_any_op(op, "lhs && rhs"); // passed as `operation`; presumably the combining expression of the generated kernel (use site is outside this hunk)
+}
+
+CustomKernels::krnl_info CustomKernels::build_krnl(const shared_ptr<op::Any>& op) const
+{ // Builds the krnl_info for op::Any by delegating to the file-local do_all_any_op helper.
+    return do_all_any_op(op, "lhs || rhs"); // passed as `operation`; presumably the combining expression of the generated kernel (use site is outside this hunk)
 }
--- a/src/ngraph/runtime/intelgpu/intelgpu_op_custom_func_call.hpp
+++ b/src/ngraph/runtime/intelgpu/intelgpu_op_custom_func_call.hpp
-//*****************************************************************************
-// Copyright 2017-2019 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//*****************************************************************************
-
-#pragma once
-
-#include <CPP/topology.hpp>
-
-#include "ngraph/axis_set.hpp"
-#include "ngraph/shape.hpp"
-
-namespace ngraph
-{
-    namespace runtime
-    {
-        namespace intelgpu
-        {
-            void do_all_any_op(cldnn::topology& topology,
-                               const std::string& input0_name,
-                               const Shape& input0_shape,
-                               const std::string& output_name,
-                               const Shape& output_shape,
-                               const element::Type& output_type,
-                               const AxisSet& axis,
-                               const std::string& operation,
-                               const std::string& init_val);
-        }
-    }
-}
--- a/src/ngraph/runtime/intelgpu/intelgpu_op_softmax.cpp
+++ b/src/ngraph/runtime/intelgpu/intelgpu_op_softmax.cpp
@@ -14,15 +14,13 @@
 // limitations under the License.
 //*****************************************************************************

-#include <CPP/custom_gpu_primitive.hpp>
-
 #include "ngraph/code_writer.hpp"
-#include "ngraph/runtime/intelgpu/intelgpu_layout.hpp"
+#include "ngraph/runtime/intelgpu/intelgpu_kernels.hpp"
 #include "ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp"
-#include "ngraph/runtime/intelgpu/intelgpu_op_softmax.hpp"

 using namespace std;
 using namespace ngraph;
+using namespace ngraph::runtime::intelgpu;

 static Shape shape_dims(const Shape& dimentions, const AxisSet& axis = {})
 {
@@ -45,22 +43,20 @@ static Shape shape_dims(const Shape& dimentions, const AxisSet& axis = {})
    return output_shape;
 }

-void runtime::intelgpu::do_softmax_operation(cldnn::topology& topology,
-                                             const string& input_name,
-                                             const Shape& input_shape,
-                                             const element::Type& input_type,
-                                             const string& output_name,
-                                             const Shape& output_shape,
-                                             const element::Type& output_type,
-                                             const AxisSet& axes)
+CustomKernels::krnl_info CustomKernels::build_krnl(const shared_ptr<op::Softmax>& op) const
 {
-    const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
+    const string& input_name = op->get_input_tensor_name(0);
+    const Shape& input_shape = op->get_input_shape(0);
+    const element::Type& input_type = op->get_input_element_type(0);
+    const string& output_name = op->get_output_tensor_name(0);
+    const Shape& output_shape = op->get_output_shape(0);
+    const element::Type& output_type = op->get_output_element_type(0);
+    const AxisSet& axes = op->get_axes();
    const string entry_point_name = "softmax_" + output_name;
    const string middle_name = entry_point_name + "_middle";
    const string entry_point_middle_name = "softmax_middle_" + output_name;
    const string expression = "output" + access_dims(input_shape, "i", axes) + " = 0.0f;\n";
    const Shape new_shape = shape_dims(output_shape, axes);
-    const cldnn::layout layout_middle = IntelGPULayout::create_cldnn_layout(output_type, new_shape);
    CodeWriter writer0;
    CodeWriter writer1;
    vector<size_t> gws;
@@ -81,15 +77,13 @@ void runtime::intelgpu::do_softmax_operation(cldnn::topology& topology,
    }
    writer0.block_end();

-    const cldnn::custom_gpu_primitive op_softmax_middle(middle_name,
-                                                        {input_name},
-                                                        {writer0.get_code()},
-                                                        entry_point_middle_name,
-                                                        get_kernel_args(1, 1),
-                                                        "",
-                                                        layout_middle,
-                                                        gws);
-    topology.add(op_softmax_middle);
+    const CustomKernelInfo op_softmax_middle(middle_name,
+                                             new_shape,
+                                             output_type,
+                                             {input_name},
+                                             {writer0.get_code()},
+                                             entry_point_middle_name,
+                                             gws);

    writer1 << "__kernel void " << entry_point_name << "(const __global "
            << get_opencl_type_name(input_type) << " input0" << array_dims(input_shape)
@@ -107,13 +101,12 @@ void runtime::intelgpu::do_softmax_operation(cldnn::topology& topology,
    }
    writer1.block_end();

-    const cldnn::custom_gpu_primitive op_softmax(output_name,
-                                                 {input_name, middle_name},
-                                                 {writer1.get_code()},
-                                                 entry_point_name,
-                                                 get_kernel_args(2, 1),
-                                                 "",
-                                                 layout,
-                                                 gws);
-    topology.add(op_softmax);
+    const CustomKernelInfo op_softmax(output_name,
+                                      output_shape,
+                                      output_type,
+                                      {input_name, middle_name},
+                                      {writer1.get_code()},
+                                      entry_point_name,
+                                      gws);
+    return {op_softmax_middle, op_softmax};
 }
--- a/src/ngraph/runtime/intelgpu/intelgpu_op_softmax.hpp
+++ b/src/ngraph/runtime/intelgpu/intelgpu_op_softmax.hpp
-//*****************************************************************************
-// Copyright 2017-2019 Intel Corporation
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-//*****************************************************************************
-
-#pragma once
-
-#include <CPP/topology.hpp>
-
-#include "ngraph/shape.hpp"
-#include "ngraph/type/element_type.hpp"
-
-namespace ngraph
-{
-    namespace runtime
-    {
-        namespace intelgpu
-        {
-            void do_softmax_operation(cldnn::topology& topology,
-                                      const std::string& input_name,
-                                      const Shape& input_shape,
-                                      const element::Type& input_type,
-                                      const std::string& output_name,
-                                      const Shape& output_shape,
-                                      const element::Type& output_type,
-                                      const AxisSet& axes);
-        }
-    }
-}