Commit 86641478 authored by Sergey Shalnov's avatar Sergey Shalnov Committed by Scott Cyphers

IntelGPU backend: Custom kernels refactoring (#2757)

* IntelGPU backend: Custom kernels refactoring

* IntelGPU backend: remove unused header
parent d46330de
......@@ -19,6 +19,7 @@ set(SRC
intelgpu_executable.cpp
intelgpu_tensor_view.cpp
intelgpu_layout.cpp
intelgpu_kernels.cpp
intelgpu_op_batchnorm.cpp
intelgpu_op_broadcast.cpp
intelgpu_op_custom_kernels.cpp
......
......@@ -50,10 +50,10 @@
#include "ngraph/pass/reshape_elimination.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_backend.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_executable.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_kernels.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_layout.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_batchnorm.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_broadcast.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_convolution.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_custom_func_call.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_softmax.hpp"
......@@ -89,6 +89,7 @@
#include "ngraph/op/reshape.hpp"
#include "ngraph/op/reverse.hpp"
#include "ngraph/op/reverse_sequence.hpp"
#include "ngraph/op/select.hpp"
#include "ngraph/op/slice.hpp"
#include "ngraph/op/softmax.hpp"
#include "ngraph/op/sum.hpp"
......@@ -391,6 +392,7 @@ shared_ptr<runtime::Executable>
set<cldnn::primitive_id> func_output_names;
cldnn::topology topology;
CustomKernels kern(topology);
stopwatch timer_compile;
double consumed_memory = 0.0;
double compilation_time = 0.0;
......@@ -487,15 +489,7 @@ shared_ptr<runtime::Executable>
}
else
{
do_slice_operation(topology,
op->get_input_tensor_name(0),
op->get_input_shape(0),
op->get_output_tensor_name(0),
op->get_output_shape(0),
op->get_output_element_type(0),
lower_bounds,
upper_bounds,
strides);
kern.emit<op::Slice>(elem);
}
break;
}
......@@ -505,16 +499,7 @@ shared_ptr<runtime::Executable>
if (op->get_output_element_type(0) != element::f32)
{
do_select_operation(topology,
op->get_input_tensor_name(0),
op->get_input_shape(0),
op->get_input_tensor_name(1),
op->get_input_shape(1),
op->get_input_tensor_name(2),
op->get_input_shape(2),
op->get_output_tensor_name(0),
op->get_output_shape(0),
op->get_output_element_type(0));
kern.emit<op::Select>(static_pointer_cast<op::Select>(op));
}
else
{
......@@ -1605,25 +1590,7 @@ shared_ptr<runtime::Executable>
(data_dilation.at(0) != 1) || (data_dilation.at(1) != 1) ||
(op->get_output_element_type(0) != element::f32))
{
do_convolution_operation(topology,
op->get_input_tensor_name(0),
op->get_input_shape(0),
op->get_input_tensor_name(1),
op->get_input_shape(1),
op->get_output_tensor_name(0),
op->get_output_shape(0),
op->get_output_element_type(0),
conv_op->get_padding_below(),
conv_op->get_window_movement_strides(),
conv_op->get_window_dilation_strides(),
conv_op->get_data_dilation_strides(),
0,
1,
1,
"input[batch][input_channel]",
"filter[output_channel][input_channel]",
"output[batch][output_channel]",
false);
kern.emit<op::Convolution>(conv_op);
}
else
{
......@@ -1691,25 +1658,7 @@ shared_ptr<runtime::Executable>
(win_dilation.size() != 2) || (op->get_output_element_type(0) != element::f32) ||
proceed_with_custom_kernel)
{
do_convolution_operation(topology,
op->get_input_tensor_name(0),
op->get_input_shape(0),
op->get_input_tensor_name(1),
op->get_input_shape(1),
op->get_output_tensor_name(0),
op->get_output_shape(0),
op->get_output_element_type(0),
conv_op->get_padding_below_forward(),
win_stride,
win_dilation,
data_dilation,
1,
0,
0,
"input[input_channel][batch]",
"filter[input_channel][output_channel]",
"output[output_channel][batch]",
false);
kern.emit<op::ConvolutionBackpropFilters>(conv_op);
}
else
{
......@@ -1793,25 +1742,7 @@ shared_ptr<runtime::Executable>
(win_dilation.at(1) != 1) || (op->get_output_element_type(0) != element::f32) ||
((pad_below.at(0) == pad_above.at(0)) && (pad_below.at(1) == pad_above.at(1))))
{
do_convolution_operation(topology,
op->get_input_tensor_name(1),
op->get_input_shape(1),
op->get_input_tensor_name(0),
op->get_input_shape(0),
op->get_output_tensor_name(0),
op->get_output_shape(0),
op->get_output_element_type(0),
pad_below,
win_stride,
win_dilation,
data_dilation,
0,
1,
1,
"input[batch][input_channel]",
"filter[input_channel][output_channel]",
"output[batch][output_channel]",
true);
kern.emit<op::ConvolutionBackpropData>(conv_op);
}
else
{
......
......@@ -14,42 +14,33 @@
// limitations under the License.
//*****************************************************************************
#pragma once
#include <CPP/custom_gpu_primitive.hpp>
#include <CPP/topology.hpp>
#include "ngraph/runtime/intelgpu/intelgpu_kernels.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_layout.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp"
#include "ngraph/coordinate_diff.hpp"
#include "ngraph/shape.hpp"
#include "ngraph/strides.hpp"
#include "ngraph/type/element_type.hpp"
#include "ngraph/node.hpp"
namespace ngraph
using namespace std;
using namespace ngraph;
void runtime::intelgpu::CustomKernels::queue_krnl(const krnl_info& krnl_info,
const shared_ptr<Node>& op)
{
namespace runtime
for (const auto& kr : krnl_info)
{
namespace intelgpu
{
// This implements Convolution nGraph operation
// nGraph uses channels in this operation but clDNN uses full input data
void do_convolution_operation(cldnn::topology& topology,
const std::string& input_name,
const Shape& input_shape,
const std::string& filter_name,
const Shape& filter_shape,
const std::string& output_name,
const Shape& output_shape,
const element::Type& output_type,
const CoordinateDiff& pad_below,
const Strides& win_stride,
const Strides& win_dilation,
const Strides& data_dilation,
size_t batch_axis_data,
size_t input_channel_axis_data,
size_t output_channel_axis_result,
const std::string& input_order,
const std::string& filter_order,
const std::string& output_order,
bool reverse_filter);
}
const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(kr.m_type, kr.m_shape);
const cldnn::custom_gpu_primitive kernel_item(kr.m_name,
kr.m_inputs,
{kr.m_code},
kr.m_entry_point,
get_kernel_args(kr.m_inputs.size(), 1),
"",
layout,
kr.m_gws,
kr.m_lws);
stream.add(kernel_item);
}
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <memory>
#include <string>
#include <vector>
#include <CPP/topology.hpp>
#include "ngraph/node.hpp"
#include "ngraph/op/convolution.hpp"
#include "ngraph/op/select.hpp"
#include "ngraph/op/slice.hpp"
namespace ngraph
{
namespace runtime
{
namespace intelgpu
{
class CustomKernelInfo;
class CustomKernels;
}
}
}
// Describes one custom OpenCL kernel: the output tensor it produces
// (name/shape/type), its input tensor names, the OpenCL C source with its
// entry point, and the work-group launch configuration.
class ngraph::runtime::intelgpu::CustomKernelInfo
{
public:
    /// \param name        Output tensor (primitive) name
    /// \param shape       Output tensor shape
    /// \param type        Output tensor element type
    /// \param inputs      Names of input tensors the kernel reads
    /// \param code        OpenCL C source of the kernel
    /// \param entry_point Name of the kernel function inside \p code
    /// \param gws         Global work size (defaults to a single work item)
    /// \param lws         Local work size (defaults to a single work item)
    CustomKernelInfo(const std::string& name,
                     const Shape& shape,
                     const element::Type& type,
                     const std::vector<std::string>& inputs,
                     const std::string& code,
                     const std::string& entry_point,
                     const std::vector<size_t>& gws = {1},
                     const std::vector<size_t>& lws = {1})
        // Member-initializer list: constructs each member once instead of
        // default-constructing and then assigning in the body.
        : m_name(name)
        , m_shape(shape)
        , m_type(type)
        , m_inputs(inputs)
        , m_code(code)
        , m_entry_point(entry_point)
        , m_gws(gws)
        , m_lws(lws)
    {
    }

    std::string m_name;                // output tensor / primitive name
    Shape m_shape;                     // output tensor shape
    element::Type m_type;              // output element type
    std::vector<std::string> m_inputs; // input tensor names
    std::string m_code;                // OpenCL kernel source
    std::string m_entry_point;         // kernel function name in m_code
    std::vector<size_t> m_gws;         // global work size
    std::vector<size_t> m_lws;         // local work size
};
// Builds custom OpenCL kernels for nGraph operations and queues them into
// the clDNN topology supplied at construction time.
class ngraph::runtime::intelgpu::CustomKernels
{
public:
    using krnl_info = std::vector<CustomKernelInfo>;

    /// \param backend_stream Topology that emitted kernels are added to;
    ///                       must outlive this object (held by reference).
    explicit CustomKernels(cldnn::topology& backend_stream)
        : stream(backend_stream)
    {
    }

    /// Generates the kernel(s) for \p op (via the per-op build_krnl
    /// overload) and adds them to the topology.
    template <typename OP>
    void emit(const std::shared_ptr<OP>& op)
    {
        // Build and queue in one step; no need for a default-constructed
        // temporary that shadows the krnl_info type alias.
        queue_krnl(build_krnl(op), op);
        ++m_count_krnls;
    }

    /// Number of custom kernels emitted so far.
    size_t get_custom_kernel_count() const { return m_count_krnls; }
private:
    // Wraps each CustomKernelInfo into a cldnn primitive and adds it to stream.
    void queue_krnl(const krnl_info& krn_info, const std::shared_ptr<Node>& op);

    // One overload per operation implemented as a custom kernel.
    krnl_info build_krnl(const std::shared_ptr<op::Convolution>& op) const;
    krnl_info build_krnl(const std::shared_ptr<op::ConvolutionBackpropData>& op) const;
    krnl_info build_krnl(const std::shared_ptr<op::ConvolutionBackpropFilters>& op) const;
    krnl_info build_krnl(const std::shared_ptr<op::Select>& op) const;
    krnl_info build_krnl(const std::shared_ptr<op::Slice>& op) const;

    cldnn::topology& stream;    // target topology (not owned)
    size_t m_count_krnls = 0;   // in-class init instead of ctor-body assignment
};
......@@ -21,6 +21,7 @@
#include <CPP/custom_gpu_primitive.hpp>
#include <CPP/reshape.hpp>
#include "ngraph/runtime/intelgpu/intelgpu_kernels.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_layout.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp"
......@@ -28,6 +29,7 @@
using namespace std;
using namespace ngraph;
using namespace ngraph::runtime::intelgpu;
string runtime::intelgpu::get_opencl_type_name(const element::Type& ngraph_type)
{
......@@ -1036,17 +1038,16 @@ void runtime::intelgpu::do_dot_operation(cldnn::topology& topology,
topology.add(op_dot);
}
void runtime::intelgpu::do_slice_operation(cldnn::topology& topology,
const string& input_name,
const Shape& input_shape,
const string& output_name,
const Shape& output_shape,
const element::Type& output_type,
const Coordinate& lower_bounds,
const Coordinate& uppper_bounds,
const Strides& strides)
CustomKernels::krnl_info CustomKernels::build_krnl(const shared_ptr<op::Slice>& op) const
{
const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
const string& input_name = op->get_input_tensor_name(0);
const Shape& input_shape = op->get_input_shape(0);
const string& output_name = op->get_output_tensor_name(0);
const Shape& output_shape = op->get_output_shape(0);
const element::Type& output_type = op->get_output_element_type(0);
const Coordinate& lower_bounds = op->get_lower_bounds();
const Coordinate& uppper_bounds = op->get_upper_bounds();
const Strides& strides = op->get_strides();
const string entry_point_name = "slice_" + output_name;
CodeWriter writer;
vector<size_t> gws;
......@@ -1071,15 +1072,14 @@ void runtime::intelgpu::do_slice_operation(cldnn::topology& topology,
}
writer.block_end();
const cldnn::custom_gpu_primitive op_slice(output_name,
{input_name},
{writer.get_code()},
entry_point_name,
get_kernel_args(1, 1),
"",
layout,
gws);
topology.add(op_slice);
const CustomKernelInfo krn_ret(output_name,
output_shape,
output_type,
{input_name},
{writer.get_code()},
entry_point_name,
gws);
return {krn_ret};
}
void runtime::intelgpu::do_concat_operation(cldnn::topology& topology,
......@@ -1225,18 +1225,17 @@ void runtime::intelgpu::do_concat_operation(cldnn::topology& topology,
}
}
void runtime::intelgpu::do_select_operation(cldnn::topology& topology,
const string& input0_name,
const Shape& input0_shape,
const string& input1_name,
const Shape& input1_shape,
const string& input2_name,
const Shape& input2_shape,
const string& output_name,
const Shape& output_shape,
const element::Type& output_type)
CustomKernels::krnl_info CustomKernels::build_krnl(const shared_ptr<op::Select>& op) const
{
const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
const string& input0_name = op->get_input_tensor_name(0);
const Shape& input0_shape = op->get_input_shape(0);
const string& input1_name = op->get_input_tensor_name(1);
const Shape& input1_shape = op->get_input_shape(1);
const string& input2_name = op->get_input_tensor_name(2);
const Shape& input2_shape = op->get_input_shape(2);
const string& output_name = op->get_output_tensor_name(0);
const Shape& output_shape = op->get_output_shape(0);
const element::Type& output_type = op->get_output_element_type(0);
const string entry_point_name = "select_" + output_name;
CodeWriter writer;
vector<size_t> gws;
......@@ -1262,15 +1261,14 @@ void runtime::intelgpu::do_select_operation(cldnn::topology& topology,
}
writer.block_end();
const cldnn::custom_gpu_primitive op_select(output_name,
{input0_name, input1_name, input2_name},
{writer.get_code()},
entry_point_name,
get_kernel_args(3, 1),
"",
layout,
gws);
topology.add(op_select);
const CustomKernelInfo krn_ret(output_name,
output_shape,
output_type,
{input0_name, input1_name, input2_name},
{writer.get_code()},
entry_point_name,
gws);
return {krn_ret};
}
void runtime::intelgpu::do_logic_kernel(cldnn::topology& topology,
......
......@@ -92,16 +92,6 @@ namespace ngraph
const element::Type& output_type,
size_t reduction_axes_count);
void do_slice_operation(cldnn::topology& topology,
const std::string& input_name,
const Shape& input_shape,
const std::string& output_name,
const Shape& output_shape,
const element::Type& output_type,
const Coordinate& lower_bounds,
const Coordinate& uppper_bounds,
const Strides& strides);
void do_concat_operation(cldnn::topology& topology,
const std::vector<std::string>& input_names,
const std::vector<Shape>& input_shapes,
......@@ -110,17 +100,6 @@ namespace ngraph
const element::Type& output_type,
size_t concat_axis);
void do_select_operation(cldnn::topology& topology,
const std::string& input0_name,
const Shape& input0_shape,
const std::string& input1_name,
const Shape& input1_shape,
const std::string& input2_name,
const Shape& input2_shape,
const std::string& output_name,
const Shape& output_shape,
const element::Type& output_type);
void do_logic_kernel(cldnn::topology& topology,
const std::string& input0_name,
const Shape& input0_shape,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment