Commit 86641478 authored by Sergey Shalnov, committed by Scott Cyphers

IntelGPU backend: Custom kernels refactoring (#2757)

* IntelGPU backend: Custom kernels refactoring

* IntelGPU backend: remove unused header
parent d46330de
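In short: the per-operation free functions (do_slice_operation, do_select_operation, do_convolution_operation) each took a long positional argument list that every call site had to unpack from the op by hand. This refactoring moves that unpacking behind a single CustomKernels helper keyed on the op type. A minimal before/after sketch of the call-site change, taken from the diff below (`elem` is the backend's loop variable over ops, as it appears in the diff):

    // Before: nine positional arguments unpacked at the call site.
    do_slice_operation(topology,
                       op->get_input_tensor_name(0),
                       op->get_input_shape(0),
                       op->get_output_tensor_name(0),
                       op->get_output_shape(0),
                       op->get_output_element_type(0),
                       lower_bounds,
                       upper_bounds,
                       strides);

    // After: the node itself carries all of that information.
    CustomKernels kern(topology); // constructed once per compile() call
    kern.emit<op::Slice>(elem);   // picks build_krnl(const shared_ptr<op::Slice>&)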
@@ -19,6 +19,7 @@ set(SRC
     intelgpu_executable.cpp
     intelgpu_tensor_view.cpp
     intelgpu_layout.cpp
+    intelgpu_kernels.cpp
     intelgpu_op_batchnorm.cpp
     intelgpu_op_broadcast.cpp
     intelgpu_op_custom_kernels.cpp
...
@@ -50,10 +50,10 @@
 #include "ngraph/pass/reshape_elimination.hpp"
 #include "ngraph/runtime/intelgpu/intelgpu_backend.hpp"
 #include "ngraph/runtime/intelgpu/intelgpu_executable.hpp"
+#include "ngraph/runtime/intelgpu/intelgpu_kernels.hpp"
 #include "ngraph/runtime/intelgpu/intelgpu_layout.hpp"
 #include "ngraph/runtime/intelgpu/intelgpu_op_batchnorm.hpp"
 #include "ngraph/runtime/intelgpu/intelgpu_op_broadcast.hpp"
-#include "ngraph/runtime/intelgpu/intelgpu_op_convolution.hpp"
 #include "ngraph/runtime/intelgpu/intelgpu_op_custom_func_call.hpp"
 #include "ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp"
 #include "ngraph/runtime/intelgpu/intelgpu_op_softmax.hpp"
@@ -89,6 +89,7 @@
 #include "ngraph/op/reshape.hpp"
 #include "ngraph/op/reverse.hpp"
 #include "ngraph/op/reverse_sequence.hpp"
+#include "ngraph/op/select.hpp"
 #include "ngraph/op/slice.hpp"
 #include "ngraph/op/softmax.hpp"
 #include "ngraph/op/sum.hpp"
@@ -391,6 +392,7 @@ shared_ptr<runtime::Executable>
     set<cldnn::primitive_id> func_output_names;
     cldnn::topology topology;
+    CustomKernels kern(topology);
     stopwatch timer_compile;
     double consumed_memory = 0.0;
     double compilation_time = 0.0;
@@ -487,15 +489,7 @@ shared_ptr<runtime::Executable>
             }
             else
             {
-                do_slice_operation(topology,
-                                   op->get_input_tensor_name(0),
-                                   op->get_input_shape(0),
-                                   op->get_output_tensor_name(0),
-                                   op->get_output_shape(0),
-                                   op->get_output_element_type(0),
-                                   lower_bounds,
-                                   upper_bounds,
-                                   strides);
+                kern.emit<op::Slice>(elem);
             }
             break;
         }
@@ -505,16 +499,7 @@ shared_ptr<runtime::Executable>
             if (op->get_output_element_type(0) != element::f32)
             {
-                do_select_operation(topology,
-                                    op->get_input_tensor_name(0),
-                                    op->get_input_shape(0),
-                                    op->get_input_tensor_name(1),
-                                    op->get_input_shape(1),
-                                    op->get_input_tensor_name(2),
-                                    op->get_input_shape(2),
-                                    op->get_output_tensor_name(0),
-                                    op->get_output_shape(0),
-                                    op->get_output_element_type(0));
+                kern.emit<op::Select>(static_pointer_cast<op::Select>(op));
             }
             else
             {
@@ -1605,25 +1590,7 @@ shared_ptr<runtime::Executable>
                 (data_dilation.at(0) != 1) || (data_dilation.at(1) != 1) ||
                 (op->get_output_element_type(0) != element::f32))
             {
-                do_convolution_operation(topology,
-                                         op->get_input_tensor_name(0),
-                                         op->get_input_shape(0),
-                                         op->get_input_tensor_name(1),
-                                         op->get_input_shape(1),
-                                         op->get_output_tensor_name(0),
-                                         op->get_output_shape(0),
-                                         op->get_output_element_type(0),
-                                         conv_op->get_padding_below(),
-                                         conv_op->get_window_movement_strides(),
-                                         conv_op->get_window_dilation_strides(),
-                                         conv_op->get_data_dilation_strides(),
-                                         0,
-                                         1,
-                                         1,
-                                         "input[batch][input_channel]",
-                                         "filter[output_channel][input_channel]",
-                                         "output[batch][output_channel]",
-                                         false);
+                kern.emit<op::Convolution>(conv_op);
             }
             else
             {
@@ -1691,25 +1658,7 @@ shared_ptr<runtime::Executable>
                 (win_dilation.size() != 2) || (op->get_output_element_type(0) != element::f32) ||
                 proceed_with_custom_kernel)
             {
-                do_convolution_operation(topology,
-                                         op->get_input_tensor_name(0),
-                                         op->get_input_shape(0),
-                                         op->get_input_tensor_name(1),
-                                         op->get_input_shape(1),
-                                         op->get_output_tensor_name(0),
-                                         op->get_output_shape(0),
-                                         op->get_output_element_type(0),
-                                         conv_op->get_padding_below_forward(),
-                                         win_stride,
-                                         win_dilation,
-                                         data_dilation,
-                                         1,
-                                         0,
-                                         0,
-                                         "input[input_channel][batch]",
-                                         "filter[input_channel][output_channel]",
-                                         "output[output_channel][batch]",
-                                         false);
+                kern.emit<op::ConvolutionBackpropFilters>(conv_op);
             }
             else
             {
@@ -1793,25 +1742,7 @@ shared_ptr<runtime::Executable>
                 (win_dilation.at(1) != 1) || (op->get_output_element_type(0) != element::f32) ||
                 ((pad_below.at(0) == pad_above.at(0)) && (pad_below.at(1) == pad_above.at(1))))
             {
-                do_convolution_operation(topology,
-                                         op->get_input_tensor_name(1),
-                                         op->get_input_shape(1),
-                                         op->get_input_tensor_name(0),
-                                         op->get_input_shape(0),
-                                         op->get_output_tensor_name(0),
-                                         op->get_output_shape(0),
-                                         op->get_output_element_type(0),
-                                         pad_below,
-                                         win_stride,
-                                         win_dilation,
-                                         data_dilation,
-                                         0,
-                                         1,
-                                         1,
-                                         "input[batch][input_channel]",
-                                         "filter[input_channel][output_channel]",
-                                         "output[batch][output_channel]",
-                                         true);
+                kern.emit<op::ConvolutionBackpropData>(conv_op);
             }
             else
             {
...
@@ -14,42 +14,33 @@
 // limitations under the License.
 //*****************************************************************************
-#pragma once
-
-#include <CPP/topology.hpp>
-
-#include "ngraph/coordinate_diff.hpp"
-#include "ngraph/shape.hpp"
-#include "ngraph/strides.hpp"
-#include "ngraph/type/element_type.hpp"
-
-namespace ngraph
-{
-    namespace runtime
-    {
-        namespace intelgpu
-        {
-            // This implements Convolution nGraph operation
-            // nGraph uses channels in this operation but clDNN uses full input data
-            void do_convolution_operation(cldnn::topology& topology,
-                                          const std::string& input_name,
-                                          const Shape& input_shape,
-                                          const std::string& filter_name,
-                                          const Shape& filter_shape,
-                                          const std::string& output_name,
-                                          const Shape& output_shape,
-                                          const element::Type& output_type,
-                                          const CoordinateDiff& pad_below,
-                                          const Strides& win_stride,
-                                          const Strides& win_dilation,
-                                          const Strides& data_dilation,
-                                          size_t batch_axis_data,
-                                          size_t input_channel_axis_data,
-                                          size_t output_channel_axis_result,
-                                          const std::string& input_order,
-                                          const std::string& filter_order,
-                                          const std::string& output_order,
-                                          bool reverse_filter);
-        }
-    }
-}
+#include <CPP/custom_gpu_primitive.hpp>
+
+#include "ngraph/runtime/intelgpu/intelgpu_kernels.hpp"
+#include "ngraph/runtime/intelgpu/intelgpu_layout.hpp"
+#include "ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp"
+
+#include "ngraph/node.hpp"
+
+using namespace std;
+using namespace ngraph;
+
+void runtime::intelgpu::CustomKernels::queue_krnl(const krnl_info& krnl_info,
+                                                  const shared_ptr<Node>& op)
+{
+    for (const auto& kr : krnl_info)
+    {
+        const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(kr.m_type, kr.m_shape);
+
+        const cldnn::custom_gpu_primitive kernel_item(kr.m_name,
+                                                      kr.m_inputs,
+                                                      {kr.m_code},
+                                                      kr.m_entry_point,
+                                                      get_kernel_args(kr.m_inputs.size(), 1),
+                                                      "",
+                                                      layout,
+                                                      kr.m_gws,
+                                                      kr.m_lws);
+        stream.add(kernel_item);
+    }
+}
...
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <memory>
#include <string>
#include <vector>
#include <CPP/topology.hpp>
#include "ngraph/node.hpp"
#include "ngraph/op/convolution.hpp"
#include "ngraph/op/select.hpp"
#include "ngraph/op/slice.hpp"
namespace ngraph
{
    namespace runtime
    {
        namespace intelgpu
        {
            class CustomKernelInfo;
            class CustomKernels;
        }
    }
}

class ngraph::runtime::intelgpu::CustomKernelInfo
{
public:
    CustomKernelInfo(const std::string& name,
                     const Shape& shape,
                     const element::Type& type,
                     const std::vector<std::string>& inputs,
                     const std::string& code,
                     const std::string& entry_point,
                     const std::vector<size_t>& gws = {1},
                     const std::vector<size_t>& lws = {1})
    {
        m_name = name;
        m_shape = shape;
        m_type = type;
        m_inputs = inputs;
        m_code = code;
        m_entry_point = entry_point;
        m_gws = gws;
        m_lws = lws;
    }

    std::string m_name;
    Shape m_shape;
    element::Type m_type;
    std::vector<std::string> m_inputs;
    std::string m_code;
    std::string m_entry_point;
    std::vector<size_t> m_gws;
    std::vector<size_t> m_lws;
};

class ngraph::runtime::intelgpu::CustomKernels
{
public:
    using krnl_info = std::vector<CustomKernelInfo>;

    explicit CustomKernels(cldnn::topology& backend_stream)
        : stream(backend_stream)
    {
        m_count_krnls = 0;
    }

    template <typename OP>
    void emit(const std::shared_ptr<OP>& op)
    {
        krnl_info krnl_info;

        krnl_info = build_krnl(op);
        queue_krnl(krnl_info, op);
        ++m_count_krnls;
    }

    size_t get_custom_kernel_count() const { return m_count_krnls; }
private:
    void queue_krnl(const krnl_info& krn_info, const std::shared_ptr<Node>& op);

    krnl_info build_krnl(const std::shared_ptr<op::Convolution>& op) const;
    krnl_info build_krnl(const std::shared_ptr<op::ConvolutionBackpropData>& op) const;
    krnl_info build_krnl(const std::shared_ptr<op::ConvolutionBackpropFilters>& op) const;
    krnl_info build_krnl(const std::shared_ptr<op::Select>& op) const;
    krnl_info build_krnl(const std::shared_ptr<op::Slice>& op) const;

    cldnn::topology& stream;
    size_t m_count_krnls;
};
...
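Since emit<OP> is a template, the matching build_krnl overload is resolved at compile time from the static type of the shared_ptr, so supporting a new op means adding one build_krnl overload plus one emit call. A usage sketch (the surrounding dispatch loop and the `op` variable are assumed, not shown in this diff):

    cldnn::topology topology;
    CustomKernels kern(topology);

    // Inside the backend's per-op dispatch (hypothetical excerpt):
    const shared_ptr<op::Slice> slice_op = static_pointer_cast<op::Slice>(op);
    kern.emit<op::Slice>(slice_op); // build_krnl(Slice) -> queue_krnl -> topology.add

    // Afterwards the backend can ask how many custom kernels were generated:
    const size_t n = kern.get_custom_kernel_count();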
@@ -21,6 +21,7 @@
 #include <CPP/custom_gpu_primitive.hpp>
 #include <CPP/reshape.hpp>
+#include "ngraph/runtime/intelgpu/intelgpu_kernels.hpp"
 #include "ngraph/runtime/intelgpu/intelgpu_layout.hpp"
 #include "ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp"
@@ -28,6 +29,7 @@
 using namespace std;
 using namespace ngraph;
+using namespace ngraph::runtime::intelgpu;
@@ -1036,17 +1038,16 @@ void runtime::intelgpu::do_dot_operation(cldnn::topology& topology,
     topology.add(op_dot);
 }
-void runtime::intelgpu::do_slice_operation(cldnn::topology& topology,
-                                           const string& input_name,
-                                           const Shape& input_shape,
-                                           const string& output_name,
-                                           const Shape& output_shape,
-                                           const element::Type& output_type,
-                                           const Coordinate& lower_bounds,
-                                           const Coordinate& uppper_bounds,
-                                           const Strides& strides)
+CustomKernels::krnl_info CustomKernels::build_krnl(const shared_ptr<op::Slice>& op) const
 {
-    const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
+    const string& input_name = op->get_input_tensor_name(0);
+    const Shape& input_shape = op->get_input_shape(0);
+    const string& output_name = op->get_output_tensor_name(0);
+    const Shape& output_shape = op->get_output_shape(0);
+    const element::Type& output_type = op->get_output_element_type(0);
+    const Coordinate& lower_bounds = op->get_lower_bounds();
+    const Coordinate& uppper_bounds = op->get_upper_bounds();
+    const Strides& strides = op->get_strides();
     const string entry_point_name = "slice_" + output_name;
     CodeWriter writer;
     vector<size_t> gws;
@@ -1071,15 +1072,14 @@ void runtime::intelgpu::do_slice_operation(cldnn::topology& topology,
     }
     writer.block_end();
-    const cldnn::custom_gpu_primitive op_slice(output_name,
-                                               {input_name},
-                                               {writer.get_code()},
-                                               entry_point_name,
-                                               get_kernel_args(1, 1),
-                                               "",
-                                               layout,
-                                               gws);
-    topology.add(op_slice);
+    const CustomKernelInfo krn_ret(output_name,
+                                   output_shape,
+                                   output_type,
+                                   {input_name},
+                                   {writer.get_code()},
+                                   entry_point_name,
+                                   gws);
+    return {krn_ret};
 }
 void runtime::intelgpu::do_concat_operation(cldnn::topology& topology,
@@ -1225,18 +1225,17 @@ void runtime::intelgpu::do_concat_operation(cldnn::topology& topology,
     }
 }
-void runtime::intelgpu::do_select_operation(cldnn::topology& topology,
-                                            const string& input0_name,
-                                            const Shape& input0_shape,
-                                            const string& input1_name,
-                                            const Shape& input1_shape,
-                                            const string& input2_name,
-                                            const Shape& input2_shape,
-                                            const string& output_name,
-                                            const Shape& output_shape,
-                                            const element::Type& output_type)
+CustomKernels::krnl_info CustomKernels::build_krnl(const shared_ptr<op::Select>& op) const
 {
-    const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
+    const string& input0_name = op->get_input_tensor_name(0);
+    const Shape& input0_shape = op->get_input_shape(0);
+    const string& input1_name = op->get_input_tensor_name(1);
+    const Shape& input1_shape = op->get_input_shape(1);
+    const string& input2_name = op->get_input_tensor_name(2);
+    const Shape& input2_shape = op->get_input_shape(2);
+    const string& output_name = op->get_output_tensor_name(0);
+    const Shape& output_shape = op->get_output_shape(0);
+    const element::Type& output_type = op->get_output_element_type(0);
     const string entry_point_name = "select_" + output_name;
     CodeWriter writer;
     vector<size_t> gws;
@@ -1262,15 +1261,14 @@ void runtime::intelgpu::do_select_operation(cldnn::topology& topology,
     }
     writer.block_end();
-    const cldnn::custom_gpu_primitive op_select(output_name,
-                                                {input0_name, input1_name, input2_name},
-                                                {writer.get_code()},
-                                                entry_point_name,
-                                                get_kernel_args(3, 1),
-                                                "",
-                                                layout,
-                                                gws);
-    topology.add(op_select);
+    const CustomKernelInfo krn_ret(output_name,
+                                   output_shape,
+                                   output_type,
+                                   {input0_name, input1_name, input2_name},
+                                   {writer.get_code()},
+                                   entry_point_name,
+                                   gws);
+    return {krn_ret};
 }
 void runtime::intelgpu::do_logic_kernel(cldnn::topology& topology,
...
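Note the split of responsibilities this diff establishes: build_krnl only describes kernels (name, output shape and type, inputs, OpenCL source, work sizes) and returns a krnl_info vector, so one nGraph op could in principle expand into several kernels, while queue_krnl is the only code that talks to clDNN. Because gws and lws default to {1} in CustomKernelInfo, the constructors above can stop at gws. A minimal illustrative CustomKernelInfo for a scalar copy follows; the kernel source and the primitive names are invented for illustration, not taken from the diff:

    // Hypothetical single-work-item kernel description.
    const CustomKernelInfo info("out0",          // result primitive name
                                Shape{},         // scalar output
                                element::f32,
                                {"in0"},         // single input primitive
                                "__kernel void copy_scalar(const __global float* i0,"
                                " __global float* o0) { o0[0] = i0[0]; }",
                                "copy_scalar");  // entry point; gws/lws stay at {1}
    // queue_krnl() would wrap this into one cldnn::custom_gpu_primitive.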
@@ -92,16 +92,6 @@ namespace ngraph
                                     const element::Type& output_type,
                                     size_t reduction_axes_count);
-            void do_slice_operation(cldnn::topology& topology,
-                                    const std::string& input_name,
-                                    const Shape& input_shape,
-                                    const std::string& output_name,
-                                    const Shape& output_shape,
-                                    const element::Type& output_type,
-                                    const Coordinate& lower_bounds,
-                                    const Coordinate& uppper_bounds,
-                                    const Strides& strides);
             void do_concat_operation(cldnn::topology& topology,
                                      const std::vector<std::string>& input_names,
                                      const std::vector<Shape>& input_shapes,
@@ -110,17 +100,6 @@ namespace ngraph
                                      const element::Type& output_type,
                                      size_t concat_axis);
-            void do_select_operation(cldnn::topology& topology,
-                                     const std::string& input0_name,
-                                     const Shape& input0_shape,
-                                     const std::string& input1_name,
-                                     const Shape& input1_shape,
-                                     const std::string& input2_name,
-                                     const Shape& input2_shape,
-                                     const std::string& output_name,
-                                     const Shape& output_shape,
-                                     const element::Type& output_type);
             void do_logic_kernel(cldnn::topology& topology,
                                  const std::string& input0_name,
                                  const Shape& input0_shape,
...