Commit 86641478 authored by Sergey Shalnov's avatar Sergey Shalnov Committed by Scott Cyphers

IntelGPU backend: Custom kernels refactoring (#2757)

* IntelGPU backend: Custom kernels refactoring

* IntelGPU backend: remove unused header
parent d46330de
......@@ -19,6 +19,7 @@ set(SRC
intelgpu_executable.cpp
intelgpu_tensor_view.cpp
intelgpu_layout.cpp
intelgpu_kernels.cpp
intelgpu_op_batchnorm.cpp
intelgpu_op_broadcast.cpp
intelgpu_op_custom_kernels.cpp
......
......@@ -50,10 +50,10 @@
#include "ngraph/pass/reshape_elimination.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_backend.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_executable.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_kernels.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_layout.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_batchnorm.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_broadcast.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_convolution.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_custom_func_call.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_softmax.hpp"
......@@ -89,6 +89,7 @@
#include "ngraph/op/reshape.hpp"
#include "ngraph/op/reverse.hpp"
#include "ngraph/op/reverse_sequence.hpp"
#include "ngraph/op/select.hpp"
#include "ngraph/op/slice.hpp"
#include "ngraph/op/softmax.hpp"
#include "ngraph/op/sum.hpp"
......@@ -391,6 +392,7 @@ shared_ptr<runtime::Executable>
set<cldnn::primitive_id> func_output_names;
cldnn::topology topology;
CustomKernels kern(topology);
stopwatch timer_compile;
double consumed_memory = 0.0;
double compilation_time = 0.0;
......@@ -487,15 +489,7 @@ shared_ptr<runtime::Executable>
}
else
{
do_slice_operation(topology,
op->get_input_tensor_name(0),
op->get_input_shape(0),
op->get_output_tensor_name(0),
op->get_output_shape(0),
op->get_output_element_type(0),
lower_bounds,
upper_bounds,
strides);
kern.emit<op::Slice>(elem);
}
break;
}
......@@ -505,16 +499,7 @@ shared_ptr<runtime::Executable>
if (op->get_output_element_type(0) != element::f32)
{
do_select_operation(topology,
op->get_input_tensor_name(0),
op->get_input_shape(0),
op->get_input_tensor_name(1),
op->get_input_shape(1),
op->get_input_tensor_name(2),
op->get_input_shape(2),
op->get_output_tensor_name(0),
op->get_output_shape(0),
op->get_output_element_type(0));
kern.emit<op::Select>(static_pointer_cast<op::Select>(op));
}
else
{
......@@ -1605,25 +1590,7 @@ shared_ptr<runtime::Executable>
(data_dilation.at(0) != 1) || (data_dilation.at(1) != 1) ||
(op->get_output_element_type(0) != element::f32))
{
do_convolution_operation(topology,
op->get_input_tensor_name(0),
op->get_input_shape(0),
op->get_input_tensor_name(1),
op->get_input_shape(1),
op->get_output_tensor_name(0),
op->get_output_shape(0),
op->get_output_element_type(0),
conv_op->get_padding_below(),
conv_op->get_window_movement_strides(),
conv_op->get_window_dilation_strides(),
conv_op->get_data_dilation_strides(),
0,
1,
1,
"input[batch][input_channel]",
"filter[output_channel][input_channel]",
"output[batch][output_channel]",
false);
kern.emit<op::Convolution>(conv_op);
}
else
{
......@@ -1691,25 +1658,7 @@ shared_ptr<runtime::Executable>
(win_dilation.size() != 2) || (op->get_output_element_type(0) != element::f32) ||
proceed_with_custom_kernel)
{
do_convolution_operation(topology,
op->get_input_tensor_name(0),
op->get_input_shape(0),
op->get_input_tensor_name(1),
op->get_input_shape(1),
op->get_output_tensor_name(0),
op->get_output_shape(0),
op->get_output_element_type(0),
conv_op->get_padding_below_forward(),
win_stride,
win_dilation,
data_dilation,
1,
0,
0,
"input[input_channel][batch]",
"filter[input_channel][output_channel]",
"output[output_channel][batch]",
false);
kern.emit<op::ConvolutionBackpropFilters>(conv_op);
}
else
{
......@@ -1793,25 +1742,7 @@ shared_ptr<runtime::Executable>
(win_dilation.at(1) != 1) || (op->get_output_element_type(0) != element::f32) ||
((pad_below.at(0) == pad_above.at(0)) && (pad_below.at(1) == pad_above.at(1))))
{
do_convolution_operation(topology,
op->get_input_tensor_name(1),
op->get_input_shape(1),
op->get_input_tensor_name(0),
op->get_input_shape(0),
op->get_output_tensor_name(0),
op->get_output_shape(0),
op->get_output_element_type(0),
pad_below,
win_stride,
win_dilation,
data_dilation,
0,
1,
1,
"input[batch][input_channel]",
"filter[input_channel][output_channel]",
"output[batch][output_channel]",
true);
kern.emit<op::ConvolutionBackpropData>(conv_op);
}
else
{
......
......@@ -14,42 +14,33 @@
// limitations under the License.
//*****************************************************************************
#pragma once
#include <CPP/custom_gpu_primitive.hpp>
#include <CPP/topology.hpp>
#include "ngraph/runtime/intelgpu/intelgpu_kernels.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_layout.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp"
#include "ngraph/coordinate_diff.hpp"
#include "ngraph/shape.hpp"
#include "ngraph/strides.hpp"
#include "ngraph/type/element_type.hpp"
#include "ngraph/node.hpp"
namespace ngraph
using namespace std;
using namespace ngraph;
void runtime::intelgpu::CustomKernels::queue_krnl(const krnl_info& krnl_info,
const shared_ptr<Node>& op)
{
namespace runtime
for (const auto& kr : krnl_info)
{
namespace intelgpu
{
// This implements Convolution nGraph operation
// nGraph uses channels in this operation but clDNN uses full input data
void do_convolution_operation(cldnn::topology& topology,
const std::string& input_name,
const Shape& input_shape,
const std::string& filter_name,
const Shape& filter_shape,
const std::string& output_name,
const Shape& output_shape,
const element::Type& output_type,
const CoordinateDiff& pad_below,
const Strides& win_stride,
const Strides& win_dilation,
const Strides& data_dilation,
size_t batch_axis_data,
size_t input_channel_axis_data,
size_t output_channel_axis_result,
const std::string& input_order,
const std::string& filter_order,
const std::string& output_order,
bool reverse_filter);
}
const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(kr.m_type, kr.m_shape);
const cldnn::custom_gpu_primitive kernel_item(kr.m_name,
kr.m_inputs,
{kr.m_code},
kr.m_entry_point,
get_kernel_args(kr.m_inputs.size(), 1),
"",
layout,
kr.m_gws,
kr.m_lws);
stream.add(kernel_item);
}
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <memory>
#include <string>
#include <vector>
#include <CPP/topology.hpp>
#include "ngraph/node.hpp"
#include "ngraph/op/convolution.hpp"
#include "ngraph/op/select.hpp"
#include "ngraph/op/slice.hpp"
namespace ngraph
{
namespace runtime
{
namespace intelgpu
{
class CustomKernelInfo;
class CustomKernels;
}
}
}
// Describes one custom OpenCL kernel: the output tensor it produces
// (name/shape/type), its input tensor names, the OpenCL C source with its
// entry point, and the work-group launch configuration.
class ngraph::runtime::intelgpu::CustomKernelInfo
{
public:
    /// \param name        Output tensor (primitive) name
    /// \param shape       Output tensor shape
    /// \param type        Output tensor element type
    /// \param inputs      Names of input tensors the kernel reads
    /// \param code        OpenCL C source of the kernel
    /// \param entry_point Name of the kernel function inside \p code
    /// \param gws         Global work size (defaults to a single work item)
    /// \param lws         Local work size (defaults to a single work item)
    CustomKernelInfo(const std::string& name,
                     const Shape& shape,
                     const element::Type& type,
                     const std::vector<std::string>& inputs,
                     const std::string& code,
                     const std::string& entry_point,
                     const std::vector<size_t>& gws = {1},
                     const std::vector<size_t>& lws = {1})
        // Member-initializer list: constructs each member once instead of
        // default-constructing and then assigning in the body.
        : m_name(name)
        , m_shape(shape)
        , m_type(type)
        , m_inputs(inputs)
        , m_code(code)
        , m_entry_point(entry_point)
        , m_gws(gws)
        , m_lws(lws)
    {
    }

    std::string m_name;                // output tensor / primitive name
    Shape m_shape;                     // output tensor shape
    element::Type m_type;              // output element type
    std::vector<std::string> m_inputs; // input tensor names
    std::string m_code;                // OpenCL kernel source
    std::string m_entry_point;         // kernel function name in m_code
    std::vector<size_t> m_gws;         // global work size
    std::vector<size_t> m_lws;         // local work size
};
// Builds custom OpenCL kernels for nGraph operations and queues them into
// the clDNN topology supplied at construction time.
class ngraph::runtime::intelgpu::CustomKernels
{
public:
    using krnl_info = std::vector<CustomKernelInfo>;

    /// \param backend_stream Topology that emitted kernels are added to;
    ///                       must outlive this object (held by reference).
    explicit CustomKernels(cldnn::topology& backend_stream)
        : stream(backend_stream)
    {
    }

    /// Generates the kernel(s) for \p op (via the per-op build_krnl
    /// overload) and adds them to the topology.
    template <typename OP>
    void emit(const std::shared_ptr<OP>& op)
    {
        // Build and queue in one step; no need for a default-constructed
        // temporary that shadows the krnl_info type alias.
        queue_krnl(build_krnl(op), op);
        ++m_count_krnls;
    }

    /// Number of custom kernels emitted so far.
    size_t get_custom_kernel_count() const { return m_count_krnls; }
private:
    // Wraps each CustomKernelInfo into a cldnn primitive and adds it to stream.
    void queue_krnl(const krnl_info& krn_info, const std::shared_ptr<Node>& op);

    // One overload per operation implemented as a custom kernel.
    krnl_info build_krnl(const std::shared_ptr<op::Convolution>& op) const;
    krnl_info build_krnl(const std::shared_ptr<op::ConvolutionBackpropData>& op) const;
    krnl_info build_krnl(const std::shared_ptr<op::ConvolutionBackpropFilters>& op) const;
    krnl_info build_krnl(const std::shared_ptr<op::Select>& op) const;
    krnl_info build_krnl(const std::shared_ptr<op::Slice>& op) const;

    cldnn::topology& stream;    // target topology (not owned)
    size_t m_count_krnls = 0;   // in-class init instead of ctor-body assignment
};
......@@ -21,6 +21,7 @@
#include <CPP/custom_gpu_primitive.hpp>
#include <CPP/reshape.hpp>
#include "ngraph/runtime/intelgpu/intelgpu_kernels.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_layout.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp"
......@@ -28,6 +29,7 @@
using namespace std;
using namespace ngraph;
using namespace ngraph::runtime::intelgpu;
string runtime::intelgpu::get_opencl_type_name(const element::Type& ngraph_type)
{
......@@ -1036,17 +1038,16 @@ void runtime::intelgpu::do_dot_operation(cldnn::topology& topology,
topology.add(op_dot);
}
void runtime::intelgpu::do_slice_operation(cldnn::topology& topology,
const string& input_name,
const Shape& input_shape,
const string& output_name,
const Shape& output_shape,
const element::Type& output_type,
const Coordinate& lower_bounds,
const Coordinate& uppper_bounds,
const Strides& strides)
CustomKernels::krnl_info CustomKernels::build_krnl(const shared_ptr<op::Slice>& op) const
{
const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
const string& input_name = op->get_input_tensor_name(0);
const Shape& input_shape = op->get_input_shape(0);
const string& output_name = op->get_output_tensor_name(0);
const Shape& output_shape = op->get_output_shape(0);
const element::Type& output_type = op->get_output_element_type(0);
const Coordinate& lower_bounds = op->get_lower_bounds();
const Coordinate& uppper_bounds = op->get_upper_bounds();
const Strides& strides = op->get_strides();
const string entry_point_name = "slice_" + output_name;
CodeWriter writer;
vector<size_t> gws;
......@@ -1071,15 +1072,14 @@ void runtime::intelgpu::do_slice_operation(cldnn::topology& topology,
}
writer.block_end();
const cldnn::custom_gpu_primitive op_slice(output_name,
{input_name},
{writer.get_code()},
entry_point_name,
get_kernel_args(1, 1),
"",
layout,
gws);
topology.add(op_slice);
const CustomKernelInfo krn_ret(output_name,
output_shape,
output_type,
{input_name},
{writer.get_code()},
entry_point_name,
gws);
return {krn_ret};
}
void runtime::intelgpu::do_concat_operation(cldnn::topology& topology,
......@@ -1225,18 +1225,17 @@ void runtime::intelgpu::do_concat_operation(cldnn::topology& topology,
}
}
void runtime::intelgpu::do_select_operation(cldnn::topology& topology,
const string& input0_name,
const Shape& input0_shape,
const string& input1_name,
const Shape& input1_shape,
const string& input2_name,
const Shape& input2_shape,
const string& output_name,
const Shape& output_shape,
const element::Type& output_type)
CustomKernels::krnl_info CustomKernels::build_krnl(const shared_ptr<op::Select>& op) const
{
const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
const string& input0_name = op->get_input_tensor_name(0);
const Shape& input0_shape = op->get_input_shape(0);
const string& input1_name = op->get_input_tensor_name(1);
const Shape& input1_shape = op->get_input_shape(1);
const string& input2_name = op->get_input_tensor_name(2);
const Shape& input2_shape = op->get_input_shape(2);
const string& output_name = op->get_output_tensor_name(0);
const Shape& output_shape = op->get_output_shape(0);
const element::Type& output_type = op->get_output_element_type(0);
const string entry_point_name = "select_" + output_name;
CodeWriter writer;
vector<size_t> gws;
......@@ -1262,15 +1261,14 @@ void runtime::intelgpu::do_select_operation(cldnn::topology& topology,
}
writer.block_end();
const cldnn::custom_gpu_primitive op_select(output_name,
{input0_name, input1_name, input2_name},
{writer.get_code()},
entry_point_name,
get_kernel_args(3, 1),
"",
layout,
gws);
topology.add(op_select);
const CustomKernelInfo krn_ret(output_name,
output_shape,
output_type,
{input0_name, input1_name, input2_name},
{writer.get_code()},
entry_point_name,
gws);
return {krn_ret};
}
void runtime::intelgpu::do_logic_kernel(cldnn::topology& topology,
......
......@@ -92,16 +92,6 @@ namespace ngraph
const element::Type& output_type,
size_t reduction_axes_count);
void do_slice_operation(cldnn::topology& topology,
const std::string& input_name,
const Shape& input_shape,
const std::string& output_name,
const Shape& output_shape,
const element::Type& output_type,
const Coordinate& lower_bounds,
const Coordinate& uppper_bounds,
const Strides& strides);
void do_concat_operation(cldnn::topology& topology,
const std::vector<std::string>& input_names,
const std::vector<Shape>& input_shapes,
......@@ -110,17 +100,6 @@ namespace ngraph
const element::Type& output_type,
size_t concat_axis);
void do_select_operation(cldnn::topology& topology,
const std::string& input0_name,
const Shape& input0_shape,
const std::string& input1_name,
const Shape& input1_shape,
const std::string& input2_name,
const Shape& input2_shape,
const std::string& output_name,
const Shape& output_shape,
const element::Type& output_type);
void do_logic_kernel(cldnn::topology& topology,
const std::string& input0_name,
const Shape& input0_shape,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment