Commit 67248fdb authored by Sergey Shalnov's avatar Sergey Shalnov Committed by Robert Kimball

IntelGPU backend: Custom kernels refactoring 3 (#2787)

parent 2b13ae40
...@@ -52,7 +52,6 @@ ...@@ -52,7 +52,6 @@
#include "ngraph/runtime/intelgpu/intelgpu_executable.hpp" #include "ngraph/runtime/intelgpu/intelgpu_executable.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_kernels.hpp" #include "ngraph/runtime/intelgpu/intelgpu_kernels.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_layout.hpp" #include "ngraph/runtime/intelgpu/intelgpu_layout.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_batchnorm.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp" #include "ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_tensor_view.hpp" #include "ngraph/runtime/intelgpu/intelgpu_tensor_view.hpp"
#include "ngraph/runtime/intelgpu/visualize_tree.hpp" #include "ngraph/runtime/intelgpu/visualize_tree.hpp"
...@@ -61,6 +60,7 @@ ...@@ -61,6 +60,7 @@
#include "ngraph/function.hpp" #include "ngraph/function.hpp"
#include "ngraph/node.hpp" #include "ngraph/node.hpp"
#include "ngraph/op/all.hpp" #include "ngraph/op/all.hpp"
#include "ngraph/op/and.hpp"
#include "ngraph/op/any.hpp" #include "ngraph/op/any.hpp"
#include "ngraph/op/argmax.hpp" #include "ngraph/op/argmax.hpp"
#include "ngraph/op/argmin.hpp" #include "ngraph/op/argmin.hpp"
...@@ -73,13 +73,20 @@ ...@@ -73,13 +73,20 @@
#include "ngraph/op/dequantize.hpp" #include "ngraph/op/dequantize.hpp"
#include "ngraph/op/dot.hpp" #include "ngraph/op/dot.hpp"
#include "ngraph/op/embedding_lookup.hpp" #include "ngraph/op/embedding_lookup.hpp"
#include "ngraph/op/equal.hpp"
#include "ngraph/op/erf.hpp" #include "ngraph/op/erf.hpp"
#include "ngraph/op/get_output_element.hpp" #include "ngraph/op/get_output_element.hpp"
#include "ngraph/op/greater.hpp"
#include "ngraph/op/greater_eq.hpp"
#include "ngraph/op/less.hpp"
#include "ngraph/op/less_eq.hpp"
#include "ngraph/op/lrn.hpp" #include "ngraph/op/lrn.hpp"
#include "ngraph/op/max.hpp" #include "ngraph/op/max.hpp"
#include "ngraph/op/max_pool.hpp" #include "ngraph/op/max_pool.hpp"
#include "ngraph/op/min.hpp" #include "ngraph/op/min.hpp"
#include "ngraph/op/not_equal.hpp"
#include "ngraph/op/one_hot.hpp" #include "ngraph/op/one_hot.hpp"
#include "ngraph/op/or.hpp"
#include "ngraph/op/pad.hpp" #include "ngraph/op/pad.hpp"
#include "ngraph/op/product.hpp" #include "ngraph/op/product.hpp"
#include "ngraph/op/quantize.hpp" #include "ngraph/op/quantize.hpp"
...@@ -129,25 +136,13 @@ static OP_TYPEID get_typeid(const string& s) ...@@ -129,25 +136,13 @@ static OP_TYPEID get_typeid(const string& s)
return it->second; return it->second;
} }
static void arguments_check(const shared_ptr<Node>& op, size_t input, size_t output)
{
if (op->get_input_size() != input || op->get_output_size() != output)
{
ostringstream os;
os << "Operation \"" << op->description() << "\" input and output sizes mismatch."
<< " Expected input size=" << input << ", provided=" << op->get_input_size()
<< ". Expected output size=" << output << ", provided=" << op->get_output_size();
throw invalid_argument(os.str());
}
}
static void do_eltwise_operation(cldnn::topology& topology, static void do_eltwise_operation(cldnn::topology& topology,
const shared_ptr<Node>& op, const shared_ptr<Node>& op,
const string& custom_op, const string& custom_op,
bool function_operation, bool function_operation,
cldnn::eltwise_mode mode) cldnn::eltwise_mode mode)
{ {
arguments_check(op, 2, 1); runtime::intelgpu::arguments_check(op, 2, 1);
if (op->get_input_element_type(0) != element::f32 || if (op->get_input_element_type(0) != element::f32 ||
op->get_input_element_type(1) != element::f32 || op->get_input_element_type(1) != element::f32 ||
...@@ -180,7 +175,7 @@ static void do_cldnn_unary(cldnn::topology& topology, ...@@ -180,7 +175,7 @@ static void do_cldnn_unary(cldnn::topology& topology,
cldnn_activation_func mode, cldnn_activation_func mode,
const cldnn_activation_additional_params& param = {0.f, 0.f}) const cldnn_activation_additional_params& param = {0.f, 0.f})
{ {
arguments_check(op, 1, 1); runtime::intelgpu::arguments_check(op, 1, 1);
const cldnn::activation cldnn_unary( const cldnn::activation cldnn_unary(
op->get_output_tensor_name(0), op->get_input_tensor_name(0), mode, param); op->get_output_tensor_name(0), op->get_input_tensor_name(0), mode, param);
...@@ -190,7 +185,7 @@ static void do_cldnn_unary(cldnn::topology& topology, ...@@ -190,7 +185,7 @@ static void do_cldnn_unary(cldnn::topology& topology,
static void static void
do_custom_unary(cldnn::topology& topology, const shared_ptr<Node>& op, const string& operation) do_custom_unary(cldnn::topology& topology, const shared_ptr<Node>& op, const string& operation)
{ {
arguments_check(op, 1, 1); runtime::intelgpu::arguments_check(op, 1, 1);
runtime::intelgpu::do_custom_unary_operation(topology, runtime::intelgpu::do_custom_unary_operation(topology,
op->get_input_tensor_name(0), op->get_input_tensor_name(0),
...@@ -209,7 +204,7 @@ static void do_universal_unary(cldnn::topology& topology, ...@@ -209,7 +204,7 @@ static void do_universal_unary(cldnn::topology& topology,
bool force_custom = false, bool force_custom = false,
const cldnn_activation_additional_params& param = {0.f, 0.f}) const cldnn_activation_additional_params& param = {0.f, 0.f})
{ {
arguments_check(op, 1, 1); runtime::intelgpu::arguments_check(op, 1, 1);
if (force_custom || (op->get_input_element_type(0) != element::f32)) if (force_custom || (op->get_input_element_type(0) != element::f32))
{ {
...@@ -228,7 +223,7 @@ static void do_pooling_operation(cldnn::topology& topology, ...@@ -228,7 +223,7 @@ static void do_pooling_operation(cldnn::topology& topology,
const Shape& pad_below, const Shape& pad_below,
const cldnn::pooling_mode mode) const cldnn::pooling_mode mode)
{ {
arguments_check(op, 1, 1); runtime::intelgpu::arguments_check(op, 1, 1);
const cldnn::tensor output_size = intelgpu_space::create_cldnn_tensor(op->get_output_shape(0)); const cldnn::tensor output_size = intelgpu_space::create_cldnn_tensor(op->get_output_shape(0));
const cldnn::tensor input_offset = intelgpu_space::create_cldnn_offset(pad_below); const cldnn::tensor input_offset = intelgpu_space::create_cldnn_offset(pad_below);
...@@ -245,22 +240,12 @@ static void do_pooling_operation(cldnn::topology& topology, ...@@ -245,22 +240,12 @@ static void do_pooling_operation(cldnn::topology& topology,
topology.add(cldnn_pooling); topology.add(cldnn_pooling);
} }
static void do_logical_operation(cldnn::topology& topology, template <typename OP>
const shared_ptr<Node>& op, static void do_logical_operation(runtime::intelgpu::CustomKernels& kern, const shared_ptr<Node>& op)
const string& operation)
{ {
arguments_check(op, 2, 1); runtime::intelgpu::arguments_check(op, 2, 1);
runtime::intelgpu::do_logic_kernel(topology, kern.emit<OP>(static_pointer_cast<OP>(op));
op->get_input_tensor_name(0),
op->get_input_shape(0),
op->get_input_element_type(0),
op->get_input_tensor_name(1),
op->get_input_shape(1),
op->get_output_tensor_name(0),
op->get_output_shape(0),
op->get_output_element_type(0),
operation);
} }
// This function needed to only change the name of the data in topology // This function needed to only change the name of the data in topology
...@@ -1246,42 +1231,42 @@ shared_ptr<runtime::Executable> ...@@ -1246,42 +1231,42 @@ shared_ptr<runtime::Executable>
} }
case OP_TYPEID::Greater: case OP_TYPEID::Greater:
{ {
do_logical_operation(topology, op, " > "); do_logical_operation<op::Greater>(kern, op);
break; break;
} }
case OP_TYPEID::GreaterEq: case OP_TYPEID::GreaterEq:
{ {
do_logical_operation(topology, op, " >= "); do_logical_operation<op::GreaterEq>(kern, op);
break; break;
} }
case OP_TYPEID::Equal: case OP_TYPEID::Equal:
{ {
do_logical_operation(topology, op, " == "); do_logical_operation<op::Equal>(kern, op);
break; break;
} }
case OP_TYPEID::NotEqual: case OP_TYPEID::NotEqual:
{ {
do_logical_operation(topology, op, " != "); do_logical_operation<op::NotEqual>(kern, op);
break; break;
} }
case OP_TYPEID::Less: case OP_TYPEID::Less:
{ {
do_logical_operation(topology, op, " < "); do_logical_operation<op::Less>(kern, op);
break; break;
} }
case OP_TYPEID::LessEq: case OP_TYPEID::LessEq:
{ {
do_logical_operation(topology, op, " <= "); do_logical_operation<op::LessEq>(kern, op);
break; break;
} }
case OP_TYPEID::And: case OP_TYPEID::And:
{ {
do_logical_operation(topology, op, " && "); do_logical_operation<op::And>(kern, op);
break; break;
} }
case OP_TYPEID::Or: case OP_TYPEID::Or:
{ {
do_logical_operation(topology, op, " || "); do_logical_operation<op::Or>(kern, op);
break; break;
} }
case OP_TYPEID::Pad: case OP_TYPEID::Pad:
...@@ -1305,40 +1290,8 @@ shared_ptr<runtime::Executable> ...@@ -1305,40 +1290,8 @@ shared_ptr<runtime::Executable>
{ {
arguments_check(op, 6, 3); arguments_check(op, 6, 3);
const shared_ptr<op::BatchNormTrainingBackprop> batch_norm = kern.emit<op::BatchNormTrainingBackprop>(
static_pointer_cast<op::BatchNormTrainingBackprop>(op); static_pointer_cast<op::BatchNormTrainingBackprop>(op));
const double eps = batch_norm->get_eps_value();
do_create_mean(topology,
op->get_output_tensor_name(2), // d_beta
op->get_output_element_type(2),
op->get_input_tensor_name(5), // delta
op->get_input_shape(5),
true);
do_create_variance_back(topology,
op->get_output_tensor_name(1), // d_gamma
op->get_output_element_type(1),
eps,
op->get_input_tensor_name(2), // input
op->get_input_shape(2),
op->get_input_tensor_name(3), // gamma
op->get_input_tensor_name(4), // beta
op->get_input_tensor_name(5)); // delta
do_batch_norm_backprop_operation(topology,
op->get_input_shape(2),
op->get_input_element_type(2),
op->get_input_tensor_name(0),
op->get_input_tensor_name(1),
op->get_input_tensor_name(2),
op->get_input_tensor_name(3),
op->get_input_tensor_name(4),
op->get_input_tensor_name(5),
eps,
op->get_output_tensor_name(0),
op->get_output_tensor_name(1),
op->get_output_tensor_name(2));
break; break;
} }
case OP_TYPEID::BatchNormInference: case OP_TYPEID::BatchNormInference:
...@@ -1367,16 +1320,7 @@ shared_ptr<runtime::Executable> ...@@ -1367,16 +1320,7 @@ shared_ptr<runtime::Executable>
if (proceed_with_custom_kernel || (op->get_input_shape(2).size() != 4) || if (proceed_with_custom_kernel || (op->get_input_shape(2).size() != 4) ||
(op->get_input_element_type(0) != ngraph::element::f32)) (op->get_input_element_type(0) != ngraph::element::f32))
{ {
do_batch_norm_operation(topology, kern.emit<op::BatchNormInference>(bnorm);
op->get_output_tensor_name(0),
op->get_output_element_type(0),
eps,
op->get_input_tensor_name(2),
op->get_input_shape(2),
op->get_input_tensor_name(0),
op->get_input_tensor_name(1),
op->get_input_tensor_name(3),
op->get_input_tensor_name(4));
} }
else else
{ {
...@@ -1400,61 +1344,7 @@ shared_ptr<runtime::Executable> ...@@ -1400,61 +1344,7 @@ shared_ptr<runtime::Executable>
if ((op->get_input_shape(2).size() != 4) || if ((op->get_input_shape(2).size() != 4) ||
(op->get_input_element_type(0) != ngraph::element::f32)) (op->get_input_element_type(0) != ngraph::element::f32))
{ {
string mean_name; kern.emit<op::BatchNormTraining>(bnorm);
string variance_name;
if (op->get_inputs().size() < 3 || op->get_outputs().empty())
{
arguments_check(op, 3, 1); // throw exception in this case
}
if (op->get_outputs().size() == 3)
{
arguments_check(op, 3, 3);
mean_name = op->get_output_tensor_name(1);
variance_name = op->get_output_tensor_name(2);
do_create_mean(topology,
mean_name,
op->get_output_element_type(0),
op->get_input_tensor_name(2),
op->get_input_shape(2),
false);
do_create_variance(topology,
variance_name,
op->get_output_element_type(0),
op->get_input_tensor_name(2),
op->get_input_shape(2),
mean_name);
}
if (op->get_outputs().size() == 1 || op->get_outputs().size() == 3)
{
if (mean_name.empty() || variance_name.empty())
{
arguments_check(op, 5, 1);
mean_name = op->get_input_tensor_name(3);
variance_name = op->get_input_tensor_name(4);
}
do_batch_norm_operation(topology,
op->get_output_tensor_name(0),
op->get_output_element_type(0),
eps,
op->get_input_tensor_name(2),
op->get_input_shape(2),
op->get_input_tensor_name(0),
op->get_input_tensor_name(1),
mean_name,
variance_name);
}
else
{
arguments_check(op, 5, 1); // throw exception in this case
}
} }
else else
{ {
......
...@@ -44,3 +44,15 @@ void runtime::intelgpu::CustomKernels::queue_krnl(const krnl_info& krnl_info, ...@@ -44,3 +44,15 @@ void runtime::intelgpu::CustomKernels::queue_krnl(const krnl_info& krnl_info,
stream.add(kernel_item); stream.add(kernel_item);
} }
} }
void runtime::intelgpu::arguments_check(const shared_ptr<Node>& op, size_t input, size_t output)
{
if (op->get_input_size() != input || op->get_output_size() != output)
{
ostringstream os;
os << "Operation \"" << op->description() << "\" input and output sizes mismatch."
<< " Expected input size=" << input << ", provided=" << op->get_input_size()
<< ". Expected output size=" << output << ", provided=" << op->get_output_size();
throw invalid_argument(os.str());
}
}
...@@ -24,11 +24,20 @@ ...@@ -24,11 +24,20 @@
#include "ngraph/node.hpp" #include "ngraph/node.hpp"
#include "ngraph/op/all.hpp" #include "ngraph/op/all.hpp"
#include "ngraph/op/and.hpp"
#include "ngraph/op/any.hpp" #include "ngraph/op/any.hpp"
#include "ngraph/op/batch_norm.hpp"
#include "ngraph/op/broadcast.hpp" #include "ngraph/op/broadcast.hpp"
#include "ngraph/op/convolution.hpp" #include "ngraph/op/convolution.hpp"
#include "ngraph/op/equal.hpp"
#include "ngraph/op/greater.hpp"
#include "ngraph/op/greater_eq.hpp"
#include "ngraph/op/less.hpp"
#include "ngraph/op/less_eq.hpp"
#include "ngraph/op/max.hpp" #include "ngraph/op/max.hpp"
#include "ngraph/op/min.hpp" #include "ngraph/op/min.hpp"
#include "ngraph/op/not_equal.hpp"
#include "ngraph/op/or.hpp"
#include "ngraph/op/product.hpp" #include "ngraph/op/product.hpp"
#include "ngraph/op/select.hpp" #include "ngraph/op/select.hpp"
#include "ngraph/op/slice.hpp" #include "ngraph/op/slice.hpp"
...@@ -43,6 +52,8 @@ namespace ngraph ...@@ -43,6 +52,8 @@ namespace ngraph
{ {
class CustomKernelInfo; class CustomKernelInfo;
class CustomKernels; class CustomKernels;
void arguments_check(const std::shared_ptr<Node>& op, size_t input, size_t output);
} }
} }
} }
...@@ -107,13 +118,24 @@ private: ...@@ -107,13 +118,24 @@ private:
void queue_krnl(const krnl_info& krn_info, const std::shared_ptr<Node>& op); void queue_krnl(const krnl_info& krn_info, const std::shared_ptr<Node>& op);
krnl_info build_krnl(const std::shared_ptr<op::All>& op) const; krnl_info build_krnl(const std::shared_ptr<op::All>& op) const;
krnl_info build_krnl(const std::shared_ptr<op::And>& op) const;
krnl_info build_krnl(const std::shared_ptr<op::Any>& op) const; krnl_info build_krnl(const std::shared_ptr<op::Any>& op) const;
krnl_info build_krnl(const std::shared_ptr<op::BatchNormInference>& op) const;
krnl_info build_krnl(const std::shared_ptr<op::BatchNormTraining>& op) const;
krnl_info build_krnl(const std::shared_ptr<op::BatchNormTrainingBackprop>& op) const;
krnl_info build_krnl(const std::shared_ptr<op::Broadcast>& op) const; krnl_info build_krnl(const std::shared_ptr<op::Broadcast>& op) const;
krnl_info build_krnl(const std::shared_ptr<op::Convolution>& op) const; krnl_info build_krnl(const std::shared_ptr<op::Convolution>& op) const;
krnl_info build_krnl(const std::shared_ptr<op::ConvolutionBackpropData>& op) const; krnl_info build_krnl(const std::shared_ptr<op::ConvolutionBackpropData>& op) const;
krnl_info build_krnl(const std::shared_ptr<op::ConvolutionBackpropFilters>& op) const; krnl_info build_krnl(const std::shared_ptr<op::ConvolutionBackpropFilters>& op) const;
krnl_info build_krnl(const std::shared_ptr<op::Equal>& op) const;
krnl_info build_krnl(const std::shared_ptr<op::Greater>& op) const;
krnl_info build_krnl(const std::shared_ptr<op::GreaterEq>& op) const;
krnl_info build_krnl(const std::shared_ptr<op::Less>& op) const;
krnl_info build_krnl(const std::shared_ptr<op::LessEq>& op) const;
krnl_info build_krnl(const std::shared_ptr<op::Max>& op) const; krnl_info build_krnl(const std::shared_ptr<op::Max>& op) const;
krnl_info build_krnl(const std::shared_ptr<op::Min>& op) const; krnl_info build_krnl(const std::shared_ptr<op::Min>& op) const;
krnl_info build_krnl(const std::shared_ptr<op::NotEqual>& op) const;
krnl_info build_krnl(const std::shared_ptr<op::Or>& op) const;
krnl_info build_krnl(const std::shared_ptr<op::Product>& op) const; krnl_info build_krnl(const std::shared_ptr<op::Product>& op) const;
krnl_info build_krnl(const std::shared_ptr<op::Select>& op) const; krnl_info build_krnl(const std::shared_ptr<op::Select>& op) const;
krnl_info build_krnl(const std::shared_ptr<op::Slice>& op) const; krnl_info build_krnl(const std::shared_ptr<op::Slice>& op) const;
......
...@@ -14,21 +14,15 @@ ...@@ -14,21 +14,15 @@
// limitations under the License. // limitations under the License.
//***************************************************************************** //*****************************************************************************
#include <CPP/batch_norm.hpp>
#include <CPP/concatenation.hpp>
#include <CPP/custom_gpu_primitive.hpp>
#include <CPP/scale.hpp>
#include <CPP/split.hpp>
#include "ngraph/code_writer.hpp" #include "ngraph/code_writer.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_layout.hpp" #include "ngraph/runtime/intelgpu/intelgpu_kernels.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_batchnorm.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp" #include "ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp"
#include "ngraph/op/batch_norm.hpp" #include "ngraph/op/batch_norm.hpp"
using namespace std; using namespace std;
using namespace ngraph; using namespace ngraph;
using namespace ngraph::runtime::intelgpu;
// According to the documentation, input data channel is always being axis 1 // According to the documentation, input data channel is always being axis 1
// Assumed the second dimension from the left. Example {0, 1, 0, 0} or {0, 1} // Assumed the second dimension from the left. Example {0, 1, 0, 0} or {0, 1}
...@@ -39,9 +33,8 @@ static Shape get_channel_shape(const Shape& shape, const string& function_name) ...@@ -39,9 +33,8 @@ static Shape get_channel_shape(const Shape& shape, const string& function_name)
{ {
if (shape.size() < channel_axis + 1) if (shape.size() < channel_axis + 1)
{ {
const string err = "intelgpu::" + function_name + "() input_shape" + const string err = "intelgpu::" + function_name + "() input_shape" + array_dims(shape) +
runtime::intelgpu::array_dims(shape) + " should be at least " + " should be at least " + to_string(channel_axis + 1) + "D.";
to_string(channel_axis + 1) + "D.";
throw invalid_argument(err); throw invalid_argument(err);
} }
...@@ -53,15 +46,14 @@ static size_t get_idx_size(const Shape& shape, size_t pos) ...@@ -53,15 +46,14 @@ static size_t get_idx_size(const Shape& shape, size_t pos)
return accumulate(shape.cbegin() + pos, shape.cend(), 1, multiplies<size_t>()); return accumulate(shape.cbegin() + pos, shape.cend(), 1, multiplies<size_t>());
} }
void runtime::intelgpu::do_create_mean(cldnn::topology& topology, // This creates mean of the input matrix by Channel axis
const string& output_name, static CustomKernels::krnl_info do_create_mean(const string& output_name,
const element::Type& output_type, const element::Type& output_type,
const string& input_name, const string& input_name,
const Shape& input_shape, const Shape& input_shape,
bool backward) bool backward)
{ {
const Shape channel_shape = get_channel_shape(input_shape, "create_mean"); const Shape channel_shape = get_channel_shape(input_shape, "create_mean");
const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, channel_shape);
const string entry_point_name = "create_mean_" + output_name; const string entry_point_name = "create_mean_" + output_name;
const size_t output_counts = shape_size<Shape>(input_shape) / input_shape.at(channel_axis); const size_t output_counts = shape_size<Shape>(input_shape) / input_shape.at(channel_axis);
const string kernel_data_type = get_opencl_type_name(output_type); const string kernel_data_type = get_opencl_type_name(output_type);
...@@ -118,26 +110,23 @@ void runtime::intelgpu::do_create_mean(cldnn::topology& topology, ...@@ -118,26 +110,23 @@ void runtime::intelgpu::do_create_mean(cldnn::topology& topology,
} // Main function body } // Main function body
writer.block_end(); writer.block_end();
const cldnn::custom_gpu_primitive op_mean(output_name, const CustomKernelInfo op_bcast_sum(output_name,
{input_name}, channel_shape,
{writer.get_code()}, output_type,
entry_point_name, {input_name},
get_kernel_args(1, 1), {writer.get_code()},
"", entry_point_name);
layout, return {op_bcast_sum};
{1});
topology.add(op_mean);
} }
void runtime::intelgpu::do_create_variance(cldnn::topology& topology, // This creates variance of the input matrix by Channel axis
const string& output_name, static CustomKernels::krnl_info do_create_variance(const string& output_name,
const element::Type& output_type, const element::Type& output_type,
const string& input_name, const string& input_name,
const Shape& input_shape, const Shape& input_shape,
const std::string& mean_name) const std::string& mean_name)
{ {
const Shape channel_shape = get_channel_shape(input_shape, "create_variance"); const Shape channel_shape = get_channel_shape(input_shape, "create_variance");
const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, channel_shape);
const string entry_point_name = "create_variance_" + output_name; const string entry_point_name = "create_variance_" + output_name;
const size_t output_counts = shape_size<Shape>(input_shape) / input_shape.at(channel_axis); const size_t output_counts = shape_size<Shape>(input_shape) / input_shape.at(channel_axis);
const string kernel_data_type = get_opencl_type_name(output_type); const string kernel_data_type = get_opencl_type_name(output_type);
...@@ -194,30 +183,26 @@ void runtime::intelgpu::do_create_variance(cldnn::topology& topology, ...@@ -194,30 +183,26 @@ void runtime::intelgpu::do_create_variance(cldnn::topology& topology,
} // Main function body } // Main function body
writer.block_end(); writer.block_end();
const cldnn::custom_gpu_primitive op_variance(output_name, const CustomKernelInfo op_variance(output_name,
{input_name, mean_name}, channel_shape,
{writer.get_code()}, output_type,
entry_point_name, {input_name, mean_name},
get_kernel_args(2, 1), {writer.get_code()},
"", entry_point_name);
layout, return {op_variance};
{1});
topology.add(op_variance);
} }
void runtime::intelgpu::do_batch_norm_operation(cldnn::topology& topology, static CustomKernels::krnl_info do_batch_norm_operation(const string& output_name,
const string& output_name, const element::Type& output_type,
const element::Type& output_type, double eps,
double eps, const string& input_name,
const string& input_name, const Shape& input_shape,
const Shape& input_shape, const string& gamma_name,
const string& gamma_name, const string& beta_name,
const string& beta_name, const string& mean_name_inp,
const string& mean_name_inp, const string& variance_name_inp)
const string& variance_name_inp)
{ {
const Shape channel_shape = get_channel_shape(input_shape, "batch_norm"); const Shape channel_shape = get_channel_shape(input_shape, "batch_norm");
const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, input_shape);
const vector<size_t> gws(input_shape.begin(), input_shape.begin() + 2); const vector<size_t> gws(input_shape.begin(), input_shape.begin() + 2);
const string entry_point_name = "batch_norm_" + output_name; const string entry_point_name = "batch_norm_" + output_name;
const string kernel_data_type = get_opencl_type_name(output_type); const string kernel_data_type = get_opencl_type_name(output_type);
...@@ -265,32 +250,30 @@ void runtime::intelgpu::do_batch_norm_operation(cldnn::topology& topology, ...@@ -265,32 +250,30 @@ void runtime::intelgpu::do_batch_norm_operation(cldnn::topology& topology,
} // Main function body } // Main function body
writer.block_end(); writer.block_end();
const vector<cldnn::primitive_id>& inputs = { const vector<string>& inputs = {
input_name, gamma_name, beta_name, mean_name_inp, variance_name_inp}; input_name, gamma_name, beta_name, mean_name_inp, variance_name_inp};
const cldnn::custom_gpu_primitive op_batch_norm(output_name, const CustomKernelInfo op_batch_norm(output_name,
inputs, input_shape,
{writer.get_code()}, output_type,
entry_point_name, inputs,
get_kernel_args(5, 1), {writer.get_code()},
"", entry_point_name,
layout, gws,
gws, {1, 1, 1});
{1, 1, 1}); return {op_batch_norm};
topology.add(op_batch_norm);
} }
void runtime::intelgpu::do_create_variance_back(cldnn::topology& topology, // This creates variance backprop of the input matrix by Channel axis
const string& output_name, static CustomKernels::krnl_info do_create_variance_back(const string& output_name,
const element::Type& output_type, const element::Type& output_type,
double eps, double eps,
const string& input_name, const string& input_name,
const Shape& input_shape, const Shape& input_shape,
const string& mean_name, const string& mean_name,
const string& variance_name, const string& variance_name,
const string& delta_name) const string& delta_name)
{ {
const Shape channel_shape = get_channel_shape(input_shape, "create_variance_back"); const Shape channel_shape = get_channel_shape(input_shape, "create_variance_back");
const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, channel_shape);
const string entry_point_name = "create_variance_back_" + output_name; const string entry_point_name = "create_variance_back_" + output_name;
const string kernel_data_type = get_opencl_type_name(output_type); const string kernel_data_type = get_opencl_type_name(output_type);
CodeWriter writer; CodeWriter writer;
...@@ -343,34 +326,34 @@ void runtime::intelgpu::do_create_variance_back(cldnn::topology& topology, ...@@ -343,34 +326,34 @@ void runtime::intelgpu::do_create_variance_back(cldnn::topology& topology,
} // Main function body } // Main function body
writer.block_end(); writer.block_end();
const vector<cldnn::primitive_id>& inputs = {input_name, delta_name, mean_name, variance_name}; const vector<string>& inputs = {input_name, delta_name, mean_name, variance_name};
const cldnn::custom_gpu_primitive op_create_variance_back(output_name, const CustomKernelInfo op_create_variance_back(output_name,
inputs, channel_shape,
{writer.get_code()}, output_type,
entry_point_name, inputs,
get_kernel_args(4, 1), {writer.get_code()},
"", entry_point_name,
layout, gws);
gws); return {op_create_variance_back};
topology.add(op_create_variance_back);
} }
void runtime::intelgpu::do_batch_norm_backprop_operation(cldnn::topology& topology, // This function uses "shape" parameter as input or output Shape
const Shape& shape, // Shape of all other calculated as first axis from the left
const element::Type& type, // Example: output[ 4, 3, 2, 8 ] means out_gamma[ 3 ]
const string& gamma_name, static CustomKernels::krnl_info do_batch_norm_backprop_operation(const Shape& shape,
const string& beta_name, const element::Type& type,
const string& input_name, const string& gamma_name,
const string& mean_name, const string& beta_name,
const string& variance_name, const string& input_name,
const string& delta_name, const string& mean_name,
double eps, const string& variance_name,
const string& output_name, const string& delta_name,
const string& output_gamma_name, double eps,
const string& output_beta_name) const string& output_name,
const string& output_gamma_name,
const string& output_beta_name)
{ {
const Shape channel_shape = get_channel_shape(shape, "batch_norm_backprop"); const Shape channel_shape = get_channel_shape(shape, "batch_norm_backprop");
const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(type, shape);
const string entry_point_name = "batch_norm_backprop_" + output_name; const string entry_point_name = "batch_norm_backprop_" + output_name;
const size_t r_axes_size = shape_size(shape) / shape_size(channel_shape); const size_t r_axes_size = shape_size(shape) / shape_size(channel_shape);
const string kernel_data_type = get_opencl_type_name(type); const string kernel_data_type = get_opencl_type_name(type);
...@@ -391,7 +374,7 @@ void runtime::intelgpu::do_batch_norm_backprop_operation(cldnn::topology& topolo ...@@ -391,7 +374,7 @@ void runtime::intelgpu::do_batch_norm_backprop_operation(cldnn::topology& topolo
{ // Main function body { // Main function body
// Main loops // Main loops
gws = runtime::intelgpu::generate_loops(writer, shape, true); gws = generate_loops(writer, shape, true);
writer << kernel_data_type << " stddev = sqrt(variance[i" << channel_axis << "] + " << eps writer << kernel_data_type << " stddev = sqrt(variance[i" << channel_axis << "] + " << eps
<< ");\n"; << ");\n";
...@@ -404,25 +387,139 @@ void runtime::intelgpu::do_batch_norm_backprop_operation(cldnn::topology& topolo ...@@ -404,25 +387,139 @@ void runtime::intelgpu::do_batch_norm_backprop_operation(cldnn::topology& topolo
<< channel_axis << "]) / " << r_axes_size << ");\n"; << channel_axis << "]) / " << r_axes_size << ");\n";
// Closing brackets for main loops // Closing brackets for main loops
runtime::intelgpu::generate_loops(writer, shape, false); generate_loops(writer, shape, false);
} // Main function body } // Main function body
writer.block_end(); writer.block_end();
const vector<cldnn::primitive_id>& inputs = {input_name, const vector<string>& inputs = {input_name,
delta_name, delta_name,
mean_name, mean_name,
variance_name, variance_name,
gamma_name, gamma_name,
output_gamma_name, output_gamma_name,
output_beta_name}; output_beta_name};
const cldnn::custom_gpu_primitive op_batch_norm_backprop(output_name, const CustomKernelInfo op_batch_norm_backprop(
inputs, output_name, shape, type, inputs, {writer.get_code()}, entry_point_name, gws);
{writer.get_code()}, return {op_batch_norm_backprop};
entry_point_name, }
get_kernel_args(7, 1),
"", CustomKernels::krnl_info
layout, CustomKernels::build_krnl(const shared_ptr<op::BatchNormInference>& op) const
gws); {
topology.add(op_batch_norm_backprop); return do_batch_norm_operation(op->get_output_tensor_name(0),
op->get_output_element_type(0),
op->get_eps_value(),
op->get_input_tensor_name(2),
op->get_input_shape(2),
op->get_input_tensor_name(0),
op->get_input_tensor_name(1),
op->get_input_tensor_name(3),
op->get_input_tensor_name(4));
}
CustomKernels::krnl_info
CustomKernels::build_krnl(const shared_ptr<op::BatchNormTraining>& op) const
{
CustomKernels::krnl_info result;
string mean_name;
string variance_name;
if (op->get_inputs().size() < 3 || op->get_outputs().empty())
{
arguments_check(op, 3, 1); // throw exception in this case
}
if (op->get_outputs().size() == 3)
{
arguments_check(op, 3, 3);
mean_name = op->get_output_tensor_name(1);
variance_name = op->get_output_tensor_name(2);
CustomKernels::krnl_info mean = do_create_mean(mean_name,
op->get_output_element_type(0),
op->get_input_tensor_name(2),
op->get_input_shape(2),
false);
result.insert(result.end(), mean.begin(), mean.end());
CustomKernels::krnl_info variance = do_create_variance(variance_name,
op->get_output_element_type(0),
op->get_input_tensor_name(2),
op->get_input_shape(2),
mean_name);
result.insert(result.end(), variance.begin(), variance.end());
}
if (op->get_outputs().size() == 1 || op->get_outputs().size() == 3)
{
if (mean_name.empty() || variance_name.empty())
{
arguments_check(op, 5, 1);
mean_name = op->get_input_tensor_name(3);
variance_name = op->get_input_tensor_name(4);
}
CustomKernels::krnl_info batch_norm =
do_batch_norm_operation(op->get_output_tensor_name(0),
op->get_output_element_type(0),
op->get_eps_value(),
op->get_input_tensor_name(2),
op->get_input_shape(2),
op->get_input_tensor_name(0),
op->get_input_tensor_name(1),
mean_name,
variance_name);
result.insert(result.end(), batch_norm.begin(), batch_norm.end());
}
else
{
arguments_check(op, 5, 1); // throw exception in this case
}
return result;
}
CustomKernels::krnl_info
CustomKernels::build_krnl(const shared_ptr<op::BatchNormTrainingBackprop>& op) const
{
CustomKernels::krnl_info result;
CustomKernels::krnl_info mean = do_create_mean(op->get_output_tensor_name(2), // d_beta
op->get_output_element_type(2),
op->get_input_tensor_name(5), // delta
op->get_input_shape(5),
true);
result.insert(result.end(), mean.begin(), mean.end());
CustomKernels::krnl_info variance =
do_create_variance_back(op->get_output_tensor_name(1), // d_gamma
op->get_output_element_type(1),
op->get_eps_value(),
op->get_input_tensor_name(2), // input
op->get_input_shape(2),
op->get_input_tensor_name(3), // gamma
op->get_input_tensor_name(4), // beta
op->get_input_tensor_name(5)); // delta
result.insert(result.end(), variance.begin(), variance.end());
CustomKernels::krnl_info batch_norm =
do_batch_norm_backprop_operation(op->get_input_shape(2),
op->get_input_element_type(2),
op->get_input_tensor_name(0),
op->get_input_tensor_name(1),
op->get_input_tensor_name(2),
op->get_input_tensor_name(3),
op->get_input_tensor_name(4),
op->get_input_tensor_name(5),
op->get_eps_value(),
op->get_output_tensor_name(0),
op->get_output_tensor_name(1),
op->get_output_tensor_name(2));
result.insert(result.end(), batch_norm.begin(), batch_norm.end());
return result;
} }
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <CPP/topology.hpp>
#include "ngraph/shape.hpp"
#include "ngraph/type/element_type.hpp"
namespace ngraph
{
namespace runtime
{
namespace intelgpu
{
// This implements BatchNorm nGraph operation
// nGraph uses channels in this operation but clDNN uses full input data
void do_batch_norm_operation(cldnn::topology& topology,
const std::string& output_name,
const element::Type& output_type,
double eps,
const std::string& input_name,
const Shape& input_shape,
const std::string& gamma_name,
const std::string& beta_name,
const std::string& mean_name,
const std::string& variance_name);
// This creates mean of the input matrix by Channel axis
void do_create_mean(cldnn::topology& topology,
const std::string& output_name,
const element::Type& output_type,
const std::string& input_name,
const Shape& input_shape,
bool backward);
// This creates variance of the input matrix by Channel axis
void do_create_variance(cldnn::topology& topology,
const std::string& output_name,
const element::Type& output_type,
const std::string& input_name,
const Shape& input_shape,
const std::string& mean_name);
// This creates variance backprop of the input matrix by Channel axis
void do_create_variance_back(cldnn::topology& topology,
const std::string& output_name,
const element::Type& output_type,
double eps,
const std::string& input_name,
const Shape& input_shape,
const std::string& mean_name,
const std::string& variance_name,
const std::string& delta_name);
// This function uses "shape" parameter as input or output Shape
// Shape of all other calculated as first axis from the left
// Example: output[ 4, 3, 2, 8 ] means out_gamma[ 3 ]
void do_batch_norm_backprop_operation(cldnn::topology& topology,
const Shape& shape,
const element::Type& type,
const std::string& gamma_name,
const std::string& beta_name,
const std::string& input_name,
const std::string& mean_name,
const std::string& variance_name,
const std::string& delta_name,
double eps,
const std::string& output_name,
const std::string& output_gamma_name,
const std::string& output_beta_name);
}
}
}
...@@ -1277,18 +1277,16 @@ CustomKernels::krnl_info CustomKernels::build_krnl(const shared_ptr<op::Select>& ...@@ -1277,18 +1277,16 @@ CustomKernels::krnl_info CustomKernels::build_krnl(const shared_ptr<op::Select>&
return {krn_ret}; return {krn_ret};
} }
void runtime::intelgpu::do_logic_kernel(cldnn::topology& topology, static CustomKernels::krnl_info do_logic_kernel(const shared_ptr<Node>& op, const string& operation)
const string& input0_name,
const Shape& input0_shape,
const element::Type& input0_type,
const string& input1_name,
const Shape& input1_shape,
const string& output_name,
const Shape& output_shape,
const element::Type& output_type,
const string& operation)
{ {
const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape); const string& input0_name = op->get_input_tensor_name(0);
const Shape& input0_shape = op->get_input_shape(0);
const element::Type& input0_type = op->get_input_element_type(0);
const string& input1_name = op->get_input_tensor_name(1);
const Shape& input1_shape = op->get_input_shape(1);
const string& output_name = op->get_output_tensor_name(0);
const Shape& output_shape = op->get_output_shape(0);
const element::Type& output_type = op->get_output_element_type(0);
const string entry_point_name = "logic_" + output_name; const string entry_point_name = "logic_" + output_name;
CodeWriter writer; CodeWriter writer;
vector<size_t> gws; vector<size_t> gws;
...@@ -1313,15 +1311,14 @@ void runtime::intelgpu::do_logic_kernel(cldnn::topology& topology, ...@@ -1313,15 +1311,14 @@ void runtime::intelgpu::do_logic_kernel(cldnn::topology& topology,
} }
writer.block_end(); writer.block_end();
const cldnn::custom_gpu_primitive op_logical(output_name, const CustomKernelInfo op_logical(output_name,
{input0_name, input1_name}, output_shape,
{writer.get_code()}, output_type,
entry_point_name, {input0_name, input1_name},
get_kernel_args(2, 1), {writer.get_code()},
"", entry_point_name,
layout, gws);
gws); return {op_logical};
topology.add(op_logical);
} }
void runtime::intelgpu::do_eltwise_kernel(cldnn::topology& topology, void runtime::intelgpu::do_eltwise_kernel(cldnn::topology& topology,
...@@ -2333,3 +2330,43 @@ size_t runtime::intelgpu::get_max_memory_rss() ...@@ -2333,3 +2330,43 @@ size_t runtime::intelgpu::get_max_memory_rss()
return result; return result;
} }
CustomKernels::krnl_info CustomKernels::build_krnl(const shared_ptr<op::And>& op) const
{
return do_logic_kernel(op, " && ");
}
CustomKernels::krnl_info CustomKernels::build_krnl(const shared_ptr<op::Equal>& op) const
{
return do_logic_kernel(op, " == ");
}
CustomKernels::krnl_info CustomKernels::build_krnl(const shared_ptr<op::Greater>& op) const
{
return do_logic_kernel(op, " > ");
}
CustomKernels::krnl_info CustomKernels::build_krnl(const shared_ptr<op::GreaterEq>& op) const
{
return do_logic_kernel(op, " >= ");
}
CustomKernels::krnl_info CustomKernels::build_krnl(const shared_ptr<op::Less>& op) const
{
return do_logic_kernel(op, " < ");
}
CustomKernels::krnl_info CustomKernels::build_krnl(const shared_ptr<op::LessEq>& op) const
{
return do_logic_kernel(op, " <= ");
}
CustomKernels::krnl_info CustomKernels::build_krnl(const shared_ptr<op::NotEqual>& op) const
{
return do_logic_kernel(op, " != ");
}
CustomKernels::krnl_info CustomKernels::build_krnl(const shared_ptr<op::Or>& op) const
{
return do_logic_kernel(op, " || ");
}
...@@ -100,17 +100,6 @@ namespace ngraph ...@@ -100,17 +100,6 @@ namespace ngraph
const element::Type& output_type, const element::Type& output_type,
size_t concat_axis); size_t concat_axis);
void do_logic_kernel(cldnn::topology& topology,
const std::string& input0_name,
const Shape& input0_shape,
const element::Type& input0_type,
const std::string& input1_name,
const Shape& input1_shape,
const std::string& output_name,
const Shape& output_shape,
const element::Type& output_type,
const std::string& operation);
void do_eltwise_kernel(cldnn::topology& topology, void do_eltwise_kernel(cldnn::topology& topology,
const std::string& input0_name, const std::string& input0_name,
const Shape& input0_shape, const Shape& input0_shape,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment