Commit 45b50d06 authored by shssf, committed by Robert Kimball

IntelGPU backend: BatchNorm operation completely redeveloped (#1318)

parent 39278e7d
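For reference: the commit replaces the old split / cldnn::batch_norm / scale / concatenation pipeline with generated OpenCL kernels that compute standard per-channel batch normalization directly. Reading the kernel bodies below, with N the number of elements per channel (output_counts in the code), each channel c computes

\mu_c = \frac{1}{N}\sum_{i \in c} x_i, \qquad
\sigma_c^2 = \frac{1}{N}\sum_{i \in c}(x_i - \mu_c)^2, \qquad
y_i = \gamma_c\,\frac{x_i - \mu_c}{\sqrt{\sigma_c^2 + \varepsilon}} + \beta_c

Note that the variance is the population variance (division by N, not N - 1), and eps is added under the square root before the division.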
@@ -533,35 +533,58 @@ bool runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
             }
             const string& output_name = op->get_outputs().begin()->get_tensor().get_name();
+            const Shape& output_shape = op->get_outputs().begin()->get_shape();
+            const element::Type& output_type =
+                op->get_outputs().begin()->get_tensor().get_element_type();
             const string& gamma_name = op->get_inputs().at(0).get_tensor().get_name();
+            const Shape& gamma_shape = op->get_inputs().at(0).get_shape();
             const string& beta_name = op->get_inputs().at(1).get_tensor().get_name();
             const string& input_name = op->get_inputs().at(2).get_tensor().get_name();
             const Shape& input_shape = op->get_inputs().at(2).get_shape();
+
+            string mean_name;
+            string variance_name;
+
+            if (op->get_outputs().size() == 3)
+            {
+                arguments_check(op, 3, 3);
+
+                mean_name = op->get_outputs().at(1).get_tensor().get_name();
+                variance_name = op->get_outputs().at(2).get_tensor().get_name();
+
+                do_create_mean(
+                    topology, mean_name, gamma_shape, output_type, input_name, input_shape);
+                do_create_variance(topology,
+                                   variance_name,
+                                   gamma_shape,
+                                   output_type,
+                                   input_name,
+                                   input_shape,
+                                   mean_name);
+            }
+
-            if (op->get_outputs().size() == 1)
+            if (op->get_outputs().size() == 1 || op->get_outputs().size() == 3)
             {
-                arguments_check(op, 5, 1);
-                const string& mean_name = op->get_inputs().at(3).get_tensor().get_name();
-                const string& variance_name = op->get_inputs().at(4).get_tensor().get_name();
+                if (mean_name.empty() || variance_name.empty())
+                {
+                    arguments_check(op, 5, 1);
+
+                    mean_name = op->get_inputs().at(3).get_tensor().get_name();
+                    variance_name = op->get_inputs().at(4).get_tensor().get_name();
+                }

                 do_batch_norm_operation(topology,
                                         output_name,
+                                        output_shape,
+                                        output_type,
                                         eps,
                                         input_name,
                                         input_shape,
                                         gamma_name,
+                                        gamma_shape,
                                         beta_name,
                                         mean_name,
                                         variance_name);
             }
-            else if (op->get_outputs().size() == 3)
-            {
-                arguments_check(op, 3, 3);
-                do_batch_norm_operation(
-                    topology, output_name, eps, input_name, input_shape, gamma_name, beta_name);
-            }
             else
             {
                 arguments_check(op, 5, 1); // throw exception in this case
...
@@ -16,9 +16,11 @@
 #include <CPP/batch_norm.hpp>
 #include <CPP/concatenation.hpp>
+#include <CPP/custom_gpu_primitive.hpp>
 #include <CPP/scale.hpp>
 #include <CPP/split.hpp>

+#include "ngraph/runtime/intelgpu/code_writer.hpp"
 #include "ngraph/runtime/intelgpu/intelgpu_layout.hpp"
 #include "ngraph/runtime/intelgpu/intelgpu_op_batchnorm.hpp"
@@ -27,109 +29,293 @@
 using namespace std;
 using namespace ngraph;

-static string do_matrix_split(cldnn::topology& topology,
-                              const string& name,
-                              const vector<pair<cldnn::primitive_id, cldnn::tensor>>& offsets)
+static vector<cldnn_arg> parameters_1inp_1out = {{arg_input, 0}, {arg_output, 0}};
+static vector<cldnn_arg> parameters_2inp_1out = {{arg_input, 0}, {arg_input, 1}, {arg_output, 0}};
+static vector<cldnn_arg> parameters_5inp_1out = {{arg_input, 0},
+                                                 {arg_input, 1},
+                                                 {arg_input, 2},
+                                                 {arg_input, 3},
+                                                 {arg_input, 4},
+                                                 {arg_output, 0}};
+
+static string array_dims(const Shape& dimentions)
 {
-    const string result = name + "_split";
-
-    const cldnn::split op_split(result, name, offsets);
-    topology.add(op_split);
-    return result;
+    string buffer;
+
+    for (auto const& dim : dimentions)
+    {
+        buffer += "[" + to_string(dim) + "]";
+    }
+
+    return buffer;
 }

-static string get_batch_norm_mean(cldnn::topology& topology, const string& input_name)
+static string access_dims(const Shape& dimentions, const AxisSet& axis = {})
 {
-    throw invalid_argument(
-        "intelgpu::get_batch_norm_mean() Calculation matrix mean is not yet supported.");
+    size_t var_idx = 0;
+    string buffer;
+
+    for (auto const& i : dimentions)
+    {
+        if (axis.find(var_idx) == axis.end())
+        {
+            buffer += "[i" + to_string(var_idx) + "]";
+        }
+        ++var_idx;
+    }
+
+    return buffer;
 }
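As a concrete (hypothetical) illustration of the two helpers: for a Shape of {2, 3, 4, 5}, array_dims returns "[2][3][4][5]" (used to declare the kernel parameters), access_dims with the default empty axis set returns "[i0][i1][i2][i3]" (used to index them), and access_dims with axis = {1} would skip the channel index and return "[i0][i2][i3]". The shape values are assumptions for illustration, not values from the commit.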
-static string get_batch_norm_variance(cldnn::topology& topology,
-                                      const string& input_name,
-                                      const string& mean_name)
+void runtime::intelgpu::do_create_mean(cldnn::topology& topology,
+                                       const string& output_name,
+                                       const Shape& output_shape,
+                                       const element::Type& output_type,
+                                       const string& input_name,
+                                       const Shape& input_shape)
 {
-    throw invalid_argument(
-        "intelgpu::get_batch_norm_variance() Calculation matrix variance is not yet supported.");
+    if (input_shape.size() < 2 || input_shape.size() > 4)
+    {
+        throw invalid_argument("intelgpu::do_create_mean_variance() wrong input shapes.");
+    }
+
+    // According to the documentation, the input data channel is always axis 1,
+    // i.e. the second dimension from the left. Examples: {0, 1, 0, 0} or {0, 1}.
+    // Also, the input data must be at least a 2D array.
+    const size_t channel_axis = 1;
+    const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
+    const string entry_point_name = "create_mean_" + output_name;
+    const size_t output_counts = shape_size<Shape>(input_shape) / input_shape.at(channel_axis);
+    codegen::CodeWriter writer;
+
+    writer << "__kernel void " << entry_point_name << "( const __global float input"
+           << array_dims(input_shape) << ", __global float output" << array_dims(output_shape)
+           << ")\n";
+
+    writer.block_begin();
+    { // Main function body
+
+        // Loop over channel axis 1
+        writer << "for (uint i" << channel_axis << " = 0; i" << channel_axis << " < "
+               << input_shape.at(channel_axis) << "; ++i" << channel_axis << ")\n";
+        writer.block_begin();
+        {
+            writer << "float sum = 0.0f;\n";
+            size_t var_idx = 0;
+            // Main loops
+            for (auto const& i : input_shape)
+            {
+                if (var_idx != channel_axis)
+                {
+                    writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i
+                           << "; ++i" << var_idx << ")\n";
+                    writer.block_begin();
+                }
+                ++var_idx;
+            }
+
+            writer << "sum += input" << access_dims(input_shape) << ";\n";
+
+            var_idx = 0;
+            // Closing brackets for main loops
+            for (auto const& i : input_shape)
+            {
+                if (var_idx != channel_axis)
+                {
+                    writer.block_end();
+                }
+                ++var_idx;
+            }
+            writer << "output[i" << channel_axis << "] = sum / " << output_counts << ";\n";
+        } // Closing bracket for channel axis loop
+        writer.block_end();
+    } // Main function body
+    writer.block_end();
+
+    const cldnn::custom_gpu_primitive op_mean(output_name,
+                                              {input_name},
+                                              {writer.get_code()},
+                                              entry_point_name,
+                                              parameters_1inp_1out,
+                                              "",
+                                              layout,
+                                              {1});
+    topology.add(op_mean);
+}
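To make the CodeWriter logic concrete: assuming a hypothetical input_shape of {2, 3, 4, 5} and output_name "mean" (so the per-channel output shape is {3} and output_counts = 120 / 3 = 40 — all illustrative assumptions, not values from the commit), do_create_mean would emit approximately this OpenCL kernel:

__kernel void create_mean_mean( const __global float input[2][3][4][5], __global float output[3])
{
    /* one accumulator per channel; the channel loop is outermost */
    for (uint i1 = 0; i1 < 3; ++i1)
    {
        float sum = 0.0f;
        /* loops over every non-channel axis, in axis order 0, 2, 3 */
        for (uint i0 = 0; i0 < 2; ++i0)
        {
            for (uint i2 = 0; i2 < 4; ++i2)
            {
                for (uint i3 = 0; i3 < 5; ++i3)
                {
                    sum += input[i0][i1][i2][i3];
                }
            }
        }
        output[i1] = sum / 40;
    }
}

do_create_variance (below) emits the same loop nest, but the inner statements become float mean_diff = input[i0][i1][i2][i3] - mean[i1]; sum += mean_diff * mean_diff;, divided by the same element count.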
+void runtime::intelgpu::do_create_variance(cldnn::topology& topology,
+                                           const string& output_name,
+                                           const Shape& output_shape,
+                                           const element::Type& output_type,
+                                           const string& input_name,
+                                           const Shape& input_shape,
+                                           const std::string& mean_name)
+{
+    if (input_shape.size() < 2 || input_shape.size() > 4)
+    {
+        throw invalid_argument("intelgpu::do_create_mean_variance() wrong input shapes.");
+    }
+
+    // According to the documentation, the input data channel is always axis 1,
+    // i.e. the second dimension from the left. Examples: {0, 1, 0, 0} or {0, 1}.
+    // Also, the input data must be at least a 2D array.
+    const size_t channel_axis = 1;
+    const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
+    const string entry_point_name = "create_variance_" + output_name;
+    const size_t output_counts = shape_size<Shape>(input_shape) / input_shape.at(channel_axis);
+    codegen::CodeWriter writer;
+
+    writer << "__kernel void " << entry_point_name << "( const __global float input"
+           << array_dims(input_shape) << ", const __global float mean" << array_dims(output_shape)
+           << ", __global float output" << array_dims(output_shape) << ")\n";
+
+    writer.block_begin();
+    { // Main function body
+
+        // Loop over channel axis 1
+        writer << "for (uint i" << channel_axis << " = 0; i" << channel_axis << " < "
+               << input_shape.at(channel_axis) << "; ++i" << channel_axis << ")\n";
+        writer.block_begin();
+        {
+            writer << "float sum = 0.0f;\n";
+            size_t var_idx = 0;
+            // Main loops
+            for (auto const& i : input_shape)
+            {
+                if (var_idx != channel_axis)
+                {
+                    writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i
+                           << "; ++i" << var_idx << ")\n";
+                    writer.block_begin();
+                }
+                ++var_idx;
+            }
+
+            writer << "float mean_diff = input" << access_dims(input_shape) << " - mean[i"
+                   << channel_axis << "];\n";
+            writer << "sum += mean_diff * mean_diff;\n";
+
+            var_idx = 0;
+            // Closing brackets for main loops
+            for (auto const& i : input_shape)
+            {
+                if (var_idx != channel_axis)
+                {
+                    writer.block_end();
+                }
+                ++var_idx;
+            }
+            writer << "output[i" << channel_axis << "] = sum / " << output_counts << ";\n";
+        } // Closing bracket for channel axis loop
+        writer.block_end();
+    } // Main function body
+    writer.block_end();
+
+    const cldnn::custom_gpu_primitive op_variance(output_name,
+                                                  {input_name, mean_name},
+                                                  {writer.get_code()},
+                                                  entry_point_name,
+                                                  parameters_2inp_1out,
+                                                  "",
+                                                  layout,
+                                                  {1});
+    topology.add(op_variance);
 }

 void runtime::intelgpu::do_batch_norm_operation(cldnn::topology& topology,
                                                 const string& output_name,
+                                                const Shape& output_shape,
+                                                const element::Type& output_type,
                                                 double eps,
                                                 const string& input_name,
                                                 const Shape& input_shape,
                                                 const string& gamma_name,
+                                                const Shape& gamma_shape,
                                                 const string& beta_name,
                                                 const string& mean_name_inp,
                                                 const string& variance_name_inp)
 {
-    vector<pair<cldnn::primitive_id, cldnn::tensor>> split_offsets;
-    vector<pair<cldnn::primitive_id, cldnn::tensor>> vec_offsets;
-    vector<cldnn::primitive_id> dim_set;
-
     if (input_shape.size() < 2 || input_shape.size() > 4)
     {
-        throw invalid_argument("intelgpu::do_batch_norm_operation() wrong input shape.");
+        throw invalid_argument("intelgpu::do_batch_norm_operation() wrong input shapes.");
     }

     // According to the documentation, the input data channel is always axis 1,
     // i.e. the second dimension from the left. Examples: {0, 1, 0, 0} or {0, 1}.
     // Also, the input data must be at least a 2D array.
-    const size_t shape_channel = 1;
-    const size_t cldnn_channel = 4 - input_shape.size() + shape_channel;
-    const cldnn::concatenation::concatenation_axis direction =
-        runtime::intelgpu::IntelGPULayout::get_cldnn_axis(cldnn_channel);
-
-    const size_t split_arr_count = input_shape.at(shape_channel);
-    for (size_t i = 0; i < split_arr_count; ++i)
-    {
-        const string str_i = to_string(i);
-        const cldnn::tensor vec_offset(0, 0, i, 0);
-        vec_offsets.push_back(pair<cldnn::primitive_id, cldnn::tensor>(str_i, vec_offset));
-
-        vector<cldnn::tensor::value_type> offset({0, 0, 0, 0}); // No action by default
-        offset.at(cldnn_channel) = i;
-        const cldnn::tensor input_offset(offset.at(0), offset.at(1), offset.at(3), offset.at(2));
-        split_offsets.push_back(pair<cldnn::primitive_id, cldnn::tensor>(str_i, input_offset));
-    }
-
-    string mean_name = mean_name_inp;
-    if (mean_name_inp.empty())
-    {
-        mean_name = get_batch_norm_mean(topology, input_name);
-    }
-
-    string variance_name = variance_name_inp;
-    if (variance_name_inp.empty())
-    {
-        variance_name = get_batch_norm_variance(topology, input_name, mean_name);
-    }
-
-    const string input_split_name = do_matrix_split(topology, input_name, split_offsets);
-    const string mean_split_name = do_matrix_split(topology, mean_name, vec_offsets);
-    const string variance_split_name = do_matrix_split(topology, variance_name, vec_offsets);
-    const string gamma_split_name = do_matrix_split(topology, gamma_name, vec_offsets);
-    const string beta_split_name = do_matrix_split(topology, beta_name, vec_offsets);
-
-    for (size_t i = 0; i < split_arr_count; ++i)
-    {
-        const string suf = ':' + to_string(i);
-        const string out_bn_name = output_name + "_out_bn";
-
-        const cldnn::batch_norm cldd_batchnorm(out_bn_name + suf,
-                                               input_split_name + suf,
-                                               mean_split_name + suf,
-                                               variance_split_name + suf,
-                                               eps);
-        topology.add(cldd_batchnorm);
-
-        const cldnn::scale op_scale(
-            output_name + suf, out_bn_name + suf, gamma_split_name + suf, beta_split_name + suf);
-        topology.add(op_scale);
-        dim_set.push_back(output_name + suf);
-    }
-
-    const cldnn::concatenation op_concat(output_name, dim_set, direction);
-    topology.add(op_concat);
+    const size_t channel_axis = 1;
+    const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
+    const string entry_point_name = "batch_norm_" + output_name;
+    codegen::CodeWriter writer;
+
+    writer << "__kernel void " << entry_point_name << "( const __global float input"
+           << array_dims(input_shape) << ", const __global float gamma" << array_dims(gamma_shape)
+           << ", const __global float beta" << array_dims(gamma_shape)
+           << ", const __global float mean" << array_dims(gamma_shape)
+           << ", const __global float variance" << array_dims(gamma_shape)
+           << ", __global float output" << array_dims(output_shape) << ")\n";
+
+    writer.block_begin();
+    { // Main function body
+
+        // Loop over channel axis 1
+        writer << "for (uint i" << channel_axis << " = 0; i" << channel_axis << " < "
+               << output_shape.at(channel_axis) << "; ++i" << channel_axis << ")\n";
+        writer.block_begin();
+        {
+            size_t var_idx = 0;
+            // Main loops
+            for (auto const& i : output_shape)
+            {
+                if (var_idx != channel_axis)
+                {
+                    writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i
+                           << "; ++i" << var_idx << ")\n";
+                    writer.block_begin();
+                }
+                ++var_idx;
+            }
+
+            writer << "float normalized = (input" << access_dims(input_shape) << " - mean[i"
+                   << channel_axis << "]) / ("
+                   << "sqrt(variance[i" << channel_axis << "] + " << eps << ")"
+                   << ");\n";
+
+            writer << "output" << access_dims(output_shape) << " = normalized * gamma[i"
+                   << channel_axis << "] + beta[i" << channel_axis << "];\n";
+
+            var_idx = 0;
+            // Closing brackets for main loops
+            for (auto const& i : output_shape)
+            {
+                if (var_idx != channel_axis)
+                {
+                    writer.block_end();
+                }
+                ++var_idx;
+            }
+        } // Closing bracket for channel axis loop
+        writer.block_end();
+    } // Main function body
+    writer.block_end();
+
+    const vector<cldnn::primitive_id>& inputs = {
+        input_name, gamma_name, beta_name, mean_name_inp, variance_name_inp};
+    const cldnn::custom_gpu_primitive op_batch_norm(output_name,
+                                                    inputs,
+                                                    {writer.get_code()},
+                                                    entry_point_name,
+                                                    parameters_5inp_1out,
+                                                    "",
+                                                    layout,
+                                                    {1});
+    topology.add(op_batch_norm);
 }
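Under the same illustrative assumptions as above (input and output shapes {2, 3, 4, 5}, per-channel shapes {3}, output_name "out", eps = 0.001 — none of these values come from the commit), do_batch_norm_operation would emit approximately:

__kernel void batch_norm_out( const __global float input[2][3][4][5], const __global float gamma[3], const __global float beta[3], const __global float mean[3], const __global float variance[3], __global float output[2][3][4][5])
{
    for (uint i1 = 0; i1 < 3; ++i1)    /* channel axis */
    {
        for (uint i0 = 0; i0 < 2; ++i0)
        {
            for (uint i2 = 0; i2 < 4; ++i2)
            {
                for (uint i3 = 0; i3 < 5; ++i3)
                {
                    /* normalize, then scale and shift with the per-channel gamma/beta */
                    float normalized = (input[i0][i1][i2][i3] - mean[i1]) / (sqrt(variance[i1] + 0.001));
                    output[i0][i1][i2][i3] = normalized * gamma[i1] + beta[i1];
                }
            }
        }
    }
}

Each custom_gpu_primitive above is created with a work size of {1}, which appears to mean the whole loop nest runs in a single work item; that keeps the generated code simple at the cost of parallelism.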
@@ -19,6 +19,7 @@
 #include <CPP/topology.hpp>

 #include "ngraph/shape.hpp"
+#include "ngraph/type/element_type.hpp"
 namespace ngraph
 {
@@ -27,22 +28,36 @@ namespace ngraph
         namespace intelgpu
         {
             // This implements the BatchNorm nGraph operation.
-            // Since nGraph uses channels in this operation but clDNN uses the full input
-            // data at one time, we have to use the following algorithm:
-            // 1. Split all input data arrays into several matrices by channel axis
-            // 2. Independently do cldnn::batch_norm on each matrix
-            // 3. Every result of cldnn::batch_norm must be scaled and
-            //    shifted because cldnn::batch_norm doesn't use gamma and beta
-            // 4. Concatenate all results into the output matrix by channel axis
+            // nGraph uses channels in this operation but clDNN uses full input data
             void do_batch_norm_operation(cldnn::topology& topology,
                                          const std::string& output_name,
+                                         const Shape& output_shape,
+                                         const element::Type& output_type,
                                          double eps,
                                          const std::string& input_name,
                                          const Shape& input_shape,
                                          const std::string& gamma_name,
+                                         const Shape& gamma_shape,
                                          const std::string& beta_name,
-                                         const std::string& mean_name = std::string(),
-                                         const std::string& variance_name = std::string());
+                                         const std::string& mean_name,
+                                         const std::string& variance_name);
+
+            // This computes the mean of the input matrix along the channel axis
+            void do_create_mean(cldnn::topology& topology,
+                                const std::string& output_name,
+                                const Shape& output_shape,
+                                const element::Type& output_type,
+                                const std::string& input_name,
+                                const Shape& input_shape);
+
+            // This computes the variance of the input matrix along the channel axis
+            void do_create_variance(cldnn::topology& topology,
+                                    const std::string& output_name,
+                                    const Shape& output_shape,
+                                    const element::Type& output_type,
+                                    const std::string& input_name,
+                                    const Shape& input_shape,
+                                    const std::string& mean_name);
         }
     }
 }