Commit da352aa1 authored by shssf, committed by Scott Cyphers

IntelGPU backend: BatchNormBackprop operation (#1443)

* IntelGPU backend: BatchNormBackprop operation

* PR1443. Requested refactoring done
parent 40ddf45a
@@ -767,6 +767,45 @@ bool runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
                               pad_below,
                               pad_interior);
         }
else if ("BatchNormBackprop" == op->description())
{
arguments_check(op, 6, 3);
const shared_ptr<op::BatchNormBackprop> batch_norm =
static_pointer_cast<op::BatchNormBackprop>(op);
const double eps = batch_norm->get_eps_value();
do_create_mean(topology,
get_output_name(op, 2), // d_beta
get_output_type(op, 2),
get_input_name(op, 5), // delta
get_input_shape(op, 5),
true);
do_create_variance_back(topology,
get_output_name(op, 1), // d_gamma
get_output_type(op, 1),
eps,
get_input_name(op, 2), // input
get_input_shape(op, 2),
get_input_name(op, 3), // gamma
get_input_name(op, 4), // beta
get_input_name(op, 5)); // delta
do_batch_norm_backprop_operation(topology,
get_input_shape(op, 2),
get_input_type(op, 2),
get_input_name(op, 0),
get_input_name(op, 1),
get_input_name(op, 2),
get_input_name(op, 3),
get_input_name(op, 4),
get_input_name(op, 5),
eps,
get_output_name(op, 0),
get_output_name(op, 1),
get_output_name(op, 2));
}
else if ("BatchNorm" == op->description()) else if ("BatchNorm" == op->description())
{ {
const shared_ptr<op::BatchNorm> batch_norm = static_pointer_cast<op::BatchNorm>(op); const shared_ptr<op::BatchNorm> batch_norm = static_pointer_cast<op::BatchNorm>(op);
@@ -788,14 +827,13 @@ bool runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
             do_create_mean(topology,
                            mean_name,
-                           get_input_shape(op),
                            get_output_type(op),
                            get_input_name(op, 2),
-                           get_input_shape(op, 2));
+                           get_input_shape(op, 2),
+                           false);
 
             do_create_variance(topology,
                                variance_name,
-                               get_input_shape(op),
                                get_output_type(op),
                                get_input_name(op, 2),
                                get_input_shape(op, 2),
@@ -814,13 +852,11 @@ bool runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
             do_batch_norm_operation(topology,
                                     get_output_name(op),
-                                    get_output_shape(op),
                                     get_output_type(op),
                                     eps,
                                     get_input_name(op, 2),
                                     get_input_shape(op, 2),
                                     get_input_name(op, 0),
-                                    get_input_shape(op, 0),
                                     get_input_name(op, 1),
                                     mean_name,
                                     variance_name);
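
Editor's note: for reference, the kernels wired up above implement standard per-channel
batch normalization. Writing $m$ for the number of elements per channel $c$ (computed in
the code as shape_size(input_shape) / input_shape.at(channel_axis)), the forward pass
produced by do_create_mean (with backward = false), do_create_variance, and
do_batch_norm_operation is:

    \mu_c = \frac{1}{m} \sum_i x_{i,c}, \qquad
    \sigma_c^2 = \frac{1}{m} \sum_i (x_{i,c} - \mu_c)^2, \qquad
    y_{i,c} = \gamma_c \, \frac{x_{i,c} - \mu_c}{\sqrt{\sigma_c^2 + \epsilon}} + \beta_c

where $i$ ranges over every non-channel index of the input tensor.
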
@@ -30,29 +30,39 @@
 using namespace std;
 using namespace ngraph;
 
+// According to the documentation, the input data channel is always axis 1,
+// i.e. the second dimension from the left. Example: {0, 1, 0, 0} or {0, 1}
+// Also, input data must be at least a 2D array
+static const size_t channel_axis = 1;
+
+static Shape get_channel_shape(const Shape& shape, const string& function_name)
+{
+    if (shape.size() < channel_axis + 1)
+    {
+        const string err = "intelgpu::" + function_name + "() input_shape" +
+                           runtime::intelgpu::array_dims(shape) + " should be at least " +
+                           to_string(channel_axis + 1) + "D.";
+        throw invalid_argument(err);
+    }
+
+    return {shape.at(channel_axis)};
+}
+
 void runtime::intelgpu::do_create_mean(cldnn::topology& topology,
                                        const string& output_name,
-                                       const Shape& output_shape,
                                        const element::Type& output_type,
                                        const string& input_name,
-                                       const Shape& input_shape)
+                                       const Shape& input_shape,
+                                       bool backward)
 {
-    if (input_shape.size() < 2 || input_shape.size() > 4)
-    {
-        throw invalid_argument("intelgpu::do_create_mean_variance() wrong input shapes.");
-    }
-
-    // According to the documentation, input data channel is always being axis 1
-    // Assumed the second dimension from the left. Example {0, 1, 0, 0} or {0, 1}
-    // Also, input data must be at least 2D array
-    const size_t channel_axis = 1;
-    const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
+    const Shape channel_shape = get_channel_shape(input_shape, "create_mean");
+    const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, channel_shape);
     const string entry_point_name = "create_mean_" + output_name;
     const size_t output_counts = shape_size<Shape>(input_shape) / input_shape.at(channel_axis);
     codegen::CodeWriter writer;
 
     writer << "__kernel void " << entry_point_name << "( const __global float input"
-           << array_dims(input_shape) << ", __global float output" << array_dims(output_shape)
+           << array_dims(input_shape) << ", __global float output" << array_dims(channel_shape)
            << ")\n";
     writer.block_begin();
@@ -89,7 +99,12 @@ void runtime::intelgpu::do_create_mean(cldnn::topology& topology,
             }
             ++var_idx;
         }
-        writer << "output[i" << channel_axis << "] = sum / " << output_counts << ";\n";
+        writer << "output[i" << channel_axis << "] = sum";
+        if (!backward)
+        {
+            writer << " / " << output_counts;
+        }
+        writer << ";\n";
     } // Closing brackets for Channel axis loop
     writer.block_end();
@@ -110,29 +125,20 @@ void runtime::intelgpu::do_create_mean(cldnn::topology& topology,
 void runtime::intelgpu::do_create_variance(cldnn::topology& topology,
                                            const string& output_name,
-                                           const Shape& output_shape,
                                            const element::Type& output_type,
                                            const string& input_name,
                                            const Shape& input_shape,
                                            const std::string& mean_name)
 {
-    if (input_shape.size() < 2 || input_shape.size() > 4)
-    {
-        throw invalid_argument("intelgpu::do_create_mean_variance() wrong input shapes.");
-    }
-
-    // According to the documentation, input data channel is always being axis 1
-    // Assumed the second dimension from the left. Example {0, 1, 0, 0} or {0, 1}
-    // Also, input data must be at least 2D array
-    const size_t channel_axis = 1;
-    const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
+    const Shape channel_shape = get_channel_shape(input_shape, "create_variance");
+    const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, channel_shape);
     const string entry_point_name = "create_variance_" + output_name;
     const size_t output_counts = shape_size<Shape>(input_shape) / input_shape.at(channel_axis);
     codegen::CodeWriter writer;
 
     writer << "__kernel void " << entry_point_name << "( const __global float input"
-           << array_dims(input_shape) << ", const __global float mean" << array_dims(output_shape)
-           << ", __global float output" << array_dims(output_shape) << ")\n";
+           << array_dims(input_shape) << ", const __global float mean" << array_dims(channel_shape)
+           << ", __global float output" << array_dims(channel_shape) << ")\n";
     writer.block_begin();
     { // Main function body
@@ -193,52 +199,42 @@ void runtime::intelgpu::do_create_variance(cldnn::topology& topology,
 void runtime::intelgpu::do_batch_norm_operation(cldnn::topology& topology,
                                                 const string& output_name,
-                                                const Shape& output_shape,
                                                 const element::Type& output_type,
                                                 double eps,
                                                 const string& input_name,
                                                 const Shape& input_shape,
                                                 const string& gamma_name,
-                                                const Shape& gamma_shape,
                                                 const string& beta_name,
                                                 const string& mean_name_inp,
                                                 const string& variance_name_inp)
 {
-    if (input_shape.size() < 2 || input_shape.size() > 4)
-    {
-        throw invalid_argument("intelgpu::do_batch_norm_operation() wrong input shapes.");
-    }
-
-    // According to the documentation, input data channel is always being axis 1
-    // Assumed the second dimension from the left. Example {0, 1, 0, 0} or {0, 1}
-    // Also, input data must be at least 2D array
-    const size_t channel_axis = 1;
-    const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
+    const Shape channel_shape = get_channel_shape(input_shape, "batch_norm");
+    const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, input_shape);
     const string entry_point_name = "batch_norm_" + output_name;
     codegen::CodeWriter writer;
     vector<size_t> gws;
 
-    writer << "__kernel void " << entry_point_name << "( const __global float input"
-           << array_dims(input_shape) << ", const __global float gamma" << array_dims(gamma_shape)
-           << ", const __global float beta" << array_dims(gamma_shape)
-           << ", const __global float mean" << array_dims(gamma_shape)
-           << ", const __global float variance" << array_dims(gamma_shape)
-           << ", __global float output" << array_dims(output_shape) << ")\n";
+    writer << "__kernel void " << entry_point_name << "(const __global float input"
+           << array_dims(input_shape) << ", const __global float gamma" << array_dims(channel_shape)
+           << ", const __global float beta" << array_dims(channel_shape)
+           << ", const __global float mean" << array_dims(channel_shape)
+           << ", const __global float variance" << array_dims(channel_shape)
+           << ", __global float output" << array_dims(input_shape) << ")\n";
     writer.block_begin();
     { // Main function body
-        gws = generate_loops(writer, output_shape, true);
+        gws = generate_loops(writer, input_shape, true);
 
         writer << "float normalized = (input" << access_dims(input_shape) << " - mean[i"
                << channel_axis << "]) / ("
                << "sqrt(variance[i" << channel_axis << "] + " << eps << ")"
                << ");\n";
 
-        writer << "output" << access_dims(output_shape) << " = normalized * gamma[i" << channel_axis
+        writer << "output" << access_dims(input_shape) << " = normalized * gamma[i" << channel_axis
                << "] + beta[i" << channel_axis << "];\n";
 
-        generate_loops(writer, output_shape, false);
+        generate_loops(writer, input_shape, false);
     } // Main function body
     writer.block_end();
@@ -255,3 +251,154 @@ void runtime::intelgpu::do_batch_norm_operation(cldnn::topology& topology,
                                                  gws);
     topology.add(op_batch_norm);
 }
+
+void runtime::intelgpu::do_create_variance_back(cldnn::topology& topology,
+                                                const string& output_name,
+                                                const element::Type& output_type,
+                                                double eps,
+                                                const string& input_name,
+                                                const Shape& input_shape,
+                                                const string& mean_name,
+                                                const string& variance_name,
+                                                const string& delta_name)
+{
+    const Shape channel_shape = get_channel_shape(input_shape, "create_variance_back");
+    const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, channel_shape);
+    const string entry_point_name = "create_variance_back_" + output_name;
+    codegen::CodeWriter writer;
+
+    writer << "__kernel void " << entry_point_name << "(const __global float input"
+           << array_dims(input_shape) << ", const __global float delta" << array_dims(input_shape)
+           << ", const __global float mean" << array_dims(channel_shape)
+           << ", const __global float variance" << array_dims(channel_shape)
+           << ", __global float output" << array_dims(channel_shape) << ")\n";
+    writer.block_begin();
+    { // Main function body
+        // Loop for Channel axis 1
+        writer << "for (uint i" << channel_axis << " = 0; i" << channel_axis << " < "
+               << input_shape.at(channel_axis) << "; ++i" << channel_axis << ")\n";
+        writer.block_begin();
+        {
+            writer << "float sum = 0.0f;\n";
+
+            size_t var_idx = 0;
+            // Main loops
+            for (auto const& i : input_shape)
+            {
+                if (var_idx != channel_axis)
+                {
+                    writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i
+                           << "; ++i" << var_idx << ")\n";
+                    writer.block_begin();
+                }
+                ++var_idx;
+            }
+
+            writer << "float normalized = (input" << access_dims(input_shape) << " - mean[i"
+                   << channel_axis << "]) / ("
+                   << "sqrt(variance[i" << channel_axis << "] + " << eps << ")"
+                   << ");\n";
+            writer << "sum += normalized * delta" << access_dims(input_shape) << ";\n";
+
+            var_idx = 0;
+            // Closing brackets for main loops
+            for (auto const& i : input_shape)
+            {
+                if (var_idx != channel_axis)
+                {
+                    writer.block_end();
+                }
+                ++var_idx;
+            }
+
+            writer << "output[i" << channel_axis << "] = sum;\n";
+        } // Closing brackets for Channel axis loop
+        writer.block_end();
+    } // Main function body
+    writer.block_end();
+
+    const vector<cldnn::primitive_id>& inputs = {input_name, delta_name, mean_name, variance_name};
+    const cldnn::custom_gpu_primitive op_create_variance_back(output_name,
+                                                              inputs,
+                                                              {writer.get_code()},
+                                                              entry_point_name,
+                                                              get_kernel_args(4, 1),
+                                                              "",
+                                                              layout,
+                                                              {1});
+    topology.add(op_create_variance_back);
+}
+
+void runtime::intelgpu::do_batch_norm_backprop_operation(cldnn::topology& topology,
+                                                         const Shape& shape,
+                                                         const element::Type& type,
+                                                         const string& gamma_name,
+                                                         const string& beta_name,
+                                                         const string& input_name,
+                                                         const string& mean_name,
+                                                         const string& variance_name,
+                                                         const string& delta_name,
+                                                         double eps,
+                                                         const string& output_name,
+                                                         const string& output_gamma_name,
+                                                         const string& output_beta_name)
+{
+    const Shape channel_shape = get_channel_shape(shape, "batch_norm_backprop");
+    const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(type, shape);
+    const string entry_point_name = "batch_norm_backprop_" + output_name;
+    const size_t r_axes_size = shape_size(shape) / shape_size(channel_shape);
+    codegen::CodeWriter writer;
+    vector<size_t> gws;
+
+    writer << "__kernel void " << entry_point_name << "(const __global float input"
+           << array_dims(shape) << ", const __global float delta" << array_dims(shape)
+           << ", const __global float mean" << array_dims(channel_shape)
+           << ", const __global float variance" << array_dims(channel_shape)
+           << ", const __global float gamma" << array_dims(channel_shape)
+           << ", const __global float gamma_backprop" << array_dims(channel_shape)
+           << ", const __global float beta_backprop" << array_dims(channel_shape)
+           << ", __global float output" << array_dims(shape) << ")\n";
+    writer.block_begin();
+    { // Main function body
+        // Main loops
+        gws = runtime::intelgpu::generate_loops(writer, shape, true);
+
+        writer << "float stddev = sqrt(variance[i" << channel_axis << "] + " << eps << ");\n";
+        writer << "float xhat = (input" << access_dims(shape) << " - mean[i" << channel_axis
+               << "]) / stddev;\n";
+        writer << "float norma = gamma[i" << channel_axis << "] / stddev;\n";
+
+        writer << "output" << access_dims(shape) << " = norma * (delta" << access_dims(shape)
+               << " - (xhat * gamma_backprop[i" << channel_axis << "] + beta_backprop[i"
+               << channel_axis << "]) / " << r_axes_size << ");\n";
+
+        // Closing brackets for main loops
+        runtime::intelgpu::generate_loops(writer, shape, false);
+    } // Main function body
+    writer.block_end();
+
+    const vector<cldnn::primitive_id>& inputs = {input_name,
+                                                 delta_name,
+                                                 mean_name,
+                                                 variance_name,
+                                                 gamma_name,
+                                                 output_gamma_name,
+                                                 output_beta_name};
+    const cldnn::custom_gpu_primitive op_batch_norm_backprop(output_name,
+                                                             inputs,
+                                                             {writer.get_code()},
+                                                             entry_point_name,
+                                                             get_kernel_args(7, 1),
+                                                             "",
+                                                             layout,
+                                                             gws);
+    topology.add(op_batch_norm_backprop);
+}
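
Editor's note: the new kernels compute the usual batch norm gradients. The dispatch
reuses do_create_mean with backward = true, which skips the division by output_counts
and turns the mean kernel into a plain per-channel sum, which is exactly what d_beta
requires. With $\hat{x}_{i,c} = (x_{i,c} - \mu_c) / \sqrt{\sigma_c^2 + \epsilon}$ and
$m$ elements per channel as above, the three outputs are:

    \frac{\partial L}{\partial \beta_c} = \sum_i \delta_{i,c}, \qquad
    \frac{\partial L}{\partial \gamma_c} = \sum_i \hat{x}_{i,c} \, \delta_{i,c}, \qquad
    \frac{\partial L}{\partial x_{i,c}} =
        \frac{\gamma_c}{\sqrt{\sigma_c^2 + \epsilon}}
        \left( \delta_{i,c}
               - \frac{\hat{x}_{i,c} \, \partial L / \partial \gamma_c
                       + \partial L / \partial \beta_c}{m} \right)

The last expression is what the generated kernel evaluates as
norma * (delta - (xhat * gamma_backprop + beta_backprop) / r_axes_size), with
r_axes_size playing the role of $m$.
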
@@ -31,13 +31,11 @@ namespace ngraph
             // nGraph uses channels in this operation but clDNN uses full input data
             void do_batch_norm_operation(cldnn::topology& topology,
                                          const std::string& output_name,
-                                         const Shape& output_shape,
                                          const element::Type& output_type,
                                          double eps,
                                          const std::string& input_name,
                                          const Shape& input_shape,
                                          const std::string& gamma_name,
-                                         const Shape& gamma_shape,
                                          const std::string& beta_name,
                                          const std::string& mean_name,
                                          const std::string& variance_name);
@@ -45,19 +43,46 @@ namespace ngraph
             // This creates mean of the input matrix by Channel axis
             void do_create_mean(cldnn::topology& topology,
                                 const std::string& output_name,
-                                const Shape& output_shape,
                                 const element::Type& output_type,
                                 const std::string& input_name,
-                                const Shape& input_shape);
+                                const Shape& input_shape,
+                                bool backward);
 
-            // This creates mean of the input matrix by Channel axis
+            // This creates variance of the input matrix by Channel axis
             void do_create_variance(cldnn::topology& topology,
                                     const std::string& output_name,
-                                    const Shape& output_shape,
                                     const element::Type& output_type,
                                     const std::string& input_name,
                                     const Shape& input_shape,
                                     const std::string& mean_name);
+
+            // This creates variance backprop of the input matrix by Channel axis
+            void do_create_variance_back(cldnn::topology& topology,
+                                         const std::string& output_name,
+                                         const element::Type& output_type,
+                                         double eps,
+                                         const std::string& input_name,
+                                         const Shape& input_shape,
+                                         const std::string& mean_name,
+                                         const std::string& variance_name,
+                                         const std::string& delta_name);
+
+            // This function uses the "shape" parameter as the input and output Shape,
+            // while the shapes of all other tensors are its channel axis
+            // (the second dimension from the left).
+            // Example: output[ 4, 3, 2, 8 ] means out_gamma[ 3 ]
+            void do_batch_norm_backprop_operation(cldnn::topology& topology,
+                                                  const Shape& shape,
+                                                  const element::Type& type,
+                                                  const std::string& gamma_name,
+                                                  const std::string& beta_name,
+                                                  const std::string& input_name,
+                                                  const std::string& mean_name,
+                                                  const std::string& variance_name,
+                                                  const std::string& delta_name,
+                                                  double eps,
+                                                  const std::string& output_name,
+                                                  const std::string& output_gamma_name,
+                                                  const std::string& output_beta_name);
         }
     }
}

@@ -128,7 +128,8 @@ vector<size_t> runtime::intelgpu::generate_loops(codegen::CodeWriter& writer,
         {
             if (is_begin)
             {
-                writer << "const unsigned i" << var_idx << " = get_global_id(" << var_idx << ");\n";
+                writer << "const unsigned i" << var_idx << " = get_global_id(" << var_idx
+                       << "); /*trip count " << i << "*/\n";
                 gws.push_back(i);
             }
         }
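
Editor's note: as a sketch only (the shape {2, 4} is a hypothetical example, not taken
from the commit), the amended generate_loops would emit an OpenCL kernel prologue like
the following in "begin" mode, recording each dimension's trip count in a comment while
gws collects {2, 4} as the global work size:

    /* generated for a hypothetical input shape {2, 4} */
    const unsigned i0 = get_global_id(0); /*trip count 2*/
    const unsigned i1 = get_global_id(1); /*trip count 4*/
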
@@ -28,7 +28,7 @@ backwards_sigmoid
 backwards_sign
 backwards_slice
 backwards_tan
-batchnorm_bprop_n4c3h2w2
+backwards_tanh
 batch_norm_one_output
 batch_norm_three_outputs
 broadcast_vector_rowwise_int64
@@ -108,7 +108,6 @@ zero_sized_maximum
 zero_sized_minimum
 zero_sized_multiply
 zero_sized_negative
-zero_sized_not
 zero_sized_not_equal
 zero_sized_power
 zero_sized_sign