Commit 134b0ae2 authored by shssf, committed by Scott Cyphers

IntelGPU backend: BatchNorm, Dot, Pad operations optimization (#1393)

parent 9c1c5b59
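
This change replaces the hand-rolled index-loop emission in the IntelGPU custom OpenCL kernels for BatchNorm, Pad, and Dot with a shared `generate_loops` helper. Besides shrinking the emitters, the helper returns the kernel's global work size (`gws`), which is now passed to `cldnn::custom_gpu_primitive` in place of the previous hard-coded `{1}`, so the work is spread across work items instead of a single one.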
@@ -216,6 +216,7 @@ void runtime::intelgpu::do_batch_norm_operation(cldnn::topology& topology,
     const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
     const string entry_point_name = "batch_norm_" + output_name;
     codegen::CodeWriter writer;
+    vector<size_t> gws;
 
     writer << "__kernel void " << entry_point_name << "( const __global float input"
            << array_dims(input_shape) << ", const __global float gamma" << array_dims(gamma_shape)
@@ -227,45 +228,17 @@ void runtime::intelgpu::do_batch_norm_operation(cldnn::topology& topology,
     writer.block_begin();
     { // Main function body
-        // Loop for Channel axis 1
-        writer << "for (uint i" << channel_axis << " = 0; i" << channel_axis << " < "
-               << output_shape.at(channel_axis) << "; ++i" << channel_axis << ")\n";
-        writer.block_begin();
-        {
-            size_t var_idx = 0;
-            // Main loops
-            for (auto const& i : output_shape)
-            {
-                if (var_idx != channel_axis)
-                {
-                    writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i
-                           << "; ++i" << var_idx << ")\n";
-                    writer.block_begin();
-                }
-                ++var_idx;
-            }
+        gws = generate_loops(writer, output_shape, true);
 
         writer << "float normalized = (input" << access_dims(input_shape) << " - mean[i"
                << channel_axis << "]) / ("
                << "sqrt(variance[i" << channel_axis << "] + " << eps << ")"
                << ");\n";
 
-        writer << "output" << access_dims(output_shape) << " = normalized * gamma[i"
-               << channel_axis << "] + beta[i" << channel_axis << "];\n";
+        writer << "output" << access_dims(output_shape) << " = normalized * gamma[i" << channel_axis
+               << "] + beta[i" << channel_axis << "];\n";
 
-            var_idx = 0;
-            // Closing brackets for main loops
-            for (auto const& i : output_shape)
-            {
-                if (var_idx != channel_axis)
-                {
-                    writer.block_end();
-                }
-                ++var_idx;
-            }
-        } // Closing brackets for Channel axis loop
-        writer.block_end();
+        generate_loops(writer, output_shape, false);
     } // Main function body
     writer.block_end();
@@ -279,6 +252,6 @@ void runtime::intelgpu::do_batch_norm_operation(cldnn::topology& topology,
                                                     get_kernel_args(5, 1),
                                                     "",
                                                     layout,
-                                                    {1});
+                                                    gws);
     topology.add(op_batch_norm);
 }
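
To make the effect concrete, here is a plausible shape of the kernel the refactored emitter produces, for a hypothetical `Shape{1, 2, 3, 4}` with channel axis 1, eps 0.001, and output name `BN_0`. The parameter order after `gamma` and the exact work-item/loop split are assumptions, since the body of `generate_loops` is outside this diff (see the reconstruction further down):

```cpp
// Illustrative only: a guess at the emitted OpenCL source, assuming the
// helper maps the first three dimensions to get_global_id and emits a
// plain loop for the fourth. Names, parameter order, and eps are hypothetical.
const char* emitted_batch_norm = R"cl(
__kernel void batch_norm_BN_0(const __global float input[1][2][3][4],
                              const __global float gamma[2],
                              const __global float beta[2],
                              const __global float mean[2],
                              const __global float variance[2],
                              __global float output[1][2][3][4])
{
    const uint i0 = get_global_id(0);
    const uint i1 = get_global_id(1);
    const uint i2 = get_global_id(2);
    for (uint i3 = 0; i3 < 4; ++i3)
    {
        float normalized = (input[i0][i1][i2][i3] - mean[i1]) / (sqrt(variance[i1] + 0.001));
        output[i0][i1][i2][i3] = normalized * gamma[i1] + beta[i1];
    }
}
)cl";
```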
@@ -18,7 +18,6 @@
 #include <CPP/custom_gpu_primitive.hpp>
 #include <CPP/reshape.hpp>
 
-#include "ngraph/runtime/intelgpu/code_writer.hpp"
 #include "ngraph/runtime/intelgpu/intelgpu_layout.hpp"
 #include "ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp"
 
@@ -88,7 +87,9 @@ string
     return buffer;
 }
 
-static vector<size_t> generate_loops(codegen::CodeWriter& writer, const Shape& shape, bool is_begin)
+vector<size_t> runtime::intelgpu::generate_loops(codegen::CodeWriter& writer,
+                                                 const Shape& shape,
+                                                 bool is_begin)
 {
     const size_t cldnn_gws_lim = 3;
     vector<size_t> gws;
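
The helper's body falls outside the visible hunk. From its signature, the `cldnn_gws_lim = 3` constant, and the paired begin/end call sites, a plausible reconstruction is the sketch below: the first (at most three) dimensions become OpenCL work-item indices and feed the returned global work size, while any remaining dimensions are emitted as ordinary `for` loops that the `is_begin == false` pass closes.

```cpp
// Plausible reconstruction of the helper; the actual body is not shown in this diff.
// Assumes the file's existing context (using namespace std and ngraph, codegen includes).
vector<size_t> runtime::intelgpu::generate_loops(codegen::CodeWriter& writer,
                                                 const Shape& shape,
                                                 bool is_begin)
{
    const size_t cldnn_gws_lim = 3; // clDNN accepts at most three global dimensions
    vector<size_t> gws;
    size_t var_idx = 0;

    for (auto const& i : shape)
    {
        if (var_idx < cldnn_gws_lim)
        {
            if (is_begin)
            {
                // Leading dimensions: one work item per index, no loop emitted
                writer << "const uint i" << var_idx << " = get_global_id(" << var_idx << ");\n";
                gws.push_back(i);
            }
        }
        else if (is_begin)
        {
            // Remaining dimensions stay as ordinary loops inside the kernel
            writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
                   << var_idx << ")\n";
            writer.block_begin();
        }
        else
        {
            writer.block_end(); // the is_begin == false pass closes them
        }
        ++var_idx;
    }

    if (gws.empty())
    {
        gws.push_back(1); // scalar shape: a single work item
    }
    return gws;
}
```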
@@ -170,6 +171,7 @@ void runtime::intelgpu::do_pad_operation(cldnn::topology& topology,
 {
     const string entry_point_name = "op_pad_kernel_" + output_name;
     codegen::CodeWriter writer;
+    vector<size_t> gws;
 
     // The kernel name and parameters
     writer << "__kernel void " << entry_point_name << "(const __global float input"
@@ -179,26 +181,16 @@ void runtime::intelgpu::do_pad_operation(cldnn::topology& topology,
     writer.block_begin();
     {
         // Loop for Broadcast scalar over full output tensor
-        size_t var_idx = 0;
-        for (auto const& i : output_shape)
-        {
-            writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
-                   << var_idx << ")\n";
-            writer.block_begin();
-            ++var_idx;
-        }
+        gws = runtime::intelgpu::generate_loops(writer, output_shape, true);
 
         writer << "output" << access_dims(output_shape) << " = scalar[0];\n";
 
         // Closing brackets for Broadcast loop
-        for (auto const& i : output_shape)
-        {
-            writer.block_end();
-        }
+        runtime::intelgpu::generate_loops(writer, output_shape, false);
 
         // Loop for Copy input matrix into output matrix with padding.
         // Padding include "pad_below" and "pad_interior" according nGraph documentation
-        var_idx = 0;
+        size_t var_idx = 0;
         for (auto const& i : input_shape)
         {
             writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
@@ -220,15 +212,15 @@ void runtime::intelgpu::do_pad_operation(cldnn::topology& topology,
     writer.block_end();
 
     const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
-    const cldnn::custom_gpu_primitive op_scalar(output_name,
-                                                {input_name, scalar_name},
-                                                {writer.get_code()},
-                                                entry_point_name,
-                                                get_kernel_args(2, 1),
-                                                "",
-                                                layout,
-                                                {1});
-    topology.add(op_scalar);
+    const cldnn::custom_gpu_primitive op_pad(output_name,
+                                             {input_name, scalar_name},
+                                             {writer.get_code()},
+                                             entry_point_name,
+                                             get_kernel_args(2, 1),
+                                             "",
+                                             layout,
+                                             gws);
+    topology.add(op_pad);
 }
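
Two incidental fixes ride along in the pad emitter: the primitive variable is renamed from `op_scalar` to `op_pad` to match the operation it builds, and the broadcast loops now supply the kernel's global work size instead of the hard-coded `{1}`. Note that only the broadcast nest over `output_shape` is converted; the copy nest over `input_shape` that follows is still emitted as plain `for` loops.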
@@ -256,7 +248,7 @@ static void do_1d_scalar_mul(codegen::CodeWriter& writer,
     writer.block_end();
 }
 
-static void do_2d_2d_mul(codegen::CodeWriter& writer,
+static vector<size_t> do_2d_2d_mul(codegen::CodeWriter& writer,
                                    string& kernel_name,
                                    const Shape& shapeA,
                                    const Shape& shapeB,
@@ -264,6 +256,7 @@ static void do_2d_2d_mul(codegen::CodeWriter& writer,
 {
     const size_t colrow = shapeA.at(1);
     kernel_name += "_do_2d_2d_mul";
+    vector<size_t> gws;
 
     writer << "__kernel void " << kernel_name << "(const __global float inputA"
            << runtime::intelgpu::array_dims(shapeA) << ", const __global float inputB"
@@ -273,13 +266,7 @@ static void do_2d_2d_mul(codegen::CodeWriter& writer,
     {
         size_t var_idx = 0;
         // Main loops
-        for (auto const& i : shapeZ)
-        {
-            writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
-                   << var_idx << ")\n";
-            writer.block_begin();
-            ++var_idx;
-        }
+        gws = runtime::intelgpu::generate_loops(writer, shapeZ, true);
 
         // Inner loop
         writer << "float sum = 0.0f;\n";
 
@@ -292,15 +279,14 @@ static void do_2d_2d_mul(codegen::CodeWriter& writer,
         writer << "output[i0][i1] = sum;\n";
 
         // Closing brackets for main loops
-        for (auto const& i : shapeZ)
-        {
-            writer.block_end();
-        }
+        runtime::intelgpu::generate_loops(writer, shapeZ, false);
     }
     writer.block_end();
+
+    return gws;
 }
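
The same three-part change (a `vector<size_t>` return type, a local `gws`, and paired `generate_loops` calls replacing the hand-written loop nests) is applied verbatim to `do_3d_3d_mul` and `do_3d_2d_mul` below; only the output indexing differs.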
 
-static void do_3d_3d_mul(codegen::CodeWriter& writer,
+static vector<size_t> do_3d_3d_mul(codegen::CodeWriter& writer,
                                    string& kernel_name,
                                    const Shape& shapeA,
                                    const Shape& shapeB,
@@ -308,6 +294,7 @@ static void do_3d_3d_mul(codegen::CodeWriter& writer,
 {
     const size_t colrow = shapeA.back();
     kernel_name += "_do_3d_3d_mul";
+    vector<size_t> gws;
 
     writer << "__kernel void " << kernel_name << "(const __global float inputA"
            << runtime::intelgpu::array_dims(shapeA) << ", const __global float inputB"
@@ -317,13 +304,7 @@ static void do_3d_3d_mul(codegen::CodeWriter& writer,
     {
         size_t var_idx = 0;
         // Main loops
-        for (auto const& i : shapeZ)
-        {
-            writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
-                   << var_idx << ")\n";
-            writer.block_begin();
-            ++var_idx;
-        }
+        gws = runtime::intelgpu::generate_loops(writer, shapeZ, true);
 
         // Inner loop
         writer << "float sum = 0.0f;\n";
 
@@ -336,15 +317,14 @@ static void do_3d_3d_mul(codegen::CodeWriter& writer,
         writer << "output[i0][i1][i2][i3] = sum;\n";
 
         // Closing brackets for main loops
-        for (auto const& i : shapeZ)
-        {
-            writer.block_end();
-        }
+        runtime::intelgpu::generate_loops(writer, shapeZ, false);
     }
     writer.block_end();
+
+    return gws;
 }
 
-static void do_3d_2d_mul(codegen::CodeWriter& writer,
+static vector<size_t> do_3d_2d_mul(codegen::CodeWriter& writer,
                                    string& kernel_name,
                                    const Shape& shapeA,
                                    const Shape& shapeB,
@@ -352,6 +332,7 @@ static void do_3d_2d_mul(codegen::CodeWriter& writer,
 {
     const size_t colrow = shapeA.back();
     kernel_name += "_do_3d_2d_mul";
+    vector<size_t> gws;
 
     writer << "__kernel void " << kernel_name << "(const __global float inputA"
            << runtime::intelgpu::array_dims(shapeA) << ", const __global float inputB"
@@ -361,13 +342,7 @@ static void do_3d_2d_mul(codegen::CodeWriter& writer,
     {
         size_t var_idx = 0;
         // Main loops
-        for (auto const& i : shapeZ)
-        {
-            writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
-                   << var_idx << ")\n";
-            writer.block_begin();
-            ++var_idx;
-        }
+        gws = runtime::intelgpu::generate_loops(writer, shapeZ, true);
 
         // Inner loop
         writer << "float sum = 0.0f;\n";
 
@@ -380,33 +355,34 @@ static void do_3d_2d_mul(codegen::CodeWriter& writer,
         writer << "output[i0][i1][i2] = sum;\n";
 
         // Closing brackets for main loops
-        for (auto const& i : shapeZ)
-        {
-            writer.block_end();
-        }
+        runtime::intelgpu::generate_loops(writer, shapeZ, false);
     }
     writer.block_end();
+
+    return gws;
 }
 
-static void do_2d_1d_mul(codegen::CodeWriter& writer,
-                         string& kernel_name,
-                         const Shape& shapeA,
-                         const Shape& shapeB)
+static vector<size_t> do_2d_1d_mul(codegen::CodeWriter& writer,
+                                   string& kernel_name,
+                                   const Shape& shapeA,
+                                   const Shape& shapeB,
+                                   const Shape& shapeZ)
 {
-    const size_t rows = shapeA.at(0);
     const size_t colrow = shapeA.at(1);
     kernel_name += "_do_2d_1d_mul";
+    vector<size_t> gws;
 
     writer << "__kernel void " << kernel_name << "(const __global float inputA"
            << runtime::intelgpu::array_dims(shapeA) << ", const __global float inputB"
            << runtime::intelgpu::array_dims(shapeB) << ", __global float output"
-           << runtime::intelgpu::array_dims({rows}) << ")\n";
+           << runtime::intelgpu::array_dims(shapeZ) << ")\n";
-    writer.block_begin();
-    {
-        writer << "for (uint i0 = 0; i0 < " << rows << "; ++i0)\n";
     writer.block_begin();
     {
+        // Main loops
+        gws = runtime::intelgpu::generate_loops(writer, shapeZ, true);
+
         writer << "float sum = 0.0f;\n";
+        // Inner loop
         writer << "for (uint i1 = 0; i1 < " << colrow << "; ++i1)\n";
         writer.block_begin();
         {
 
@@ -414,10 +390,13 @@ static void do_2d_1d_mul(codegen::CodeWriter& writer,
         }
         writer.block_end();
 
         writer << "output[i0] = sum;\n";
+
+        // Closing brackets for main loops
+        runtime::intelgpu::generate_loops(writer, shapeZ, false);
     }
     writer.block_end();
-    }
-    writer.block_end();
+
+    return gws;
 }
 
 static void do_scalar_scalar_mul(codegen::CodeWriter& writer, string& kernel_name)
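
`do_2d_1d_mul` gains a `shapeZ` parameter so its loop bounds and output declaration come from the actual output shape rather than a locally computed `rows`. For a hypothetical `shapeA = {4, 8}` and `shapeB = {8}` (so `shapeZ = {4}`), it would plausibly emit something like the sketch below, with the single output dimension mapped to a work item and the reduction kept as an inner loop; the multiply-accumulate line sits in an elided hunk and is inferred:

```cpp
// Illustrative only: a guess at the emitted OpenCL for output name "out".
const char* emitted_dot_2d_1d = R"cl(
__kernel void dot_out_do_2d_1d_mul(const __global float inputA[4][8],
                                   const __global float inputB[8],
                                   __global float output[4])
{
    const uint i0 = get_global_id(0); // one work item per output element
    float sum = 0.0f;
    for (uint i1 = 0; i1 < 8; ++i1)
    {
        sum += inputA[i0][i1] * inputB[i1];
    }
    output[i0] = sum;
}
)cl";
```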
@@ -473,6 +452,7 @@ void runtime::intelgpu::do_dot_operation(cldnn::topology& topology,
     const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
     string entry_point_name = "dot_" + output_name;
     codegen::CodeWriter writer;
+    vector<size_t> gws = {1};
 
     const bool A_is_scalar = inputA_shape.empty();
     const bool B_is_scalar = inputB_shape.empty();
@@ -494,19 +474,19 @@ void runtime::intelgpu::do_dot_operation(cldnn::topology& topology,
     {
         if (inputA_shape.size() == 2 && inputB_shape.size() == 1)
         {
-            do_2d_1d_mul(writer, entry_point_name, inputA_shape, inputB_shape);
+            gws = do_2d_1d_mul(writer, entry_point_name, inputA_shape, inputB_shape, output_shape);
         }
         else if (inputA_shape.size() == 2 && inputB_shape.size() == 2)
         {
-            do_2d_2d_mul(writer, entry_point_name, inputA_shape, inputB_shape, output_shape);
+            gws = do_2d_2d_mul(writer, entry_point_name, inputA_shape, inputB_shape, output_shape);
         }
         else if (inputA_shape.size() == 3 && inputB_shape.size() == 3)
         {
-            do_3d_3d_mul(writer, entry_point_name, inputA_shape, inputB_shape, output_shape);
+            gws = do_3d_3d_mul(writer, entry_point_name, inputA_shape, inputB_shape, output_shape);
         }
         else if (inputA_shape.size() == 3 && inputB_shape.size() == 2)
         {
-            do_3d_2d_mul(writer, entry_point_name, inputA_shape, inputB_shape, output_shape);
+            gws = do_3d_2d_mul(writer, entry_point_name, inputA_shape, inputB_shape, output_shape);
         }
         else
         {
@@ -518,7 +498,6 @@ void runtime::intelgpu::do_dot_operation(cldnn::topology& topology,
         do_dot_operation_error(inputA_shape, inputB_shape, output_shape);
     }
 
-    //cout << writer.get_code() << endl;
     const cldnn::custom_gpu_primitive op_dot(output_name,
                                              {inputA_name, inputB_name},
                                              {writer.get_code()},
@@ -526,7 +505,7 @@ void runtime::intelgpu::do_dot_operation(cldnn::topology& topology,
                                              get_kernel_args(2, 1),
                                              "",
                                              layout,
-                                             {1});
+                                             gws);
     topology.add(op_dot);
 }
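
`gws` starts as `{1}` because the scalar paths (`do_1d_scalar_mul`, `do_scalar_scalar_mul`) still emit single-work-item kernels and leave it untouched; the matrix paths overwrite it with the work size returned by their helper. A leftover debug `cout` of the generated source is also removed.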
@@ -18,6 +18,8 @@
 #include <CPP/topology.hpp>
 
+#include "ngraph/runtime/intelgpu/code_writer.hpp"
+
 #include "ngraph/axis_set.hpp"
 #include "ngraph/coordinate.hpp"
 #include "ngraph/shape.hpp"
@@ -96,6 +98,8 @@ namespace ngraph
             std::string access_dims(const Shape& dimentions,
                                     const AxisSet& axis = {},
                                     bool is_reversed = false);
+            std::vector<size_t>
+                generate_loops(codegen::CodeWriter& writer, const Shape& shape, bool is_begin);
         }
     }
 }
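
Since the header now declares `generate_loops` with a `codegen::CodeWriter&` parameter, the `code_writer.hpp` include moves from the .cpp file (removed earlier in this diff) into the header.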