Commit 92adea38 authored by shssf's avatar shssf Committed by Scott Cyphers

IntelGPU backend: Sum and redeveloped Broadcast operation (#1276)

parent cb84305e
...@@ -289,6 +289,8 @@ bool runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func) ...@@ -289,6 +289,8 @@ bool runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
const string& output_name = op->get_outputs().begin()->get_tensor().get_name(); const string& output_name = op->get_outputs().begin()->get_tensor().get_name();
const Shape& output_shape = op->get_outputs().begin()->get_shape(); const Shape& output_shape = op->get_outputs().begin()->get_shape();
const element::Type& output_type =
op->get_outputs().begin()->get_tensor().get_element_type();
const shared_ptr<op::Broadcast> broadcast = static_pointer_cast<op::Broadcast>(op); const shared_ptr<op::Broadcast> broadcast = static_pointer_cast<op::Broadcast>(op);
const AxisSet& axis = broadcast->get_broadcast_axes(); const AxisSet& axis = broadcast->get_broadcast_axes();
...@@ -297,10 +299,67 @@ bool runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func) ...@@ -297,10 +299,67 @@ bool runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
{ {
do_equal_propagation(topology, input_name, output_name); do_equal_propagation(topology, input_name, output_name);
} }
else if (input_shape.empty())
{
do_bcast_sum_operation_scalar(topology,
input_name,
input_shape,
output_name,
output_shape,
output_type,
true);
}
else
{
do_bcast_sum_operation(topology,
input_name,
input_shape,
output_name,
output_shape,
output_type,
axis,
true);
}
}
else if ("Sum" == op->description())
{
arguments_check(op, 1, 1);
const string& input_name = op->get_inputs().begin()->get_tensor().get_name();
const Shape& input_shape = op->get_inputs().begin()->get_shape();
const string& output_name = op->get_outputs().begin()->get_tensor().get_name();
const Shape& output_shape = op->get_outputs().begin()->get_shape();
const element::Type& output_type =
op->get_outputs().begin()->get_tensor().get_element_type();
const shared_ptr<op::Sum> sum = static_pointer_cast<op::Sum>(op);
const AxisSet& axis = sum->get_reduction_axes();
if (axis.empty())
{
do_equal_propagation(topology, input_name, output_name);
}
else if (output_shape.empty())
{
do_bcast_sum_operation_scalar(topology,
input_name,
input_shape,
output_name,
output_shape,
output_type,
false);
}
else else
{ {
do_broadcast_operation( do_bcast_sum_operation(topology,
topology, input_name, input_shape, output_name, output_shape, axis); input_name,
input_shape,
output_name,
output_shape,
output_type,
axis,
false);
} }
} }
else if ("Reshape" == op->description()) else if ("Reshape" == op->description())
......
...@@ -15,8 +15,10 @@ ...@@ -15,8 +15,10 @@
*******************************************************************************/ *******************************************************************************/
#include <CPP/concatenation.hpp> #include <CPP/concatenation.hpp>
#include <CPP/custom_gpu_primitive.hpp>
#include <CPP/reshape.hpp> #include <CPP/reshape.hpp>
#include "ngraph/runtime/intelgpu/code_writer.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_layout.hpp" #include "ngraph/runtime/intelgpu/intelgpu_layout.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_broadcast.hpp" #include "ngraph/runtime/intelgpu/intelgpu_op_broadcast.hpp"
...@@ -25,160 +27,150 @@ ...@@ -25,160 +27,150 @@
using namespace std; using namespace std;
using namespace ngraph; using namespace ngraph;
static const string reshape_suf("_reshape"); static vector<cldnn_arg> parameters_1inp_1out = {{arg_input, 0}, {arg_output, 0}};
static Shape propagate_backward(const Shape& input) static string array_dims(const Shape& dimentions)
{ {
Shape result({0, 0, 0, 0}); string buffer;
size_t idx = result.size() - 1;
for (auto i = input.crbegin(); i != input.crend(); ++i, --idx) for (auto const& dim : dimentions)
{ {
result.at(idx) = *i; buffer += "[" + to_string(dim) + "]";
} }
return result; return buffer;
} }
static Shape propagate_forward(const Shape& input) static string access_dims(const Shape& dimentions, const AxisSet& axis = {})
{ {
Shape result({0, 0, 0, 0}); size_t var_idx = 0;
size_t idx = 0; string buffer;
for (auto i = input.cbegin(); i != input.cend(); ++i, ++idx) for (auto const& i : dimentions)
{ {
result.at(idx) = *i; if (axis.find(var_idx) == axis.end())
}
return result;
}
static Shape apply_axis(const Shape& input, const AxisSet& axis)
{
Shape result = input;
for (auto const& i : axis)
{ {
result.at(i) = 0; buffer += "[i" + to_string(var_idx) + "]";
}
++var_idx;
} }
return result; return buffer;
} }
// This function broadcast input data to all other dimensions of the output void runtime::intelgpu::do_bcast_sum_operation_scalar(cldnn::topology& topology,
// it operates in two mode only (controlled by is_forward flag):
// [forward]: propagate data from left to right in Shape array term
// in[2], out[2,3,4,5], axis[1,2,3]
// [backward]: propagate data from right to left in Shape array term
// in[5], out[2,3,4,5], axis[0,1,2]
// Input and output shapes can be up to 4 dimensions
// Other variants, like: in[4] out[2,3,4,5] axis[0,1,3], unsupported yet
static void do_propagation(cldnn::topology& topology,
const string& input_name, const string& input_name,
const Shape& input_shape, const Shape& input_shape,
const string& output_name, const string& output_name,
const Shape& output_shape, const Shape& output_shape,
const AxisSet& axis, const element::Type& output_type,
bool is_forward) bool is_bcast)
{ {
//default value used in "forward" mode const string function_name = is_bcast ? "broadcast_scalar" : "sum_scalar";
cldnn::concatenation::concatenation_axis direction = const size_t input_count =
runtime::intelgpu::IntelGPULayout::get_cldnn_axis(3); is_bcast ? shape_size<Shape>(output_shape) : shape_size<Shape>(input_shape);
codegen::CodeWriter writer;
string input_name_it = input_name;
string output_name_it = output_name; writer << "__kernel void " << function_name
Shape input_shape_it = input_shape; << "(const __global float* input, __global float* output)\n";
for (auto axis_id = axis.crbegin(); axis_id != axis.crend();) writer.block_begin();
{
const size_t input_count = output_shape.at(*axis_id);
if (is_forward)
{ {
input_shape_it.push_back(1); writer << "float sum = 0.f;\n"
const cldnn::tensor my_tensor = << "for (uint i = 0; i < COUNT; ++i)\n";
runtime::intelgpu::IntelGPULayout::create_cldnn_tensor(input_shape_it); writer.block_begin();
const cldnn::reshape op_reshape(input_name_it + reshape_suf, input_name_it, my_tensor); if (is_bcast)
topology.add(op_reshape);
input_shape_it.back() = input_count;
input_name_it += reshape_suf;
}
else
{
direction = runtime::intelgpu::IntelGPULayout::get_cldnn_axis(*axis_id);
}
const vector<cldnn::primitive_id> input_names(input_count, input_name_it);
++axis_id;
if (axis_id == axis.crend())
{ {
output_name_it = output_name; writer << "output[i] = input[0];\n";
writer.block_end();
} }
else else
{ {
output_name_it += ":_"; writer << "sum += input[i];\n";
input_name_it = output_name_it; writer.block_end();
writer << "output[0] = sum;\n";
} }
} // End of function bracket
const cldnn::concatenation op_concat(output_name_it, input_names, direction); writer.block_end();
topology.add(op_concat);
} const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
} const cldnn::custom_gpu_primitive op_scalar(output_name,
{input_name},
// Assume input is scalar. All output data will be populated by the scalar {writer.get_code()},
// The function extremely non optimal from performance perspective function_name,
static void do_scalar_propagation(cldnn::topology& topology, parameters_1inp_1out,
const string& input_name, string("-DCOUNT=" + to_string(input_count)),
const string& output_name, layout);
const Shape& output_shape) topology.add(op_scalar);
{
const size_t input_count = shape_size<const Shape>(output_shape);
const vector<cldnn::primitive_id> input_names(input_count, input_name);
const cldnn::concatenation op_concat(output_name, input_names, cldnn::concatenation::along_x);
topology.add(op_concat);
} }
void runtime::intelgpu::do_broadcast_operation(cldnn::topology& topology, void runtime::intelgpu::do_bcast_sum_operation(cldnn::topology& topology,
const string& input_name, const string& input_name,
const Shape& input_shape, const Shape& input_shape,
const string& output_name, const string& output_name,
const Shape& output_shape, const Shape& output_shape,
const AxisSet& axis) const element::Type& output_type,
const AxisSet& axis,
bool is_bcast)
{ {
if (input_shape.size() > 4 || output_shape.size() > 4) const string function_name = is_bcast ? "broadcast" : "sum";
{ codegen::CodeWriter writer;
throw invalid_argument("IntelGPU::Broadcast supports 4D shapes maximum.");
}
if (input_shape.empty()) writer << "__kernel void " << function_name << "(const __global float input"
{ << array_dims(input_shape) << ", __global float output" << array_dims(output_shape)
do_scalar_propagation(topology, input_name, output_name, output_shape); << ")\n";
return; writer.block_begin();
{
if (is_bcast)
{
size_t var_idx = 0;
for (auto const& i : output_shape)
{
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
<< var_idx << ")\n";
writer.block_begin();
++var_idx;
} }
writer << "output" << access_dims(output_shape) << " = input"
<< access_dims(output_shape, axis) << ";\n";
const Shape output_shape_axis = apply_axis(output_shape, axis); // Closing brackets for Broadcast loop
const Shape input_shape_forward = propagate_forward(input_shape); for (auto const& i : output_shape)
const Shape output_shape_forward = propagate_forward(output_shape_axis);
const Shape input_shape_backward = propagate_backward(input_shape);
const Shape output_shape_backward = propagate_backward(output_shape_axis);
if (input_shape_forward == output_shape_forward)
{ {
do_propagation(topology, input_name, input_shape, output_name, output_shape, axis, true); writer.block_end();
} }
else if (input_shape_backward == output_shape_backward)
{
do_propagation(topology, input_name, input_shape, output_name, output_shape, axis, false);
} }
else else
{ {
ostringstream os; size_t var_idx = 0;
os << "IntelGP::Broadcast unsupported mode. input" << vector_to_string(input_shape) for (auto const& i : input_shape)
<< " output" << vector_to_string(output_shape) << " axis" << vector_to_string(axis); {
throw invalid_argument(os.str()); writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
<< var_idx << ")\n";
writer.block_begin();
++var_idx;
}
writer << "output" << access_dims(input_shape, axis) << " += input"
<< access_dims(input_shape) << ";\n";
// Closing brackets for Sum loop
for (auto const& i : input_shape)
{
writer.block_end();
}
} }
} // End of function bracket
writer.block_end();
const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
const cldnn::custom_gpu_primitive op_bcast_sum(output_name,
{input_name},
{writer.get_code()},
function_name,
parameters_1inp_1out,
"",
layout);
topology.add(op_bcast_sum);
} }
...@@ -27,13 +27,26 @@ namespace ngraph ...@@ -27,13 +27,26 @@ namespace ngraph
{ {
namespace intelgpu namespace intelgpu
{ {
// This implements Broadcast nGraph operation // This implements Broadcast and Sum nGraph operations
void do_broadcast_operation(cldnn::topology& topology, // in case of input_shape is not empty
void do_bcast_sum_operation(cldnn::topology& topology,
const std::string& input_name, const std::string& input_name,
const Shape& input_shape, const Shape& input_shape,
const std::string& output_name, const std::string& output_name,
const Shape& output_shape, const Shape& output_shape,
const AxisSet& axis); const element::Type& output_type,
const AxisSet& axis,
bool is_bcast);
// This implements Broadcast and Sum nGraph operations
// in case of input_shape is empty
void do_bcast_sum_operation_scalar(cldnn::topology& topology,
const std::string& input_name,
const Shape& input_shape,
const std::string& output_name,
const Shape& output_shape,
const element::Type& output_type,
bool is_bcast);
} }
} }
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment