Commit 92adea38 authored by shssf's avatar shssf Committed by Scott Cyphers

IntelGPU backend: Sum and redeveloped Broadcast operation (#1276)

parent cb84305e
......@@ -289,6 +289,8 @@ bool runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
const string& output_name = op->get_outputs().begin()->get_tensor().get_name();
const Shape& output_shape = op->get_outputs().begin()->get_shape();
const element::Type& output_type =
op->get_outputs().begin()->get_tensor().get_element_type();
const shared_ptr<op::Broadcast> broadcast = static_pointer_cast<op::Broadcast>(op);
const AxisSet& axis = broadcast->get_broadcast_axes();
......@@ -297,10 +299,67 @@ bool runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
{
do_equal_propagation(topology, input_name, output_name);
}
else if (input_shape.empty())
{
do_bcast_sum_operation_scalar(topology,
input_name,
input_shape,
output_name,
output_shape,
output_type,
true);
}
else
{
do_bcast_sum_operation(topology,
input_name,
input_shape,
output_name,
output_shape,
output_type,
axis,
true);
}
}
else if ("Sum" == op->description())
{
arguments_check(op, 1, 1);
const string& input_name = op->get_inputs().begin()->get_tensor().get_name();
const Shape& input_shape = op->get_inputs().begin()->get_shape();
const string& output_name = op->get_outputs().begin()->get_tensor().get_name();
const Shape& output_shape = op->get_outputs().begin()->get_shape();
const element::Type& output_type =
op->get_outputs().begin()->get_tensor().get_element_type();
const shared_ptr<op::Sum> sum = static_pointer_cast<op::Sum>(op);
const AxisSet& axis = sum->get_reduction_axes();
if (axis.empty())
{
do_equal_propagation(topology, input_name, output_name);
}
else if (output_shape.empty())
{
do_bcast_sum_operation_scalar(topology,
input_name,
input_shape,
output_name,
output_shape,
output_type,
false);
}
else
{
do_broadcast_operation(
topology, input_name, input_shape, output_name, output_shape, axis);
do_bcast_sum_operation(topology,
input_name,
input_shape,
output_name,
output_shape,
output_type,
axis,
false);
}
}
else if ("Reshape" == op->description())
......
......@@ -15,8 +15,10 @@
*******************************************************************************/
#include <CPP/concatenation.hpp>
#include <CPP/custom_gpu_primitive.hpp>
#include <CPP/reshape.hpp>
#include "ngraph/runtime/intelgpu/code_writer.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_layout.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_broadcast.hpp"
......@@ -25,160 +27,150 @@
using namespace std;
using namespace ngraph;
static const string reshape_suf("_reshape");
static vector<cldnn_arg> parameters_1inp_1out = {{arg_input, 0}, {arg_output, 0}};
static Shape propagate_backward(const Shape& input)
static string array_dims(const Shape& dimentions)
{
Shape result({0, 0, 0, 0});
size_t idx = result.size() - 1;
string buffer;
for (auto i = input.crbegin(); i != input.crend(); ++i, --idx)
for (auto const& dim : dimentions)
{
result.at(idx) = *i;
buffer += "[" + to_string(dim) + "]";
}
return result;
return buffer;
}
static Shape propagate_forward(const Shape& input)
static string access_dims(const Shape& dimentions, const AxisSet& axis = {})
{
Shape result({0, 0, 0, 0});
size_t idx = 0;
size_t var_idx = 0;
string buffer;
for (auto i = input.cbegin(); i != input.cend(); ++i, ++idx)
for (auto const& i : dimentions)
{
result.at(idx) = *i;
}
return result;
}
static Shape apply_axis(const Shape& input, const AxisSet& axis)
{
Shape result = input;
for (auto const& i : axis)
{
result.at(i) = 0;
if (axis.find(var_idx) == axis.end())
{
buffer += "[i" + to_string(var_idx) + "]";
}
++var_idx;
}
return result;
return buffer;
}
// This function broadcast input data to all other dimensions of the output
// it operates in two mode only (controlled by is_forward flag):
// [forward]: propagate data from left to right in Shape array term
// in[2], out[2,3,4,5], axis[1,2,3]
// [backward]: propagate data from right to left in Shape array term
// in[5], out[2,3,4,5], axis[0,1,2]
// Input and output shapes can be up to 4 dimensions
// Other variants, like: in[4] out[2,3,4,5] axis[0,1,3], unsupported yet
static void do_propagation(cldnn::topology& topology,
const string& input_name,
const Shape& input_shape,
const string& output_name,
const Shape& output_shape,
const AxisSet& axis,
bool is_forward)
void runtime::intelgpu::do_bcast_sum_operation_scalar(cldnn::topology& topology,
const string& input_name,
const Shape& input_shape,
const string& output_name,
const Shape& output_shape,
const element::Type& output_type,
bool is_bcast)
{
//default value used in "forward" mode
cldnn::concatenation::concatenation_axis direction =
runtime::intelgpu::IntelGPULayout::get_cldnn_axis(3);
string input_name_it = input_name;
string output_name_it = output_name;
Shape input_shape_it = input_shape;
for (auto axis_id = axis.crbegin(); axis_id != axis.crend();)
const string function_name = is_bcast ? "broadcast_scalar" : "sum_scalar";
const size_t input_count =
is_bcast ? shape_size<Shape>(output_shape) : shape_size<Shape>(input_shape);
codegen::CodeWriter writer;
writer << "__kernel void " << function_name
<< "(const __global float* input, __global float* output)\n";
writer.block_begin();
{
const size_t input_count = output_shape.at(*axis_id);
if (is_forward)
{
input_shape_it.push_back(1);
const cldnn::tensor my_tensor =
runtime::intelgpu::IntelGPULayout::create_cldnn_tensor(input_shape_it);
const cldnn::reshape op_reshape(input_name_it + reshape_suf, input_name_it, my_tensor);
topology.add(op_reshape);
input_shape_it.back() = input_count;
input_name_it += reshape_suf;
}
else
{
direction = runtime::intelgpu::IntelGPULayout::get_cldnn_axis(*axis_id);
}
const vector<cldnn::primitive_id> input_names(input_count, input_name_it);
writer << "float sum = 0.f;\n"
<< "for (uint i = 0; i < COUNT; ++i)\n";
writer.block_begin();
++axis_id;
if (axis_id == axis.crend())
if (is_bcast)
{
output_name_it = output_name;
writer << "output[i] = input[0];\n";
writer.block_end();
}
else
{
output_name_it += ":_";
input_name_it = output_name_it;
writer << "sum += input[i];\n";
writer.block_end();
writer << "output[0] = sum;\n";
}
const cldnn::concatenation op_concat(output_name_it, input_names, direction);
topology.add(op_concat);
}
} // End of function bracket
writer.block_end();
const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
const cldnn::custom_gpu_primitive op_scalar(output_name,
{input_name},
{writer.get_code()},
function_name,
parameters_1inp_1out,
string("-DCOUNT=" + to_string(input_count)),
layout);
topology.add(op_scalar);
}
// Assume input is scalar. All output data will be populated by the scalar
// The function extremely non optimal from performance perspective
static void do_scalar_propagation(cldnn::topology& topology,
const string& input_name,
const string& output_name,
const Shape& output_shape)
{
const size_t input_count = shape_size<const Shape>(output_shape);
const vector<cldnn::primitive_id> input_names(input_count, input_name);
const cldnn::concatenation op_concat(output_name, input_names, cldnn::concatenation::along_x);
topology.add(op_concat);
}
void runtime::intelgpu::do_broadcast_operation(cldnn::topology& topology,
void runtime::intelgpu::do_bcast_sum_operation(cldnn::topology& topology,
const string& input_name,
const Shape& input_shape,
const string& output_name,
const Shape& output_shape,
const AxisSet& axis)
const element::Type& output_type,
const AxisSet& axis,
bool is_bcast)
{
if (input_shape.size() > 4 || output_shape.size() > 4)
{
throw invalid_argument("IntelGPU::Broadcast supports 4D shapes maximum.");
}
if (input_shape.empty())
{
do_scalar_propagation(topology, input_name, output_name, output_shape);
return;
}
const string function_name = is_bcast ? "broadcast" : "sum";
codegen::CodeWriter writer;
const Shape output_shape_axis = apply_axis(output_shape, axis);
const Shape input_shape_forward = propagate_forward(input_shape);
const Shape output_shape_forward = propagate_forward(output_shape_axis);
const Shape input_shape_backward = propagate_backward(input_shape);
const Shape output_shape_backward = propagate_backward(output_shape_axis);
writer << "__kernel void " << function_name << "(const __global float input"
<< array_dims(input_shape) << ", __global float output" << array_dims(output_shape)
<< ")\n";
if (input_shape_forward == output_shape_forward)
writer.block_begin();
{
do_propagation(topology, input_name, input_shape, output_name, output_shape, axis, true);
}
else if (input_shape_backward == output_shape_backward)
{
do_propagation(topology, input_name, input_shape, output_name, output_shape, axis, false);
}
else
{
ostringstream os;
os << "IntelGP::Broadcast unsupported mode. input" << vector_to_string(input_shape)
<< " output" << vector_to_string(output_shape) << " axis" << vector_to_string(axis);
throw invalid_argument(os.str());
}
if (is_bcast)
{
size_t var_idx = 0;
for (auto const& i : output_shape)
{
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
<< var_idx << ")\n";
writer.block_begin();
++var_idx;
}
writer << "output" << access_dims(output_shape) << " = input"
<< access_dims(output_shape, axis) << ";\n";
// Closing brackets for Broadcast loop
for (auto const& i : output_shape)
{
writer.block_end();
}
}
else
{
size_t var_idx = 0;
for (auto const& i : input_shape)
{
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
<< var_idx << ")\n";
writer.block_begin();
++var_idx;
}
writer << "output" << access_dims(input_shape, axis) << " += input"
<< access_dims(input_shape) << ";\n";
// Closing brackets for Sum loop
for (auto const& i : input_shape)
{
writer.block_end();
}
}
} // End of function bracket
writer.block_end();
const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
const cldnn::custom_gpu_primitive op_bcast_sum(output_name,
{input_name},
{writer.get_code()},
function_name,
parameters_1inp_1out,
"",
layout);
topology.add(op_bcast_sum);
}
......@@ -27,13 +27,26 @@ namespace ngraph
{
namespace intelgpu
{
// This implements Broadcast nGraph operation
void do_broadcast_operation(cldnn::topology& topology,
// This implements Broadcast and Sum nGraph operations
// in case of input_shape is not empty
void do_bcast_sum_operation(cldnn::topology& topology,
const std::string& input_name,
const Shape& input_shape,
const std::string& output_name,
const Shape& output_shape,
const AxisSet& axis);
const element::Type& output_type,
const AxisSet& axis,
bool is_bcast);
// This implements Broadcast and Sum nGraph operations
// in case of input_shape is empty
void do_bcast_sum_operation_scalar(cldnn::topology& topology,
const std::string& input_name,
const Shape& input_shape,
const std::string& output_name,
const Shape& output_shape,
const element::Type& output_type,
bool is_bcast);
}
}
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment