Commit 176e105b authored by shssf, committed by Scott Cyphers

IntelGPU backend: Broadcast operation optimization (#1450)

* IntelGPU backend: Broadcast operation optimization

* PR1450 style check fixed
parent 0593746d
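
What this change does, in short: the dedicated scalar kernels emitted by `do_bcast_sum_operation_scalar` are deleted, `do_bcast_sum_operation` now covers the empty-shape (scalar) cases too, and the broadcast path switches from hand-written loop emission to the shared `gen_func_def`/`generate_loops` helpers. `generate_loops` also returns the global work size (`gws`) handed to `cldnn::custom_gpu_primitive`, so broadcast now launches one work item per output element instead of the previous fixed `{1}`.

A minimal sketch of the `generate_loops` contract as inferred from the call sites in this diff; the actual helper lives elsewhere in the IntelGPU backend, and the split between OpenCL work dimensions and plain loops shown here is an assumption:

```cpp
#include <cstddef>
#include <sstream>
#include <vector>

// Hypothetical stand-in for runtime::intelgpu::generate_loops. With
// is_begin == true it emits one index definition per output dimension and
// returns the global work size; with is_begin == false it emits the matching
// closing braces. OpenCL exposes at most three work dimensions, so any
// further dimensions are assumed to fall back to ordinary serial loops.
std::vector<size_t> generate_loops_sketch(std::ostringstream& writer,
                                          const std::vector<size_t>& shape,
                                          bool is_begin)
{
    const size_t max_work_dims = 3; // get_global_id() limit in OpenCL
    std::vector<size_t> gws;
    for (size_t var_idx = 0; var_idx < shape.size(); ++var_idx)
    {
        if (is_begin)
        {
            if (var_idx < max_work_dims)
            {
                // Parallel dimension: one work item per element.
                writer << "const uint i" << var_idx << " = get_global_id("
                       << var_idx << ");\n";
                gws.push_back(shape.at(var_idx));
            }
            else
            {
                // Leftover dimension: serial loop inside each work item.
                writer << "for (uint i" << var_idx << " = 0; i" << var_idx
                       << " < " << shape.at(var_idx) << "; ++i" << var_idx
                       << ")\n{\n";
            }
        }
        else if (var_idx >= max_work_dims)
        {
            writer << "}\n"; // close only the serial loops
        }
    }
    return gws;
}
```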
@@ -525,16 +525,6 @@ bool runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
 {
     do_equal_propagation(topology, get_input_name(op), get_output_name(op));
 }
-else if (get_input_shape(op).empty())
-{
-    do_bcast_sum_operation_scalar(topology,
-                                  get_input_name(op),
-                                  get_input_shape(op),
-                                  get_output_name(op),
-                                  get_output_shape(op),
-                                  get_output_type(op),
-                                  true);
-}
 else
 {
     do_bcast_sum_operation(topology,
@@ -558,16 +548,6 @@ bool runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
 {
     do_equal_propagation(topology, get_input_name(op), get_output_name(op));
 }
-else if (get_output_shape(op).empty())
-{
-    do_bcast_sum_operation_scalar(topology,
-                                  get_input_name(op),
-                                  get_input_shape(op),
-                                  get_output_name(op),
-                                  get_output_shape(op),
-                                  get_output_type(op),
-                                  false);
-}
 else
 {
     do_bcast_sum_operation(topology,
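Both hunks above make the same simplification: the `compile()` dispatch for Broadcast and for Sum no longer special-cases empty shapes. Roughly (the guard condition and the full argument list sit outside the visible hunks, so they are elided here):

```cpp
// Sketch of the dispatch after this commit; not verbatim backend code.
if (/* condition elided in the diff */)
{
    // Presumably reuses the input tensor for the output when no work is needed.
    do_equal_propagation(topology, get_input_name(op), get_output_name(op));
}
else
{
    // Now also handles empty (scalar) shapes, which previously went
    // through do_bcast_sum_operation_scalar().
    do_bcast_sum_operation(topology /* , ...arguments elided... */);
}
```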
@@ -28,54 +28,6 @@
 using namespace std;
 using namespace ngraph;
 
-void runtime::intelgpu::do_bcast_sum_operation_scalar(cldnn::topology& topology,
-                                                      const string& input_name,
-                                                      const Shape& input_shape,
-                                                      const string& output_name,
-                                                      const Shape& output_shape,
-                                                      const element::Type& output_type,
-                                                      bool is_bcast)
-{
-    string function_name = is_bcast ? "broadcast_scalar" : "sum_scalar";
-    function_name += output_name;
-    const size_t input_count =
-        is_bcast ? shape_size<Shape>(output_shape) : shape_size<Shape>(input_shape);
-
-    codegen::CodeWriter writer;
-
-    writer << "__kernel void " << function_name
-           << "(const __global float* input, __global float* output)\n";
-    writer.block_begin();
-    {
-        writer << "float sum = 0.f;\n"
-               << "for (uint i = 0; i < COUNT; ++i)\n";
-        writer.block_begin();
-        if (is_bcast)
-        {
-            writer << "output[i] = input[0];\n";
-            writer.block_end();
-        }
-        else
-        {
-            writer << "sum += input[i];\n";
-            writer.block_end();
-            writer << "output[0] = sum;\n";
-        }
-    } // End of function bracket
-    writer.block_end();
-
-    const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
-    const cldnn::custom_gpu_primitive op_scalar(output_name,
-                                                {input_name},
-                                                {writer.get_code()},
-                                                function_name,
-                                                get_kernel_args(1, 1),
-                                                string("-DCOUNT=" + to_string(input_count)),
-                                                layout,
-                                                {1});
-
-    topology.add(op_scalar);
-}
-
 void runtime::intelgpu::do_bcast_sum_operation(cldnn::topology& topology,
                                                const string& input_name,
                                                const Shape& input_shape,
@@ -85,37 +37,29 @@ void runtime::intelgpu::do_bcast_sum_operation(cldnn::topology& topology,
                                                const AxisSet& axis,
                                                bool is_bcast)
 {
-    string function_name = is_bcast ? "broadcast" : "sum";
+    string function_name = is_bcast ? "broadcast_" : "sum_";
     function_name += output_name;
     codegen::CodeWriter writer;
+    vector<size_t> gws;
 
-    writer << "__kernel void " << function_name << "(const __global float input"
-           << array_dims(input_shape) << ", __global float output" << array_dims(output_shape)
-           << ")\n";
+    runtime::intelgpu::gen_func_def(
+        writer, function_name, {"float"}, {input_shape}, "float", output_shape);
     writer.block_begin();
     {
         if (is_bcast)
         {
-            size_t var_idx = 0;
-            for (auto const& i : output_shape)
-            {
-                writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
-                       << var_idx << ")\n";
-                writer.block_begin();
-                ++var_idx;
-            }
-            writer << "output" << access_dims(output_shape) << " = input"
+            // Broadcast loops
+            gws = runtime::intelgpu::generate_loops(writer, output_shape, true);
+            writer << "output" << access_dims(output_shape) << " = input0"
                    << access_dims(output_shape, axis) << ";\n";
 
             // Closing brackets for Broadcast loop
-            for (auto const& i : output_shape)
-            {
-                writer.block_end();
-            }
+            runtime::intelgpu::generate_loops(writer, output_shape, false);
         }
         else
         {
+            gws = {1}; // non parallel version
            // Initialize destination output by zeroes
            size_t var_idx = 0;
            for (auto const& i : output_shape)
@@ -144,7 +88,7 @@ void runtime::intelgpu::do_bcast_sum_operation(cldnn::topology& topology,
                ++var_idx;
            }
 
-           writer << "output" << access_dims(input_shape, axis) << " += input"
+           writer << "output" << access_dims(input_shape, axis) << " += input0"
                   << access_dims(input_shape) << ";\n";
 
            // Closing brackets for Sum loop
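
The sum branch stays serial (`gws = {1}`) while broadcast goes parallel. The reason is visible in the line just changed: `access_dims(input_shape, axis)` drops the reduced axes, so many input elements accumulate into the same output element, and concurrent work items would race on the `+=`. A hand-written approximation of the emitted sum kernel, assuming input `Shape{2,3}`, output `Shape{3}`, axis `{0}` (the kernel name stands in for `"sum_" + output_name`):

```cpp
// Assumed generator output, kept as a C++ raw string to mirror how the
// backend builds kernels as text; not captured from a real run.
const char* const sum_kernel_sketch = R"(
__kernel void sum_out(const __global float input0[2][3], __global float output[3])
{
    // Initialize destination output by zeroes
    for (uint i0 = 0; i0 < 3; ++i0)
    {
        output[i0] = 0.0f;
    }
    // i0 runs over the reduced axis, so several iterations update the
    // same output element; that dependence is why gws stays {1}.
    for (uint i0 = 0; i0 < 2; ++i0)
    {
        for (uint i1 = 0; i1 < 3; ++i1)
        {
            output[i1] += input0[i0][i1];
        }
    }
}
)";
```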
@@ -164,7 +108,7 @@ void runtime::intelgpu::do_bcast_sum_operation(cldnn::topology& topology,
                                                 get_kernel_args(1, 1),
                                                 "",
                                                 layout,
-                                                {1});
+                                                gws);
 
     topology.add(op_bcast_sum);
 }
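For contrast, the broadcast path after this commit launches with `gws` equal to the output shape. An approximation of what `gen_func_def` plus `generate_loops` would emit for input `Shape{3}`, output `Shape{2,3}`, axis `{0}` (again assumed, not captured output; the kernel name stands in for `"broadcast_" + output_name`):

```cpp
const char* const broadcast_kernel_sketch = R"(
__kernel void broadcast_out(const __global float input0[3],
                            __global float output[2][3])
{
    const uint i0 = get_global_id(0); // output axis 0, gws[0] == 2
    const uint i1 = get_global_id(1); // output axis 1, gws[1] == 3
    // access_dims(output_shape, axis) drops the broadcast axis {0},
    // so every row reads the same input element: no write conflicts.
    output[i0][i1] = input0[i1];
}
)";
```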
@@ -27,8 +27,9 @@
 {
     namespace intelgpu
     {
-        // This implements Broadcast and Sum nGraph operations
-        // in case of input_shape is not empty
+        // This implements Broadcast and Sum nGraph operations.
+        // input_shape (bcast) or output_shape (sum) can be empty.
+        // If the shape is empty it means scalar
         void do_bcast_sum_operation(cldnn::topology& topology,
                                     const std::string& input_name,
                                     const Shape& input_shape,
@@ -38,16 +39,6 @@ namespace ngraph
                                     const AxisSet& axis,
                                     bool is_bcast);
 
-        // This implements Broadcast and Sum nGraph operations
-        // in case of input_shape is empty
-        void do_bcast_sum_operation_scalar(cldnn::topology& topology,
-                                           const std::string& input_name,
-                                           const Shape& input_shape,
-                                           const std::string& output_name,
-                                           const Shape& output_shape,
-                                           const element::Type& output_type,
-                                           bool is_bcast);
-
         // This implements Min and Max operations depends on is_min parameter
         void do_max_min_operation(cldnn::topology& topology,
                                   const std::string& input_name,
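The updated header comment states that an empty shape denotes a scalar. A quick illustration of that convention under nGraph's shape types (variable names are just for the example):

```cpp
#include "ngraph/axis_set.hpp"
#include "ngraph/shape.hpp"

using namespace ngraph;

int main()
{
    // Broadcast: a rank-0 (empty) input shape expands into every output
    // axis, so all output axes appear in the broadcast AxisSet.
    const Shape scalar_input{};     // empty shape == one element
    const Shape output_shape{2, 3};
    const AxisSet broadcast_axes{0, 1};

    // Sum flips the roles: reducing Shape{2,3} over both axes yields a
    // rank-0 output, the empty output_shape case mentioned in the comment.
    const Shape sum_output{};
    return 0;
}
```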