Commit ed22bf6c authored by shssf's avatar shssf Committed by Robert Kimball

IntelGPU backend: Sum operation optimization (#1545)

* IntelGPU backend: Sum operation optimization

* PR1545. Comments addressed. Test added. Helper function refactored.
parent 75a18827
......@@ -59,43 +59,16 @@ void runtime::intelgpu::do_bcast_sum_operation(cldnn::topology& topology,
}
else
{
gws = {1}; // non parallel version
// Initialize destination output by zeroes
size_t var_idx = 0;
for (auto const& i : output_shape)
{
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
<< var_idx << ")\n";
writer.block_begin();
++var_idx;
}
const string reduction_str =
"output" + access_dims(input_shape, "i", axis) + " = result;\n";
writer << "output" << access_dims(output_shape) << " = 0.0f;\n";
// Generate loops related to input order with GWS
gws = generate_loops_w_axes(writer, input_shape, true, axis, "float result = 0.0f;\n");
// Closing brackets for Sum initialization loop
for (auto const& i : output_shape)
{
writer.block_end();
}
writer << "result += input0" << access_dims(input_shape) << ";\n";
// Now do the Sum operation
var_idx = 0;
for (auto const& i : input_shape)
{
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
<< var_idx << ")\n";
writer.block_begin();
++var_idx;
}
writer << "output" << access_dims(input_shape, "i", axis) << " += input0"
<< access_dims(input_shape) << ";\n";
// Closing brackets for Sum loop
for (auto const& i : input_shape)
{
writer.block_end();
}
// Close brackets related to input order with reduction
generate_loops_w_axes(writer, input_shape, false, axis, reduction_str);
}
} // End of function bracket
writer.block_end();
......
......@@ -159,6 +159,99 @@ vector<size_t> runtime::intelgpu::generate_loops(codegen::CodeWriter& writer,
return gws;
}
vector<size_t> runtime::intelgpu::generate_loops_w_axes(codegen::CodeWriter& writer,
const Shape& shape,
bool is_begin,
const AxisSet& axis,
const string& expression)
{
const size_t cldnn_gws_lim = 3;
vector<size_t> gws;
size_t var_idx = 0;
size_t dim_idx = 0;
if (is_begin)
{
for (auto const& i : shape)
{
if (axis.find(var_idx) == axis.end())
{
if (dim_idx < cldnn_gws_lim)
{
writer << "const unsigned i" << var_idx << " = get_global_id(" << dim_idx
<< "); /* trip count " << i << "*/\n";
gws.push_back(i);
++dim_idx;
}
else
{
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i
<< "; ++i" << var_idx << ")\n";
writer.block_begin();
}
}
++var_idx;
}
if (!expression.empty())
{
writer << expression;
}
var_idx = 0;
for (auto const& i : shape)
{
if (axis.find(var_idx) != axis.end())
{
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
<< var_idx << ")\n";
writer.block_begin();
}
++var_idx;
}
}
else
{ // is_begin == false
for (auto const& i : shape)
{
if (axis.find(var_idx) != axis.end())
{
writer.block_end();
}
++var_idx;
}
if (!expression.empty())
{
writer << expression;
}
var_idx = 0;
for (auto const& i : shape)
{
if (axis.find(var_idx) == axis.end())
{
if (dim_idx < cldnn_gws_lim)
{
++dim_idx;
}
else
{
writer.block_end();
}
}
++var_idx;
}
}
if (gws.empty())
{
gws.push_back(1);
}
return gws;
}
static string access_dims_strided(const Shape& dimentions,
const Shape& pad_below,
const Shape& pad_interior,
......
......@@ -157,6 +157,12 @@ namespace ngraph
bool is_reversed = false);
std::vector<size_t>
generate_loops(codegen::CodeWriter& writer, const Shape& shape, bool is_begin);
std::vector<size_t>
generate_loops_w_axes(codegen::CodeWriter& writer,
const Shape& shape,
bool is_begin,
const AxisSet& axis = {},
const std::string& expression = std::string());
void gen_func_def(codegen::CodeWriter& writer,
const std::string& entry_point_name,
const std::vector<std::string>& input_types,
......
......@@ -45,80 +45,6 @@ static Shape shape_dims(const Shape& dimentions, const AxisSet& axis = {})
return output_shape;
}
static vector<size_t> generate_loops_w_axes(codegen::CodeWriter& writer,
const Shape& shape,
bool is_begin,
const AxisSet& axis,
const string& expression)
{
const size_t cldnn_gws_lim = 3;
vector<size_t> gws;
size_t var_idx = 0;
size_t dim_idx = 0;
for (auto const& i : shape)
{
if (axis.find(var_idx) == axis.end())
{
if (dim_idx < cldnn_gws_lim)
{
if (is_begin)
{
writer << "const unsigned i" << var_idx << " = get_global_id(" << dim_idx
<< ");\n";
gws.push_back(i);
}
++dim_idx;
}
else
{
if (is_begin)
{
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i
<< "; ++i" << var_idx << ")\n";
writer.block_begin();
}
else
{
writer.block_end();
}
}
}
++var_idx;
}
if (is_begin)
{
writer << expression;
}
var_idx = 0;
for (auto const& i : shape)
{
if (axis.find(var_idx) != axis.end())
{
if (is_begin)
{
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
<< var_idx << ")\n";
writer.block_begin();
}
else
{
writer.block_end();
}
}
++var_idx;
}
if (gws.empty())
{
gws.push_back(1);
}
return gws;
}
void runtime::intelgpu::do_softmax_operation(cldnn::topology& topology,
const string& input_name,
const Shape& input_shape,
......
......@@ -3524,6 +3524,33 @@ NGRAPH_TEST(${BACKEND_NAME}, sum_matrix_columns)
EXPECT_EQ((vector<float>{1, 2, 3, 4, 5, 6}), read_vector<float>(a));
}
NGRAPH_TEST(${BACKEND_NAME}, sum_matrix_6d)
{
Shape shape_a{2, 6, 4, 5, 7, 3};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_rt{2, 4, 5, 3};
auto f = make_shared<Function>(make_shared<op::Sum>(A, AxisSet{1, 4}), op::ParameterVector{A});
auto backend_wrk = runtime::Backend::create("${BACKEND_NAME}");
auto backend_ref = runtime::Backend::create("INTERPRETER");
// Create some tensors for input/output
auto a_wrk = backend_wrk->create_tensor(element::f32, shape_a);
auto a_ref = backend_ref->create_tensor(element::f32, shape_a);
auto result_wrk = backend_wrk->create_tensor(element::f32, shape_rt);
auto result_ref = backend_ref->create_tensor(element::f32, shape_rt);
vector<float> inp_data(shape_size<const Shape>(shape_a));
iota(inp_data.begin(), inp_data.end(), 1);
copy_data(a_wrk, inp_data);
copy_data(a_ref, inp_data);
backend_wrk->call_with_validate(f, {result_wrk}, {a_wrk});
backend_ref->call_with_validate(f, {result_ref}, {a_ref});
EXPECT_EQ(read_vector<float>(result_ref), read_vector<float>(result_wrk));
}
NGRAPH_TEST(${BACKEND_NAME}, sum_matrix_rows)
{
Shape shape_a{3, 2};
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment