Commit ca5476b3 authored by Sergey Shalnov, committed by Scott Cyphers

IntelGPU backend: Concat operation custom kernel implementation (#2551)

parent 5b349479
@@ -614,34 +614,70 @@ shared_ptr<runtime::Executable>
         arguments_check(op, 1, 1);
     }
 
-    // All input shapes must be the same
-    // if shape is empty (means Shape{}) in this case treat its size as 1
-    const size_t ngraph_tensor_dims =
-        get_input_shape(op).empty() ? 1 : get_input_shape(op).size();
     const shared_ptr<op::Concat> concat_op = static_pointer_cast<op::Concat>(op);
     const size_t ngraph_concat_axis = concat_op->get_concatenation_axis();
-    vector<cldnn::primitive_id> inputs;
-    cldnn::concatenation::concatenation_axis cldnn_axis =
-        intelgpu_space::get_cldnn_axis(ngraph_tensor_dims, ngraph_concat_axis);
 
-    for (auto const& input : op->get_inputs())
+    if (!shape_size(get_output_shape(op)) || (get_input_type(op) != element::f32) ||
+        get_output_shape(op).size() > 4)
     {
-        const Shape& input_shape = input.get_shape();
-        if (shape_size(input_shape))
+        vector<string> input_names;
+        vector<Shape> input_shapes;
+
+        for (auto const& input : op->get_inputs())
         {
-            inputs.push_back(input.get_tensor().get_name());
+            const Shape& input_shape = input.get_tensor().get_shape();
+
+            if (shape_size(input_shape))
+            {
+                input_names.push_back(input.get_tensor().get_name());
+                input_shapes.push_back(input_shape);
+            }
         }
-    }
 
-    if (inputs.empty())
-    {
-        do_equal_propagation(topology, get_input_name(op), get_output_name(op));
+        if (input_names.empty())
+        {
+            do_equal_propagation(topology, get_input_name(op), get_output_name(op));
+        }
+        else
+        {
+            do_concat_operation(topology,
+                                input_names,
+                                input_shapes,
+                                get_output_name(op),
+                                get_output_shape(op),
+                                get_output_type(op),
+                                ngraph_concat_axis);
+        }
     }
     else
     {
-        const cldnn::concatenation cldnn_concat(get_output_name(op), inputs, cldnn_axis);
-        topology.add(cldnn_concat);
+        // All input shapes must be the same
+        // if shape is empty (means Shape{}) in this case treat its size as 1
+        const size_t ngraph_tensor_dims =
+            get_input_shape(op).empty() ? 1 : get_input_shape(op).size();
+        vector<cldnn::primitive_id> inputs;
+        cldnn::concatenation::concatenation_axis cldnn_axis =
+            intelgpu_space::get_cldnn_axis(ngraph_tensor_dims, ngraph_concat_axis);
+
+        for (auto const& input : op->get_inputs())
+        {
+            const Shape& input_shape = input.get_shape();
+
+            if (shape_size(input_shape))
+            {
+                inputs.push_back(input.get_tensor().get_name());
+            }
+        }
+
+        if (inputs.empty())
+        {
+            do_equal_propagation(topology, get_input_name(op), get_output_name(op));
+        }
+        else
+        {
+            const cldnn::concatenation cldnn_concat(
+                get_output_name(op), inputs, cldnn_axis);
+            topology.add(cldnn_concat);
+        }
     }
 
     break;
 }
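With this hunk the backend no longer sends every Concat to clDNN's concatenation primitive: the new custom kernel path is taken when the output is zero-sized, the input type is not f32, or the output rank exceeds 4, and the built-in primitive keeps handling the remaining f32, rank-at-most-4 cases. Below is a minimal standalone sketch of that dispatch predicate; Shape, shape_size, and needs_custom_kernel are illustrative stand-ins for the backend's helpers, not its real API.

#include <cstddef>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

using Shape = std::vector<size_t>; // stand-in for ngraph::Shape

// Product of dimensions; 1 for a scalar Shape{}, matching ngraph's shape_size.
size_t shape_size(const Shape& s)
{
    return std::accumulate(s.begin(), s.end(), size_t{1}, std::multiplies<size_t>());
}

// Mirrors the condition in the diff above; the element-type check is reduced to a flag.
bool needs_custom_kernel(const Shape& output_shape, bool input_is_f32)
{
    return !shape_size(output_shape) || !input_is_f32 || output_shape.size() > 4;
}

int main()
{
    std::cout << needs_custom_kernel({2, 3}, true) << "\n";          // 0: cldnn::concatenation
    std::cout << needs_custom_kernel({2, 3}, false) << "\n";         // 1: custom kernel, non-f32
    std::cout << needs_custom_kernel({1, 2, 3, 4, 5}, true) << "\n"; // 1: custom kernel, rank > 4
    std::cout << needs_custom_kernel({0, 3}, true) << "\n";          // 1: zero-sized output
}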
@@ -1078,6 +1078,148 @@ void runtime::intelgpu::do_slice_operation(cldnn::topology& topology,
     topology.add(op_slice);
 }
 
+void runtime::intelgpu::do_concat_operation(cldnn::topology& topology,
+                                            const vector<string>& input_names,
+                                            const vector<Shape>& input_shapes,
+                                            const string& output_name,
+                                            const Shape& output_shape,
+                                            const element::Type& output_type,
+                                            size_t concat_axis)
+{
+    const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
+    const string kernel_type_name = get_opencl_type_name(output_type);
+    string entry_point_name = "concat_" + output_name;
+    size_t bound_below = 0;
+    size_t idx = 0;
+    vector<string>::const_iterator input_name = input_names.cbegin();
+    string aux_output_name;
+
+    // this is quite non optimal because cldnn::custom_gpu_primitive
+    // does not provide an ability to run kernels simultaneously with the same output
+    // Also, need to make a chain of kernels to put kernel0::output0 as kernel1::input1
+    // with output name kernel1::output2
+    for (auto const& input_shape : input_shapes)
+    {
+        string name_suffix = to_string(idx);
+        const string entry_point_name_suffix = entry_point_name + "_" + name_suffix;
+        CodeWriter writer;
+        vector<size_t> gws;
+
+        if (idx == 0)
+        {
+            gen_func_def(writer,
+                         entry_point_name_suffix,
+                         {kernel_type_name},
+                         {input_shape},
+                         kernel_type_name,
+                         output_shape);
+        }
+        else
+        {
+            gen_func_def(writer,
+                         entry_point_name_suffix,
+                         {2, kernel_type_name},
+                         {input_shape, output_shape},
+                         kernel_type_name,
+                         output_shape);
+        }
+
+        writer.block_begin();
+        {
+            // Main loops
+            gws = generate_loops(writer, output_shape, true);
+
+            writer << kernel_type_name << " input_element;\n";
+
+            size_t bound_upper = input_shape.at(concat_axis);
+
+            // copy corresponding elements of input0 into output
+            writer << "if (((" << bound_below << " + 0) <= i" << concat_axis << ") && (i"
+                   << concat_axis << " < (" << bound_below << " + " << bound_upper << ")))\n";
+            writer.block_begin();
+            {
+                writer << "input_element = input0";
+                if (input_shape.empty())
+                {
+                    // it means scalar
+                    writer << "[0]";
+                }
+                else
+                {
+                    size_t var_idx = 0;
+                    for (auto const i : input_shape)
+                    {
+                        if (var_idx == concat_axis)
+                        {
+                            writer << "[i" << var_idx << " - " << bound_below << "]";
+                        }
+                        else
+                        {
+                            writer << "[i" << var_idx << "]";
+                        }
+                        ++var_idx;
+                    }
+                }
+                writer << ";\n";
+            }
+            writer.block_end();
+
+            // if not a first kernel, copy input1 into output
+            if (idx != 0)
+            {
+                writer << "else\n";
+                writer.block_begin();
+                {
+                    writer << "input_element = input1" << access_dims(output_shape) << ";\n";
+                }
+                writer.block_end();
+            }
+
+            bound_below += bound_upper;
+
+            writer << "output" << access_dims(output_shape) << " = input_element;\n";
+
+            // Closing brackets for main loops
+            generate_loops(writer, output_shape, false);
+        }
+        writer.block_end();
+
+        vector<cldnn::primitive_id> kernel_input;
+        vector<cldnn_arg> kernel_arguments;
+
+        kernel_input.push_back(*input_name);
+
+        if (idx == 0)
+        {
+            kernel_arguments = get_kernel_args(1, 1);
+        }
+        else
+        {
+            if (idx == input_shapes.size() - 1)
+            {
+                // last kernel should produce the output name as overall node required
+                name_suffix = "";
+            }
+
+            kernel_input.push_back(aux_output_name);
+            kernel_arguments = get_kernel_args(2, 1);
+        }
+
+        const cldnn::custom_gpu_primitive op_concat(output_name + name_suffix,
+                                                    kernel_input,
+                                                    {writer.get_code()},
+                                                    entry_point_name_suffix,
+                                                    kernel_arguments,
+                                                    "",
+                                                    layout,
+                                                    gws);
+        topology.add(op_concat);
+
+        ++input_name;
+        ++idx;
+        aux_output_name = output_name + name_suffix;
+    }
+}
+
 void runtime::intelgpu::do_select_operation(cldnn::topology& topology,
                                             const string& input0_name,
                                             const Shape& input0_shape,
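The comment at the top of the loop explains the design constraint: cldnn::custom_gpu_primitive cannot run several kernels against the same output buffer, so do_concat_operation emits one kernel per input and chains them, feeding each intermediate result back in as input1 of the next link. The standalone sketch below traces that bookkeeping for a hypothetical three-way concat of {2,3}, {4,3}, and {1,3} along axis 0 (names and shapes are invented; nothing here calls clDNN): for each link it prints the output window, the guard condition exactly as the writer emits it, and the primitive name.

#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

using Shape = std::vector<size_t>; // stand-in for ngraph::Shape

int main()
{
    const std::string output_name = "concat_out"; // hypothetical node output name
    const size_t concat_axis = 0;
    const std::vector<Shape> input_shapes = {{2, 3}, {4, 3}, {1, 3}}; // output shape {7, 3}

    size_t bound_below = 0;
    std::string aux_output_name; // previous link's output, becomes input1 of the next

    for (size_t idx = 0; idx < input_shapes.size(); ++idx)
    {
        const size_t bound_upper = input_shapes[idx].at(concat_axis);

        // same naming rule as do_concat_operation: intermediate kernels are
        // output_name + idx, the last kernel emits the node's real output name
        std::string name_suffix = std::to_string(idx);
        if (idx != 0 && idx == input_shapes.size() - 1)
        {
            name_suffix = "";
        }

        std::cout << "kernel " << idx << ": writes i" << concat_axis << " in ["
                  << bound_below << ", " << bound_below + bound_upper << ")"
                  << ", guard: if (((" << bound_below << " + 0) <= i" << concat_axis
                  << ") && (i" << concat_axis << " < (" << bound_below << " + "
                  << bound_upper << ")))"
                  << ", inputs: tensor" << idx
                  << (idx != 0 ? " + " + aux_output_name : std::string())
                  << " -> " << output_name + name_suffix << "\n";

        bound_below += bound_upper;
        aux_output_name = output_name + name_suffix;
    }
}

Note that every link after the first copies the whole previous partial result through its else branch, so an N-way concat makes N passes over the output space; that repeated traversal is the inefficiency the in-code comment concedes.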
@@ -102,6 +102,14 @@ namespace ngraph
                                const Coordinate& uppper_bounds,
                                const Strides& strides);
 
+        void do_concat_operation(cldnn::topology& topology,
+                                 const std::vector<std::string>& input_names,
+                                 const std::vector<Shape>& input_shapes,
+                                 const std::string& output_name,
+                                 const Shape& output_shape,
+                                 const element::Type& output_type,
+                                 size_t concat_axis);
+
         void do_select_operation(cldnn::topology& topology,
                                  const std::string& input0_name,
                                  const Shape& input0_shape,