Commit f9368d45 authored by Anna Alberska, committed by Robert Kimball

IntelGPU backend: Code refactoring and optimization (#1434)

* IntelGPU backend: Code refactoring and optimization

* Update gen_func_def()
parent d473eda9
@@ -92,6 +92,28 @@ string
return buffer;
}
void runtime::intelgpu::gen_func_def(codegen::CodeWriter& writer,
const string& entry_point_name,
const vector<string>& input_types,
const vector<Shape>& input_shapes,
const string& output_type,
const Shape& output_shape)
{
writer << "__kernel void " << entry_point_name << "(";
const size_t inputs_number = input_types.size();
for (uint i = 0; i < inputs_number; ++i)
{
if (i > 0)
{
writer << ", ";
}
writer << "const __global " << input_types.at(i) << " input" << i
<< array_dims(input_shapes.at(i));
}
writer << ", __global " << output_type << " output" << array_dims(output_shape) << ")\n";
}
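Reading aid (not part of the commit): given the body above, a hypothetical call like the one below — names and shapes invented for illustration — emits the kernel header shown in the trailing comment, with array_dims rendering a Shape {4, 8} as [4][8] as it does throughout this file.
codegen::CodeWriter writer;
runtime::intelgpu::gen_func_def(writer,
                                "example_kernel", // hypothetical entry point
                                {2, "float"},     // two inputs, both "float"
                                {{4, 8}, {8}},    // input0 is 4x8, input1 is 8
                                "float",          // output element type
                                {4});             // output shape
// writer now holds, on a single line:
//   __kernel void example_kernel(const __global float input0[4][8],
//                                const __global float input1[8],
//                                __global float output[4])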
vector<size_t> runtime::intelgpu::generate_loops(codegen::CodeWriter& writer,
const Shape& shape,
bool is_begin)
@@ -156,12 +178,14 @@ static string access_dims_strided(const Shape& dimentions,
return buffer;
}
static void do_dot_operation_error(const Shape& shapeA, const Shape& shapeB, const Shape& shapeZ)
static void do_dot_operation_error(const Shape& input0_shape,
const Shape& input1_shape,
const Shape& output_shape)
{
throw invalid_argument("IntelGPU Dot operation. Conbination ShapeA" +
runtime::intelgpu::array_dims(shapeA) + ", ShapeB" +
runtime::intelgpu::array_dims(shapeB) + ", ShapeOutput" +
runtime::intelgpu::array_dims(shapeZ) + " is not supported.");
throw invalid_argument("IntelGPU Dot operation. Conbination input0_shape" +
runtime::intelgpu::array_dims(input0_shape) + ", input1_shape" +
runtime::intelgpu::array_dims(input1_shape) + ", output_shape" +
runtime::intelgpu::array_dims(output_shape) + " is not supported.");
}
void runtime::intelgpu::do_pad_operation(cldnn::topology& topology,
@@ -179,16 +203,15 @@ void runtime::intelgpu::do_pad_operation(cldnn::topology& topology,
vector<size_t> gws;
// The kernel name and parameters
writer << "__kernel void " << entry_point_name << "(const __global float input"
<< array_dims(input_shape) << ", const __global float scalar[1], __global float output"
<< array_dims(output_shape) << ")\n";
runtime::intelgpu::gen_func_def(
writer, entry_point_name, {2, "float"}, {input_shape, {1}}, "float", output_shape);
writer.block_begin();
{
// Loop for Broadcast scalar over full output tensor
gws = runtime::intelgpu::generate_loops(writer, output_shape, true);
writer << "output" << access_dims(output_shape) << " = scalar[0];\n";
writer << "output" << access_dims(output_shape) << " = input1[0];\n";
// Closing brackets for Broadcast loop
runtime::intelgpu::generate_loops(writer, output_shape, false);
@@ -205,7 +228,7 @@ void runtime::intelgpu::do_pad_operation(cldnn::topology& topology,
}
writer << "output" << access_dims_strided(input_shape, pad_below, pad_interior, true)
<< " = input" << access_dims(input_shape) << ";\n";
<< " = input0" << access_dims(input_shape) << ";\n";
// Closing brackets for main Copy loop
for (auto const& i : input_shape)
@@ -247,9 +270,8 @@ void runtime::intelgpu::do_max_pool_backprop_operation(cldnn::topology& topology
vector<size_t> gws;
// The kernel name and parameters
writer << "__kernel void " << entry_point_name << "(const __global float input"
<< array_dims(input_shape) << ", const __global float delta" << array_dims(delta_shape)
<< ", __global float output" << array_dims(output_shape) << ")\n";
runtime::intelgpu::gen_func_def(
writer, entry_point_name, {2, "float"}, {input_shape, delta_shape}, "float", output_shape);
writer.block_begin();
{
@@ -335,7 +357,7 @@ void runtime::intelgpu::do_max_pool_backprop_operation(cldnn::topology& topology
writer << ")\n";
writer.block_begin();
{
writer << "const float max_local = input[i0][i1]";
writer << "const float max_local = input0[i0][i1]";
// additional dimensions for input
for (size_t i = 0; i < win_shape.size(); ++i)
{
@@ -376,7 +398,7 @@ void runtime::intelgpu::do_max_pool_backprop_operation(cldnn::topology& topology
{
writer << "[save_i" << i + 2 << "]";
}
writer << " += delta" << access_dims(delta_shape) << ";\n";
writer << " += input1" << access_dims(delta_shape) << ";\n";
} // End of elem_exists condition
writer.block_end();
// Closing brackets for delta loop
@@ -405,24 +427,24 @@ void runtime::intelgpu::do_max_pool_backprop_operation(cldnn::topology& topology
}
static void do_1d_scalar_mul(codegen::CodeWriter& writer,
string& kernel_name,
const Shape& shapeA,
const Shape& shapeB)
string& entry_point_name,
const Shape& input0_shape,
const Shape& input1_shape)
{
const size_t countA = shapeA.empty() ? 0 : shape_size<Shape>(shapeA);
const size_t countB = shapeB.empty() ? 0 : shape_size<Shape>(shapeB);
const size_t countZ = max(countA, countB);
kernel_name += "_do_1d_scalar_mul";
const size_t input0_count = input0_shape.empty() ? 0 : shape_size<Shape>(input0_shape);
const size_t input1_count = input1_shape.empty() ? 0 : shape_size<Shape>(input1_shape);
const size_t output_count = max(input0_count, input1_count);
entry_point_name += "_do_1d_scalar_mul";
writer << "__kernel void " << kernel_name << "(const __global float* inputA"
<< ", const __global float* inputB, __global float* output)\n";
writer << "__kernel void " << entry_point_name << "(const __global float* input0"
<< ", const __global float* input1, __global float* output)\n";
writer.block_begin();
{
writer << "for (uint i1 = 0; i1 < " << countZ << "; ++i1)\n";
writer << "for (uint i1 = 0; i1 < " << output_count << "; ++i1)\n";
writer.block_begin();
{
writer << "output[i1] = inputA[" << (countA > 0 ? "i1" : "0") << "] * inputB["
<< (countB > 0 ? "i1" : "0") << "];\n";
writer << "output[i1] = input0[" << (input0_count > 0 ? "i1" : "0") << "] * input1["
<< (input1_count > 0 ? "i1" : "0") << "];\n";
}
writer.block_end();
}
@@ -430,37 +452,38 @@ static void do_1d_scalar_mul(codegen::CodeWriter& writer,
}
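For concreteness, with hypothetical shapes input0_shape = {4} and input1_shape = {} (a scalar), input0_count = 4, input1_count = 0 and output_count = 4, so the writer above produces roughly (kernel-name prefix invented; in practice it is the caller's entry point name):
__kernel void dot_out_do_1d_scalar_mul(const __global float* input0,
                                       const __global float* input1,
                                       __global float* output)
{
    for (uint i1 = 0; i1 < 4; ++i1)
    {
        output[i1] = input0[i1] * input1[0]; /* scalar operand pinned to index 0 */
    }
}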
static vector<size_t> do_2d_2d_mul(codegen::CodeWriter& writer,
string& kernel_name,
const Shape& shapeA,
const Shape& shapeB,
const Shape& shapeZ)
string& entry_point_name,
const Shape& input0_shape,
const Shape& input1_shape,
const Shape& output_shape)
{
const size_t colrow = shapeA.at(1);
kernel_name += "_do_2d_2d_mul";
entry_point_name += "_do_2d_2d_mul";
vector<size_t> gws;
writer << "__kernel void " << kernel_name << "(const __global float inputA"
<< runtime::intelgpu::array_dims(shapeA) << ", const __global float inputB"
<< runtime::intelgpu::array_dims(shapeB) << ", __global float output"
<< runtime::intelgpu::array_dims(shapeZ) << ")\n";
runtime::intelgpu::gen_func_def(writer,
entry_point_name,
{2, "float"},
{input0_shape, input1_shape},
"float",
output_shape);
writer.block_begin();
{
size_t var_idx = 0;
// Main loops
gws = runtime::intelgpu::generate_loops(writer, shapeZ, true);
gws = runtime::intelgpu::generate_loops(writer, output_shape, true);
// Inner loop
writer << "float sum = 0.0f;\n";
writer << "for (uint i2 = 0; i2 < " << colrow << "; ++i2)\n";
writer << "for (uint i2 = 0; i2 < " << input0_shape.at(1) << "; ++i2)\n";
writer.block_begin();
{
writer << "sum += inputA[i0][i2] * inputB[i2][i1];\n";
writer << "sum += input0[i0][i2] * input1[i2][i1];\n";
}
writer.block_end();
writer << "output[i0][i1] = sum;\n";
// Closing brackets for main loops
runtime::intelgpu::generate_loops(writer, shapeZ, false);
runtime::intelgpu::generate_loops(writer, output_shape, false);
}
writer.block_end();
@@ -468,37 +491,38 @@ static vector<size_t> do_2d_2d_mul(codegen::CodeWriter& writer,
}
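A sketch of the generated kernel for hypothetical shapes {2, 3} x {3, 4} -> {2, 4}. The expansion of generate_loops is an assumption (its body is not shown in this diff), based on the work-item pattern implied by the {1} -> gws changes later in this commit:
__kernel void dot_out_do_2d_2d_mul(const __global float input0[2][3],
                                   const __global float input1[3][4],
                                   __global float output[2][4])
{
    const unsigned i0 = get_global_id(0); /* assumed generate_loops output; gws = {2, 4} */
    const unsigned i1 = get_global_id(1);
    float sum = 0.0f;
    for (uint i2 = 0; i2 < 3; ++i2)       /* reduction over input0_shape.at(1) */
    {
        sum += input0[i0][i2] * input1[i2][i1];
    }
    output[i0][i1] = sum;
}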
static vector<size_t> do_3d_3d_mul(codegen::CodeWriter& writer,
string& kernel_name,
const Shape& shapeA,
const Shape& shapeB,
const Shape& shapeZ)
string& entry_point_name,
const Shape& input0_shape,
const Shape& input1_shape,
const Shape& output_shape)
{
const size_t colrow = shapeA.back();
kernel_name += "_do_3d_3d_mul";
entry_point_name += "_do_3d_3d_mul";
vector<size_t> gws;
writer << "__kernel void " << kernel_name << "(const __global float inputA"
<< runtime::intelgpu::array_dims(shapeA) << ", const __global float inputB"
<< runtime::intelgpu::array_dims(shapeB) << ", __global float output"
<< runtime::intelgpu::array_dims(shapeZ) << ")\n";
runtime::intelgpu::gen_func_def(writer,
entry_point_name,
{2, "float"},
{input0_shape, input1_shape},
"float",
output_shape);
writer.block_begin();
{
size_t var_idx = 0;
// Main loops
gws = runtime::intelgpu::generate_loops(writer, shapeZ, true);
gws = runtime::intelgpu::generate_loops(writer, output_shape, true);
// Inner loop
writer << "float sum = 0.0f;\n";
writer << "for (uint i4 = 0; i4 < " << colrow << "; ++i4)\n";
writer << "for (uint i4 = 0; i4 < " << input0_shape.back() << "; ++i4)\n";
writer.block_begin();
{
writer << "sum += inputA[i0][i1][i4] * inputB[i4][i2][i3];\n";
writer << "sum += input0[i0][i1][i4] * input1[i4][i2][i3];\n";
}
writer.block_end();
writer << "output[i0][i1][i2][i3] = sum;\n";
// Closing brackets for main loops
runtime::intelgpu::generate_loops(writer, shapeZ, false);
runtime::intelgpu::generate_loops(writer, output_shape, false);
}
writer.block_end();
@@ -506,37 +530,38 @@ static vector<size_t> do_3d_3d_mul(codegen::CodeWriter& writer,
}
static vector<size_t> do_3d_2d_mul(codegen::CodeWriter& writer,
string& kernel_name,
const Shape& shapeA,
const Shape& shapeB,
const Shape& shapeZ)
string& entry_point_name,
const Shape& input0_shape,
const Shape& input1_shape,
const Shape& output_shape)
{
const size_t colrow = shapeA.back();
kernel_name += "_do_3d_2d_mul";
entry_point_name += "_do_3d_2d_mul";
vector<size_t> gws;
writer << "__kernel void " << kernel_name << "(const __global float inputA"
<< runtime::intelgpu::array_dims(shapeA) << ", const __global float inputB"
<< runtime::intelgpu::array_dims(shapeB) << ", __global float output"
<< runtime::intelgpu::array_dims(shapeZ) << ")\n";
runtime::intelgpu::gen_func_def(writer,
entry_point_name,
{2, "float"},
{input0_shape, input1_shape},
"float",
output_shape);
writer.block_begin();
{
size_t var_idx = 0;
// Main loops
gws = runtime::intelgpu::generate_loops(writer, shapeZ, true);
gws = runtime::intelgpu::generate_loops(writer, output_shape, true);
// Inner loop
writer << "float sum = 0.0f;\n";
writer << "for (uint i3 = 0; i3 < " << colrow << "; ++i3)\n";
writer << "for (uint i3 = 0; i3 < " << input0_shape.back() << "; ++i3)\n";
writer.block_begin();
{
writer << "sum += inputA[i0][i1][i3] * inputB[i3][i2];\n";
writer << "sum += input0[i0][i1][i3] * input1[i3][i2];\n";
}
writer.block_end();
writer << "output[i0][i1][i2] = sum;\n";
// Closing brackets for main loops
runtime::intelgpu::generate_loops(writer, shapeZ, false);
runtime::intelgpu::generate_loops(writer, output_shape, false);
}
writer.block_end();
@@ -544,56 +569,59 @@ static vector<size_t> do_3d_2d_mul(codegen::CodeWriter& writer,
}
static vector<size_t> do_2d_1d_mul(codegen::CodeWriter& writer,
string& kernel_name,
const Shape& shapeA,
const Shape& shapeB,
const Shape& shapeZ)
string& entry_point_name,
const Shape& input0_shape,
const Shape& input1_shape,
const Shape& output_shape)
{
const size_t colrow = shapeA.at(1);
kernel_name += "_do_2d_1d_mul";
entry_point_name += "_do_2d_1d_mul";
vector<size_t> gws;
writer << "__kernel void " << kernel_name << "(const __global float inputA"
<< runtime::intelgpu::array_dims(shapeA) << ", const __global float inputB"
<< runtime::intelgpu::array_dims(shapeB) << ", __global float output"
<< runtime::intelgpu::array_dims(shapeZ) << ")\n";
runtime::intelgpu::gen_func_def(writer,
entry_point_name,
{2, "float"},
{input0_shape, input1_shape},
"float",
output_shape);
writer.block_begin();
{
// Main loops
gws = runtime::intelgpu::generate_loops(writer, shapeZ, true);
gws = runtime::intelgpu::generate_loops(writer, output_shape, true);
writer << "float sum = 0.0f;\n";
// Inner loop
writer << "for (uint i1 = 0; i1 < " << colrow << "; ++i1)\n";
writer << "for (uint i1 = 0; i1 < " << input0_shape.at(1) << "; ++i1)\n";
writer.block_begin();
{
writer << "sum += inputA[i0][i1] * inputB[i1];\n";
writer << "sum += input0[i0][i1] * input1[i1];\n";
}
writer.block_end();
writer << "output[i0] = sum;\n";
// Closing brackets for main loops
runtime::intelgpu::generate_loops(writer, shapeZ, false);
runtime::intelgpu::generate_loops(writer, output_shape, false);
}
writer.block_end();
return gws;
}
static void do_scalar_scalar_mul(codegen::CodeWriter& writer, string& kernel_name)
static void do_scalar_scalar_mul(codegen::CodeWriter& writer, string& entry_point_name)
{
kernel_name += "_scalar_scalar_mul";
entry_point_name += "_scalar_scalar_mul";
runtime::intelgpu::gen_func_def(
writer, entry_point_name, {2, "float"}, {{1}, {1}}, "float", {1});
writer << "__kernel void " << kernel_name << "(const __global float inputA[1]"
<< ", const __global float inputB[1], __global float output[1])\n";
writer.block_begin();
{
writer << "output[0] = inputA[0] * inputB[0];\n";
writer << "output[0] = input0[0] * input1[0];\n";
}
writer.block_end();
}
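This kernel is small enough to show in full: with gen_func_def receiving input shapes {1}, {1} and output shape {1}, the generated source is (entry point name invented):
__kernel void dot_out_scalar_scalar_mul(const __global float input0[1],
                                        const __global float input1[1],
                                        __global float output[1])
{
    output[0] = input0[0] * input1[0];
}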
static void do_1d_1d_mul(codegen::CodeWriter& writer, string& kernel_name, const Shape& shape)
static void do_1d_1d_mul(codegen::CodeWriter& writer, string& entry_point_name, const Shape& shape)
{
if (shape.size() > 1)
{
@@ -601,19 +629,18 @@ static void do_1d_1d_mul(codegen::CodeWriter& writer, string& kernel_name, const
" must be 1D");
}
const size_t& size = shape.front();
kernel_name += "_do_1d_1d_mul";
entry_point_name += "_do_1d_1d_mul";
runtime::intelgpu::gen_func_def(
writer, entry_point_name, {2, "float"}, {2, shape}, "float", {1});
writer << "__kernel void " << kernel_name << "(const __global float inputA"
<< runtime::intelgpu::array_dims(shape) << ", const __global float inputB"
<< runtime::intelgpu::array_dims(shape) << ", __global float output[1])\n";
writer.block_begin();
{
writer << "float sum = 0.0f;\n"
<< "for (uint i = 0; i < " << size << "; ++i)\n";
<< "for (uint i = 0; i < " << shape.front() << "; ++i)\n";
writer.block_begin();
{
writer << "sum += inputA[i] * inputB[i];\n";
writer << "sum += input0[i] * input1[i];\n";
}
writer.block_end();
writer << "output[0] = sum;\n";
@@ -622,10 +649,10 @@ static void do_1d_1d_mul(codegen::CodeWriter& writer, string& kernel_name, const
}
void runtime::intelgpu::do_dot_operation(cldnn::topology& topology,
const string& inputA_name,
const Shape& inputA_shape,
const string& inputB_name,
const Shape& inputB_shape,
const string& input0_name,
const Shape& input0_shape,
const string& input1_name,
const Shape& input1_shape,
const string& output_name,
const Shape& output_shape,
const element::Type& output_type)
@@ -635,52 +662,53 @@ void runtime::intelgpu::do_dot_operation(cldnn::topology& topology,
codegen::CodeWriter writer;
vector<size_t> gws = {1};
const bool A_is_scalar = inputA_shape.empty();
const bool B_is_scalar = inputB_shape.empty();
const bool Z_is_scalar = output_shape.empty();
const bool is_input0_scalar = input0_shape.empty();
const bool is_input1_scalar = input1_shape.empty();
const bool is_output_scalar = output_shape.empty();
if (A_is_scalar && B_is_scalar && Z_is_scalar)
if (is_input0_scalar && is_input1_scalar && is_output_scalar)
{
do_scalar_scalar_mul(writer, entry_point_name);
}
else if (((A_is_scalar && !B_is_scalar) || (!A_is_scalar && B_is_scalar)) && !Z_is_scalar)
else if (((is_input0_scalar && !is_input1_scalar) || (!is_input0_scalar && is_input1_scalar)) &&
!is_output_scalar)
{
do_1d_scalar_mul(writer, entry_point_name, inputA_shape, inputB_shape);
do_1d_scalar_mul(writer, entry_point_name, input0_shape, input1_shape);
}
else if (!A_is_scalar && !B_is_scalar && Z_is_scalar)
else if (!is_input0_scalar && !is_input1_scalar && is_output_scalar)
{
do_1d_1d_mul(writer, entry_point_name, inputB_shape);
do_1d_1d_mul(writer, entry_point_name, input1_shape);
}
else if (!A_is_scalar && !B_is_scalar && !Z_is_scalar)
else if (!is_input0_scalar && !is_input1_scalar && !is_output_scalar)
{
if (inputA_shape.size() == 2 && inputB_shape.size() == 1)
if (input0_shape.size() == 2 && input1_shape.size() == 1)
{
gws = do_2d_1d_mul(writer, entry_point_name, inputA_shape, inputB_shape, output_shape);
gws = do_2d_1d_mul(writer, entry_point_name, input0_shape, input1_shape, output_shape);
}
else if (inputA_shape.size() == 2 && inputB_shape.size() == 2)
else if (input0_shape.size() == 2 && input1_shape.size() == 2)
{
gws = do_2d_2d_mul(writer, entry_point_name, inputA_shape, inputB_shape, output_shape);
gws = do_2d_2d_mul(writer, entry_point_name, input0_shape, input1_shape, output_shape);
}
else if (inputA_shape.size() == 3 && inputB_shape.size() == 3)
else if (input0_shape.size() == 3 && input1_shape.size() == 3)
{
gws = do_3d_3d_mul(writer, entry_point_name, inputA_shape, inputB_shape, output_shape);
gws = do_3d_3d_mul(writer, entry_point_name, input0_shape, input1_shape, output_shape);
}
else if (inputA_shape.size() == 3 && inputB_shape.size() == 2)
else if (input0_shape.size() == 3 && input1_shape.size() == 2)
{
gws = do_3d_2d_mul(writer, entry_point_name, inputA_shape, inputB_shape, output_shape);
gws = do_3d_2d_mul(writer, entry_point_name, input0_shape, input1_shape, output_shape);
}
else
{
do_dot_operation_error(inputA_shape, inputB_shape, output_shape);
do_dot_operation_error(input0_shape, input1_shape, output_shape);
}
}
else
{
do_dot_operation_error(inputA_shape, inputB_shape, output_shape);
do_dot_operation_error(input0_shape, input1_shape, output_shape);
}
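For readers unfamiliar with clDNN, the argument roles in the primitive constructed next are a reading of the call sites in this file, not a quote of the clDNN header:
// output_name                - id of the primitive being defined
// {input0_name, input1_name} - ids of the input primitives
// {writer.get_code()}        - the generated OpenCL source
// entry_point_name           - kernel function to invoke
// get_kernel_args(2, 1)      - argument descriptors for 2 inputs, 1 output
// followed, as in the other primitives in this diff, by build options (""),
// the output cldnn::layout, and the global work size (gws).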
const cldnn::custom_gpu_primitive op_dot(output_name,
{inputA_name, inputB_name},
{input0_name, input1_name},
{writer.get_code()},
entry_point_name,
get_kernel_args(2, 1),
@@ -703,30 +731,21 @@ void runtime::intelgpu::do_slice_operation(cldnn::topology& topology,
const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
const string entry_point_name = "slice_" + output_name;
codegen::CodeWriter writer;
vector<size_t> gws;
runtime::intelgpu::gen_func_def(
writer, entry_point_name, {"float"}, {input_shape}, "float", output_shape);
writer << "__kernel void " << entry_point_name << "(const __global float input"
<< array_dims(input_shape) << ", __global float output" << array_dims(output_shape)
<< ")\n";
writer.block_begin();
{
size_t var_idx = 0;
// Main loops
for (auto const& i : output_shape)
{
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
<< var_idx << ")\n";
writer.block_begin();
++var_idx;
}
gws = runtime::intelgpu::generate_loops(writer, output_shape, true);
writer << "output" << access_dims(output_shape) << " = input"
writer << "output" << access_dims(output_shape) << " = input0"
<< access_dims_strided(input_shape, lower_bounds, strides, false) << ";\n";
// Closing brackets for main loops
for (auto const& i : output_shape)
{
writer.block_end();
}
runtime::intelgpu::generate_loops(writer, output_shape, false);
}
writer.block_end();
@@ -737,7 +756,7 @@ void runtime::intelgpu::do_slice_operation(cldnn::topology& topology,
get_kernel_args(1, 1),
"",
layout,
{1});
gws);
topology.add(op_slice);
}
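Note the work-size change here and in the primitives below: the serial loops and the trivial {1} give way to generate_loops and the gws it returns, so each output element can get its own work item instead of one work item doing everything. The helper's body is not shown in this diff; a plausible expansion for an output shape {4, 6} (an assumption, not the committed code) is:
/* assumed expansion of generate_loops(writer, {4, 6}, true): */
const unsigned i0 = get_global_id(0); /* returned gws[0] = 4 */
const unsigned i1 = get_global_id(1); /* returned gws[1] = 6 */
/* is_begin == false would emit any matching closing braces */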
@@ -755,45 +774,26 @@ void runtime::intelgpu::do_select_operation(cldnn::topology& topology,
const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
const string entry_point_name = "select_" + output_name;
codegen::CodeWriter writer;
vector<size_t> gws;
writer << "__kernel void " << entry_point_name << "(const __global char input0"
<< array_dims(input0_shape) << ", const __global float input1"
<< array_dims(input1_shape) << ", const __global float input2"
<< array_dims(input2_shape) << ", __global float output" << array_dims(output_shape)
<< ")\n";
runtime::intelgpu::gen_func_def(writer,
entry_point_name,
{"char", "float", "float"},
{input0_shape, input1_shape, input2_shape},
"float",
output_shape);
writer.block_begin();
{
size_t var_idx = 0;
// Main loops
for (auto const& i : output_shape)
{
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
<< var_idx << ")\n";
writer.block_begin();
++var_idx;
}
gws = runtime::intelgpu::generate_loops(writer, output_shape, true);
writer << "if (input0" << access_dims(input0_shape) << " != 0)\n";
writer.block_begin();
{
writer << "output" << access_dims(output_shape) << " = input1"
<< access_dims(input1_shape) << ";\n";
}
writer.block_end();
writer << "else\n";
writer.block_begin();
{
writer << "output" << access_dims(output_shape) << " = input2"
<< access_dims(input2_shape) << ";\n";
}
writer.block_end();
writer << "output" << access_dims(output_shape) << " = input0" << access_dims(input0_shape)
<< " ? input1" << access_dims(input1_shape) << " : input2"
<< access_dims(input2_shape) << ";\n";
// Closing brackets for main loops
for (auto const& i : output_shape)
{
writer.block_end();
}
runtime::intelgpu::generate_loops(writer, output_shape, false);
}
writer.block_end();
@@ -804,17 +804,17 @@ void runtime::intelgpu::do_select_operation(cldnn::topology& topology,
get_kernel_args(3, 1),
"",
layout,
{1});
gws);
topology.add(op_select);
}
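With hypothetical 1-D shapes {4} for all four operands, the rewritten select body emits one branchless line per element:
/* inside the generate_loops iteration over output_shape {4}: */
output[i0] = input0[i0] ? input1[i0] : input2[i0];
The old code tested input0 != 0 in an if/else; the ternary keeps the same semantics, since a nonzero char is true in OpenCL C.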
void runtime::intelgpu::do_logic_kernel(cldnn::topology& topology,
const string& inputA_name,
const Shape& inputA_shape,
const string& inputA_type,
const string& inputB_name,
const Shape& inputB_shape,
const string& inputB_type,
const string& input0_name,
const Shape& input0_shape,
const string& input0_type,
const string& input1_name,
const Shape& input1_shape,
const string& input1_type,
const string& output_name,
const Shape& output_shape,
const element::Type& output_type,
@@ -823,55 +823,36 @@ void runtime::intelgpu::do_logic_kernel(cldnn::topology& topology,
const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
const string entry_point_name = "logic_" + output_name;
codegen::CodeWriter writer;
vector<size_t> gws;
writer << "__kernel void " << entry_point_name << "(const __global " << inputA_type << " inputA"
<< array_dims(inputA_shape) << ", const __global " << inputB_type << " inputB"
<< array_dims(inputB_shape) << ", __global char output" << array_dims(output_shape)
<< ")\n";
runtime::intelgpu::gen_func_def(writer,
entry_point_name,
{2, input0_type},
{input0_shape, input1_shape},
"char",
output_shape);
writer.block_begin();
{
size_t var_idx = 0;
// Main loops
for (auto const& i : output_shape)
{
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
<< var_idx << ")\n";
writer.block_begin();
++var_idx;
}
writer << "if (inputA" << access_dims(inputA_shape) << operation << "inputB"
<< access_dims(inputB_shape) << ")\n";
gws = runtime::intelgpu::generate_loops(writer, output_shape, true);
writer.block_begin();
{
writer << "output" << access_dims(output_shape) << " = 1;\n";
}
writer.block_end();
writer << "else\n";
writer.block_begin();
{
writer << "output" << access_dims(output_shape) << " = 0;\n";
}
writer.block_end();
writer << "output" << access_dims(output_shape) << " = input0" << access_dims(input0_shape)
<< operation << "input1" << access_dims(input1_shape) << " ? 1 : 0;\n";
// Closing brackets for main loops
for (auto const& i : output_shape)
{
writer.block_end();
}
runtime::intelgpu::generate_loops(writer, output_shape, false);
}
writer.block_end();
const cldnn::custom_gpu_primitive op_logical(output_name,
{inputA_name, inputB_name},
{input0_name, input1_name},
{writer.get_code()},
entry_point_name,
get_kernel_args(2, 1),
"",
layout,
{1});
gws);
topology.add(op_logical);
}
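Likewise, for a hypothetical operation of " > " and 1-D shapes {4}, the logic kernel body collapses to a single line writing the char output:
/* inside the generate_loops iteration over output_shape {4}: */
output[i0] = input0[i0] > input1[i0] ? 1 : 0;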
@@ -888,15 +869,14 @@ void runtime::intelgpu::do_reverse_operation(cldnn::topology& topology,
codegen::CodeWriter writer;
vector<size_t> gws;
writer << "__kernel void " << entry_point_name << "(const __global float input"
<< array_dims(input_shape) << ", __global float output" << array_dims(output_shape)
<< ")\n";
runtime::intelgpu::gen_func_def(
writer, entry_point_name, {"float"}, {input_shape}, "float", output_shape);
writer.block_begin();
{
gws = generate_loops(writer, output_shape, true);
writer << "output" << access_dims(output_shape) << " = input"
writer << "output" << access_dims(output_shape) << " = input0"
<< access_dims(output_shape, reversed_axes, true) << ";\n";
generate_loops(writer, output_shape, false);
@@ -924,29 +904,19 @@ void runtime::intelgpu::do_not_operation(cldnn::topology& topology,
const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
const string entry_point_name = "logic_" + output_name;
codegen::CodeWriter writer;
vector<size_t> gws;
writer << "__kernel void " << entry_point_name << "(const __global char input"
<< array_dims(input_shape) << ", __global char output" << array_dims(output_shape)
<< ")\n";
runtime::intelgpu::gen_func_def(
writer, entry_point_name, {"char"}, {input_shape}, "char", output_shape);
writer.block_begin();
{
size_t var_idx = 0;
for (auto const& i : output_shape)
{
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
<< var_idx << ")\n";
writer.block_begin();
++var_idx;
}
gws = runtime::intelgpu::generate_loops(writer, output_shape, true);
writer << "output" << access_dims(output_shape) << " = !input" << access_dims(input_shape)
writer << "output" << access_dims(output_shape) << " = !input0" << access_dims(input_shape)
<< ";\n";
for (auto const& i : output_shape)
{
writer.block_end();
}
runtime::intelgpu::generate_loops(writer, output_shape, false);
}
writer.block_end();
@@ -957,7 +927,7 @@ void runtime::intelgpu::do_not_operation(cldnn::topology& topology,
get_kernel_args(1, 1),
"",
layout,
{1});
gws);
topology.add(op_not);
}
@@ -973,25 +943,21 @@ void runtime::intelgpu::do_one_hot_operation(cldnn::topology& topology,
const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
const string entry_point_name = "one_hot_" + output_name;
codegen::CodeWriter writer;
vector<size_t> gws;
writer << "__kernel void " << entry_point_name << "(const __global "
<< input_type.c_type_string() << " input" << array_dims(input_shape) << ", __global "
<< output_type.c_type_string() << " output" << array_dims(output_shape) << ")\n";
runtime::intelgpu::gen_func_def(writer,
entry_point_name,
{input_type.c_type_string()},
{input_shape},
output_type.c_type_string(),
output_shape);
writer.block_begin();
{
size_t var_idx = 0;
writer << "for (uint i = 0; i < " << output_shape.at(one_hot_axis) << "; ++i)\n";
writer.block_begin();
{
for (auto const& i : input_shape)
{
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
<< var_idx << ")\n";
writer.block_begin();
++var_idx;
}
gws = runtime::intelgpu::generate_loops(writer, input_shape, true);
size_t current_input = 0;
string buffer;
@@ -1009,13 +975,10 @@ void runtime::intelgpu::do_one_hot_operation(cldnn::topology& topology,
}
}
writer << "output" << buffer << " = input" << access_dims(input_shape)
writer << "output" << buffer << " = input0" << access_dims(input_shape)
<< " == i ? 1 : 0;\n";
for (auto const& i : input_shape)
{
writer.block_end();
}
runtime::intelgpu::generate_loops(writer, input_shape, false);
}
writer.block_end();
}
@@ -1028,7 +991,7 @@ void runtime::intelgpu::do_one_hot_operation(cldnn::topology& topology,
get_kernel_args(1, 1),
"",
layout,
{1});
gws);
topology.add(op_one_hot);
}
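The construction of buffer is elided above; the indexing shown here is an assumption consistent with one-hot semantics (the output index i sits at one_hot_axis among the input indices). For a hypothetical input_shape {3}, one_hot_axis 0 and output_shape {10, 3}, the generated body would look roughly like:
for (uint i = 0; i < 10; ++i)             /* output_shape.at(one_hot_axis) */
{
    /* i0 over input_shape {3} comes from generate_loops (assumed form) */
    output[i][i0] = input0[i0] == i ? 1 : 0;
}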
@@ -1047,16 +1010,15 @@ void runtime::intelgpu::do_convert_operation(cldnn::topology& topology,
codegen::CodeWriter writer;
vector<size_t> gws;
writer << "__kernel void " << entry_point_name << "(const __global " << input_type_name
<< " input" << array_dims(input_shape) << ", __global " << output_type_name << " output"
<< array_dims(output_shape) << ")\n";
runtime::intelgpu::gen_func_def(
writer, entry_point_name, {input_type_name}, {input_shape}, output_type_name, output_shape);
writer.block_begin();
{
gws = generate_loops(writer, output_shape, true);
writer << "output" << access_dims(output_shape) << " = convert_" << output_type_name
<< "(input" << access_dims(output_shape) << ");\n";
<< "(input0" << access_dims(output_shape) << ");\n";
generate_loops(writer, output_shape, false);
}
......
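convert_<type> here is OpenCL C's built-in explicit-conversion family. For a hypothetical float-to-int conversion over a {4} output shape, the generated loop body is simply:
/* inside the generate_loops iteration over output_shape {4}: */
output[i0] = convert_int(input0[i0]);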
@@ -136,6 +136,12 @@ namespace ngraph
bool is_reversed = false);
std::vector<size_t>
generate_loops(codegen::CodeWriter& writer, const Shape& shape, bool is_begin);
void gen_func_def(codegen::CodeWriter& writer,
const std::string& entry_point_name,
const std::vector<std::string>& input_types,
const std::vector<Shape>& input_shapes,
const std::string& output_type,
const Shape& output_shape);
}
}
}
@@ -21,12 +21,9 @@ backwards_exp
backwards_floor
backwards_maxpool_n2_c1_hw5_3x3_str2_max
backwards_maxpool_n4_c1_hw4_2x2_max
backwards_minimum
backwards_replace_slice
backwards_reverse_sequence_n3_c2_h3
backwards_reverse_sequence_n4d2c3h2w2
backwards_select
backwards_select_nested
backwards_sigmoid
backwards_sign
backwards_slice
......