Commit f9368d45 authored by Anna Alberska, committed by Robert Kimball

IntelGPU backend: Code refactoring and optimization (#1434)

* IntelGPU backend: Code refactoring and optimization

* Update gen_func_def()
parent d473eda9
@@ -92,6 +92,28 @@ string
     return buffer;
 }
 
+void runtime::intelgpu::gen_func_def(codegen::CodeWriter& writer,
+                                     const string& entry_point_name,
+                                     const vector<string>& input_types,
+                                     const vector<Shape>& input_shapes,
+                                     const string& output_type,
+                                     const Shape& output_shape)
+{
+    writer << "__kernel void " << entry_point_name << "(";
+
+    const size_t inputs_number = input_types.size();
+    for (uint i = 0; i < inputs_number; ++i)
+    {
+        if (i > 0)
+        {
+            writer << ", ";
+        }
+        writer << "const __global " << input_types.at(i) << " input" << i
+               << array_dims(input_shapes.at(i));
+    }
+    writer << ", __global " << output_type << " output" << array_dims(output_shape) << ")\n";
+}
+
 vector<size_t> runtime::intelgpu::generate_loops(codegen::CodeWriter& writer,
                                                  const Shape& shape,
                                                  bool is_begin)
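For reference, gen_func_def() emits the whole kernel signature on a single line. A minimal usage sketch follows (kernel name, types, and shapes are invented for illustration; array_dims() is assumed to render a shape as bracketed extents such as [4][8], consistent with the hand-written signatures it replaces in the hunks below):

    codegen::CodeWriter writer;
    // Two float inputs of shapes {4, 8} and {8}, one float output of shape {4}.
    runtime::intelgpu::gen_func_def(
        writer, "example_kernel", {2, "float"}, {{4, 8}, {8}}, "float", {4});
    // writer now holds (wrapped here for readability):
    //   __kernel void example_kernel(const __global float input0[4][8],
    //                                const __global float input1[8],
    //                                __global float output[4])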
@@ -156,12 +178,14 @@ static string access_dims_strided(const Shape& dimentions,
     return buffer;
 }
 
-static void do_dot_operation_error(const Shape& shapeA, const Shape& shapeB, const Shape& shapeZ)
+static void do_dot_operation_error(const Shape& input0_shape,
+                                   const Shape& input1_shape,
+                                   const Shape& output_shape)
 {
-    throw invalid_argument("IntelGPU Dot operation. Conbination ShapeA" +
-                           runtime::intelgpu::array_dims(shapeA) + ", ShapeB" +
-                           runtime::intelgpu::array_dims(shapeB) + ", ShapeOutput" +
-                           runtime::intelgpu::array_dims(shapeZ) + " is not supported.");
+    throw invalid_argument("IntelGPU Dot operation. Combination input0_shape" +
+                           runtime::intelgpu::array_dims(input0_shape) + ", input1_shape" +
+                           runtime::intelgpu::array_dims(input1_shape) + ", output_shape" +
+                           runtime::intelgpu::array_dims(output_shape) + " is not supported.");
 }
 
 void runtime::intelgpu::do_pad_operation(cldnn::topology& topology,
@@ -179,16 +203,15 @@ void runtime::intelgpu::do_pad_operation(cldnn::topology& topology,
     vector<size_t> gws;
 
     // The kernel name and parameters
-    writer << "__kernel void " << entry_point_name << "(const __global float input"
-           << array_dims(input_shape) << ", const __global float scalar[1], __global float output"
-           << array_dims(output_shape) << ")\n";
+    runtime::intelgpu::gen_func_def(
+        writer, entry_point_name, {2, "float"}, {input_shape, {1}}, "float", output_shape);
 
     writer.block_begin();
     {
         // Loop for Broadcast scalar over full output tensor
         gws = runtime::intelgpu::generate_loops(writer, output_shape, true);
 
-        writer << "output" << access_dims(output_shape) << " = scalar[0];\n";
+        writer << "output" << access_dims(output_shape) << " = input1[0];\n";
 
         // Closing brackets for Broadcast loop
         runtime::intelgpu::generate_loops(writer, output_shape, false);
@@ -205,7 +228,7 @@ void runtime::intelgpu::do_pad_operation(cldnn::topology& topology,
         }
 
         writer << "output" << access_dims_strided(input_shape, pad_below, pad_interior, true)
-               << " = input" << access_dims(input_shape) << ";\n";
+               << " = input0" << access_dims(input_shape) << ";\n";
 
         // Closing brackets for main Copy loop
         for (auto const& i : input_shape)
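A side note on the {2, "float"} argument used in these call sites: it is not a two-element initializer list. Because 2 is not convertible to std::string, the initializer-list constructor of vector<string> is not viable, so overload resolution falls back to the count-plus-value constructor, producing two copies of "float". A self-contained sketch:

    #include <string>
    #include <vector>

    int main()
    {
        // Equivalent to std::vector<std::string>(2, "float").
        std::vector<std::string> input_types{2, "float"};
        return (input_types.size() == 2 && input_types[1] == "float") ? 0 : 1;
    }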
@@ -247,9 +270,8 @@ void runtime::intelgpu::do_max_pool_backprop_operation(cldnn::topology& topology
     vector<size_t> gws;
 
     // The kernel name and parameters
-    writer << "__kernel void " << entry_point_name << "(const __global float input"
-           << array_dims(input_shape) << ", const __global float delta" << array_dims(delta_shape)
-           << ", __global float output" << array_dims(output_shape) << ")\n";
+    runtime::intelgpu::gen_func_def(
+        writer, entry_point_name, {2, "float"}, {input_shape, delta_shape}, "float", output_shape);
 
     writer.block_begin();
     {
@@ -335,7 +357,7 @@ void runtime::intelgpu::do_max_pool_backprop_operation(cldnn::topology& topology
                 writer << ")\n";
                 writer.block_begin();
                 {
-                    writer << "const float max_local = input[i0][i1]";
+                    writer << "const float max_local = input0[i0][i1]";
 
                     // additional dimensions for input
                     for (size_t i = 0; i < win_shape.size(); ++i)
                     {
@@ -376,7 +398,7 @@ void runtime::intelgpu::do_max_pool_backprop_operation(cldnn::topology& topology
                    {
                        writer << "[save_i" << i + 2 << "]";
                    }
-                    writer << " += delta" << access_dims(delta_shape) << ";\n";
+                    writer << " += input1" << access_dims(delta_shape) << ";\n";
                } // End of elem_exists condition
                writer.block_end();
                // Closing brackets for delta loop
@@ -405,24 +427,24 @@ void runtime::intelgpu::do_max_pool_backprop_operation(cldnn::topology& topology
 }
 
 static void do_1d_scalar_mul(codegen::CodeWriter& writer,
-                             string& kernel_name,
-                             const Shape& shapeA,
-                             const Shape& shapeB)
+                             string& entry_point_name,
+                             const Shape& input0_shape,
+                             const Shape& input1_shape)
 {
-    const size_t countA = shapeA.empty() ? 0 : shape_size<Shape>(shapeA);
-    const size_t countB = shapeB.empty() ? 0 : shape_size<Shape>(shapeB);
-    const size_t countZ = max(countA, countB);
-    kernel_name += "_do_1d_scalar_mul";
+    const size_t input0_count = input0_shape.empty() ? 0 : shape_size<Shape>(input0_shape);
+    const size_t input1_count = input1_shape.empty() ? 0 : shape_size<Shape>(input1_shape);
+    const size_t output_count = max(input0_count, input1_count);
+    entry_point_name += "_do_1d_scalar_mul";
 
-    writer << "__kernel void " << kernel_name << "(const __global float* inputA"
-           << ", const __global float* inputB, __global float* output)\n";
+    writer << "__kernel void " << entry_point_name << "(const __global float* input0"
+           << ", const __global float* input1, __global float* output)\n";
     writer.block_begin();
     {
-        writer << "for (uint i1 = 0; i1 < " << countZ << "; ++i1)\n";
+        writer << "for (uint i1 = 0; i1 < " << output_count << "; ++i1)\n";
         writer.block_begin();
         {
-            writer << "output[i1] = inputA[" << (countA > 0 ? "i1" : "0") << "] * inputB["
-                   << (countB > 0 ? "i1" : "0") << "];\n";
+            writer << "output[i1] = input0[" << (input0_count > 0 ? "i1" : "0") << "] * input1["
+                   << (input1_count > 0 ? "i1" : "0") << "];\n";
         }
         writer.block_end();
    }
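Unlike the other dot variants, do_1d_scalar_mul() keeps its hand-written pointer-style signature rather than calling gen_func_def(). A hypothetical driver (names and shapes invented for illustration) shows what it emits for a 4-element vector times a scalar second operand:

    codegen::CodeWriter writer;
    string name = "dot_example"; // the real prefix comes from the caller
    do_1d_scalar_mul(writer, name, {4}, {});
    // writer.get_code() now holds, reformatted here for readability:
    //   __kernel void dot_example_do_1d_scalar_mul(
    //       const __global float* input0, const __global float* input1,
    //       __global float* output)
    //   {
    //       for (uint i1 = 0; i1 < 4; ++i1)
    //       {
    //           output[i1] = input0[i1] * input1[0];
    //       }
    //   }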
@@ -430,37 +452,38 @@ static void do_1d_scalar_mul(codegen::CodeWriter& writer,
 }
 
 static vector<size_t> do_2d_2d_mul(codegen::CodeWriter& writer,
-                                   string& kernel_name,
-                                   const Shape& shapeA,
-                                   const Shape& shapeB,
-                                   const Shape& shapeZ)
+                                   string& entry_point_name,
+                                   const Shape& input0_shape,
+                                   const Shape& input1_shape,
+                                   const Shape& output_shape)
 {
-    const size_t colrow = shapeA.at(1);
-    kernel_name += "_do_2d_2d_mul";
+    entry_point_name += "_do_2d_2d_mul";
     vector<size_t> gws;
 
-    writer << "__kernel void " << kernel_name << "(const __global float inputA"
-           << runtime::intelgpu::array_dims(shapeA) << ", const __global float inputB"
-           << runtime::intelgpu::array_dims(shapeB) << ", __global float output"
-           << runtime::intelgpu::array_dims(shapeZ) << ")\n";
+    runtime::intelgpu::gen_func_def(writer,
+                                    entry_point_name,
+                                    {2, "float"},
+                                    {input0_shape, input1_shape},
+                                    "float",
+                                    output_shape);
 
     writer.block_begin();
     {
-        size_t var_idx = 0;
         // Main loops
-        gws = runtime::intelgpu::generate_loops(writer, shapeZ, true);
+        gws = runtime::intelgpu::generate_loops(writer, output_shape, true);
 
         // Inner loop
         writer << "float sum = 0.0f;\n";
-        writer << "for (uint i2 = 0; i2 < " << colrow << "; ++i2)\n";
+        writer << "for (uint i2 = 0; i2 < " << input0_shape.at(1) << "; ++i2)\n";
         writer.block_begin();
         {
-            writer << "sum += inputA[i0][i2] * inputB[i2][i1];\n";
+            writer << "sum += input0[i0][i2] * input1[i2][i1];\n";
         }
         writer.block_end();
         writer << "output[i0][i1] = sum;\n";
 
         // Closing brackets for main loops
-        runtime::intelgpu::generate_loops(writer, shapeZ, false);
+        runtime::intelgpu::generate_loops(writer, output_shape, false);
     }
     writer.block_end();
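The generated 2-D x 2-D kernel is a plain row-by-column dot product, with input0_shape.at(1) as the shared (reduction) dimension. Its semantics, restated as self-contained C++ over flat buffers (illustration only; the real kernel is generated OpenCL with the same loop structure):

    #include <cstddef>

    // input0 is {M, K}, input1 is {K, N}, output is {M, N}.
    void dot_2d_2d_reference(
        const float* input0, const float* input1, float* output, size_t M, size_t K, size_t N)
    {
        for (size_t i0 = 0; i0 < M; ++i0)
        {
            for (size_t i1 = 0; i1 < N; ++i1)
            {
                float sum = 0.0f;
                for (size_t i2 = 0; i2 < K; ++i2)
                {
                    sum += input0[i0 * K + i2] * input1[i2 * N + i1];
                }
                output[i0 * N + i1] = sum;
            }
        }
    }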
@@ -468,37 +491,38 @@ static vector<size_t> do_2d_2d_mul(codegen::CodeWriter& writer,
 }
 
 static vector<size_t> do_3d_3d_mul(codegen::CodeWriter& writer,
-                                   string& kernel_name,
-                                   const Shape& shapeA,
-                                   const Shape& shapeB,
-                                   const Shape& shapeZ)
+                                   string& entry_point_name,
+                                   const Shape& input0_shape,
+                                   const Shape& input1_shape,
+                                   const Shape& output_shape)
 {
-    const size_t colrow = shapeA.back();
-    kernel_name += "_do_3d_3d_mul";
+    entry_point_name += "_do_3d_3d_mul";
     vector<size_t> gws;
 
-    writer << "__kernel void " << kernel_name << "(const __global float inputA"
-           << runtime::intelgpu::array_dims(shapeA) << ", const __global float inputB"
-           << runtime::intelgpu::array_dims(shapeB) << ", __global float output"
-           << runtime::intelgpu::array_dims(shapeZ) << ")\n";
+    runtime::intelgpu::gen_func_def(writer,
+                                    entry_point_name,
+                                    {2, "float"},
+                                    {input0_shape, input1_shape},
+                                    "float",
+                                    output_shape);
 
     writer.block_begin();
     {
-        size_t var_idx = 0;
         // Main loops
-        gws = runtime::intelgpu::generate_loops(writer, shapeZ, true);
+        gws = runtime::intelgpu::generate_loops(writer, output_shape, true);
 
         // Inner loop
         writer << "float sum = 0.0f;\n";
-        writer << "for (uint i4 = 0; i4 < " << colrow << "; ++i4)\n";
+        writer << "for (uint i4 = 0; i4 < " << input0_shape.back() << "; ++i4)\n";
         writer.block_begin();
         {
-            writer << "sum += inputA[i0][i1][i4] * inputB[i4][i2][i3];\n";
+            writer << "sum += input0[i0][i1][i4] * input1[i4][i2][i3];\n";
         }
         writer.block_end();
         writer << "output[i0][i1][i2][i3] = sum;\n";
 
         // Closing brackets for main loops
-        runtime::intelgpu::generate_loops(writer, shapeZ, false);
+        runtime::intelgpu::generate_loops(writer, output_shape, false);
     }
     writer.block_end();
@@ -506,37 +530,38 @@ static vector<size_t> do_3d_3d_mul(codegen::CodeWriter& writer,
 }
 
 static vector<size_t> do_3d_2d_mul(codegen::CodeWriter& writer,
-                                   string& kernel_name,
-                                   const Shape& shapeA,
-                                   const Shape& shapeB,
-                                   const Shape& shapeZ)
+                                   string& entry_point_name,
+                                   const Shape& input0_shape,
+                                   const Shape& input1_shape,
+                                   const Shape& output_shape)
 {
-    const size_t colrow = shapeA.back();
-    kernel_name += "_do_3d_2d_mul";
+    entry_point_name += "_do_3d_2d_mul";
     vector<size_t> gws;
 
-    writer << "__kernel void " << kernel_name << "(const __global float inputA"
-           << runtime::intelgpu::array_dims(shapeA) << ", const __global float inputB"
-           << runtime::intelgpu::array_dims(shapeB) << ", __global float output"
-           << runtime::intelgpu::array_dims(shapeZ) << ")\n";
+    runtime::intelgpu::gen_func_def(writer,
+                                    entry_point_name,
+                                    {2, "float"},
+                                    {input0_shape, input1_shape},
+                                    "float",
+                                    output_shape);
 
     writer.block_begin();
     {
-        size_t var_idx = 0;
         // Main loops
-        gws = runtime::intelgpu::generate_loops(writer, shapeZ, true);
+        gws = runtime::intelgpu::generate_loops(writer, output_shape, true);
 
         // Inner loop
         writer << "float sum = 0.0f;\n";
-        writer << "for (uint i3 = 0; i3 < " << colrow << "; ++i3)\n";
+        writer << "for (uint i3 = 0; i3 < " << input0_shape.back() << "; ++i3)\n";
         writer.block_begin();
        {
-            writer << "sum += inputA[i0][i1][i3] * inputB[i3][i2];\n";
+            writer << "sum += input0[i0][i1][i3] * input1[i3][i2];\n";
        }
        writer.block_end();
        writer << "output[i0][i1][i2] = sum;\n";

        // Closing brackets for main loops
-        runtime::intelgpu::generate_loops(writer, shapeZ, false);
+        runtime::intelgpu::generate_loops(writer, output_shape, false);
    }
    writer.block_end();
@@ -544,56 +569,59 @@ static vector<size_t> do_3d_2d_mul(codegen::CodeWriter& writer,
 }
 
 static vector<size_t> do_2d_1d_mul(codegen::CodeWriter& writer,
-                                   string& kernel_name,
-                                   const Shape& shapeA,
-                                   const Shape& shapeB,
-                                   const Shape& shapeZ)
+                                   string& entry_point_name,
+                                   const Shape& input0_shape,
+                                   const Shape& input1_shape,
+                                   const Shape& output_shape)
 {
-    const size_t colrow = shapeA.at(1);
-    kernel_name += "_do_2d_1d_mul";
+    entry_point_name += "_do_2d_1d_mul";
     vector<size_t> gws;
 
-    writer << "__kernel void " << kernel_name << "(const __global float inputA"
-           << runtime::intelgpu::array_dims(shapeA) << ", const __global float inputB"
-           << runtime::intelgpu::array_dims(shapeB) << ", __global float output"
-           << runtime::intelgpu::array_dims(shapeZ) << ")\n";
+    runtime::intelgpu::gen_func_def(writer,
+                                    entry_point_name,
+                                    {2, "float"},
+                                    {input0_shape, input1_shape},
+                                    "float",
+                                    output_shape);
 
     writer.block_begin();
     {
         // Main loops
-        gws = runtime::intelgpu::generate_loops(writer, shapeZ, true);
+        gws = runtime::intelgpu::generate_loops(writer, output_shape, true);
 
         writer << "float sum = 0.0f;\n";
         // Inner loop
-        writer << "for (uint i1 = 0; i1 < " << colrow << "; ++i1)\n";
+        writer << "for (uint i1 = 0; i1 < " << input0_shape.at(1) << "; ++i1)\n";
         writer.block_begin();
         {
-            writer << "sum += inputA[i0][i1] * inputB[i1];\n";
+            writer << "sum += input0[i0][i1] * input1[i1];\n";
         }
         writer.block_end();
         writer << "output[i0] = sum;\n";
 
         // Closing brackets for main loops
-        runtime::intelgpu::generate_loops(writer, shapeZ, false);
+        runtime::intelgpu::generate_loops(writer, output_shape, false);
     }
     writer.block_end();
 
     return gws;
 }
 
-static void do_scalar_scalar_mul(codegen::CodeWriter& writer, string& kernel_name)
+static void do_scalar_scalar_mul(codegen::CodeWriter& writer, string& entry_point_name)
 {
-    kernel_name += "_scalar_scalar_mul";
+    entry_point_name += "_scalar_scalar_mul";
 
-    writer << "__kernel void " << kernel_name << "(const __global float inputA[1]"
-           << ", const __global float inputB[1], __global float output[1])\n";
+    runtime::intelgpu::gen_func_def(
+        writer, entry_point_name, {2, "float"}, {{1}, {1}}, "float", {1});
 
     writer.block_begin();
     {
-        writer << "output[0] = inputA[0] * inputB[0];\n";
+        writer << "output[0] = input0[0] * input1[0];\n";
     }
     writer.block_end();
 }
 
-static void do_1d_1d_mul(codegen::CodeWriter& writer, string& kernel_name, const Shape& shape)
+static void do_1d_1d_mul(codegen::CodeWriter& writer, string& entry_point_name, const Shape& shape)
 {
     if (shape.size() > 1)
     {
@@ -601,19 +629,18 @@ static void do_1d_1d_mul(codegen::CodeWriter& writer, string& kernel_name, const
                                " must be 1D");
     }
 
-    const size_t& size = shape.front();
-    kernel_name += "_do_1d_1d_mul";
+    entry_point_name += "_do_1d_1d_mul";
 
-    writer << "__kernel void " << kernel_name << "(const __global float inputA"
-           << runtime::intelgpu::array_dims(shape) << ", const __global float inputB"
-           << runtime::intelgpu::array_dims(shape) << ", __global float output[1])\n";
+    runtime::intelgpu::gen_func_def(
+        writer, entry_point_name, {2, "float"}, {2, shape}, "float", {1});
 
     writer.block_begin();
     {
         writer << "float sum = 0.0f;\n"
-               << "for (uint i = 0; i < " << size << "; ++i)\n";
+               << "for (uint i = 0; i < " << shape.front() << "; ++i)\n";
         writer.block_begin();
         {
-            writer << "sum += inputA[i] * inputB[i];\n";
+            writer << "sum += input0[i] * input1[i];\n";
        }
        writer.block_end();
        writer << "output[0] = sum;\n";
@@ -622,10 +649,10 @@ static void do_1d_1d_mul(codegen::CodeWriter& writer, string& kernel_name, const
 }
 
 void runtime::intelgpu::do_dot_operation(cldnn::topology& topology,
-                                         const string& inputA_name,
-                                         const Shape& inputA_shape,
-                                         const string& inputB_name,
-                                         const Shape& inputB_shape,
+                                         const string& input0_name,
+                                         const Shape& input0_shape,
+                                         const string& input1_name,
+                                         const Shape& input1_shape,
                                          const string& output_name,
                                          const Shape& output_shape,
                                          const element::Type& output_type)
@@ -635,52 +662,53 @@ void runtime::intelgpu::do_dot_operation(cldnn::topology& topology,
     codegen::CodeWriter writer;
     vector<size_t> gws = {1};
 
-    const bool A_is_scalar = inputA_shape.empty();
-    const bool B_is_scalar = inputB_shape.empty();
-    const bool Z_is_scalar = output_shape.empty();
+    const bool is_input0_scalar = input0_shape.empty();
+    const bool is_input1_scalar = input1_shape.empty();
+    const bool is_output_scalar = output_shape.empty();
 
-    if (A_is_scalar && B_is_scalar && Z_is_scalar)
+    if (is_input0_scalar && is_input1_scalar && is_output_scalar)
     {
         do_scalar_scalar_mul(writer, entry_point_name);
     }
-    else if (((A_is_scalar && !B_is_scalar) || (!A_is_scalar && B_is_scalar)) && !Z_is_scalar)
+    else if (((is_input0_scalar && !is_input1_scalar) || (!is_input0_scalar && is_input1_scalar)) &&
+             !is_output_scalar)
     {
-        do_1d_scalar_mul(writer, entry_point_name, inputA_shape, inputB_shape);
+        do_1d_scalar_mul(writer, entry_point_name, input0_shape, input1_shape);
     }
-    else if (!A_is_scalar && !B_is_scalar && Z_is_scalar)
+    else if (!is_input0_scalar && !is_input1_scalar && is_output_scalar)
     {
-        do_1d_1d_mul(writer, entry_point_name, inputB_shape);
+        do_1d_1d_mul(writer, entry_point_name, input1_shape);
     }
-    else if (!A_is_scalar && !B_is_scalar && !Z_is_scalar)
+    else if (!is_input0_scalar && !is_input1_scalar && !is_output_scalar)
     {
-        if (inputA_shape.size() == 2 && inputB_shape.size() == 1)
+        if (input0_shape.size() == 2 && input1_shape.size() == 1)
        {
-            gws = do_2d_1d_mul(writer, entry_point_name, inputA_shape, inputB_shape, output_shape);
+            gws = do_2d_1d_mul(writer, entry_point_name, input0_shape, input1_shape, output_shape);
        }
-        else if (inputA_shape.size() == 2 && inputB_shape.size() == 2)
+        else if (input0_shape.size() == 2 && input1_shape.size() == 2)
        {
-            gws = do_2d_2d_mul(writer, entry_point_name, inputA_shape, inputB_shape, output_shape);
+            gws = do_2d_2d_mul(writer, entry_point_name, input0_shape, input1_shape, output_shape);
        }
-        else if (inputA_shape.size() == 3 && inputB_shape.size() == 3)
+        else if (input0_shape.size() == 3 && input1_shape.size() == 3)
        {
-            gws = do_3d_3d_mul(writer, entry_point_name, inputA_shape, inputB_shape, output_shape);
+            gws = do_3d_3d_mul(writer, entry_point_name, input0_shape, input1_shape, output_shape);
        }
-        else if (inputA_shape.size() == 3 && inputB_shape.size() == 2)
+        else if (input0_shape.size() == 3 && input1_shape.size() == 2)
        {
-            gws = do_3d_2d_mul(writer, entry_point_name, inputA_shape, inputB_shape, output_shape);
+            gws = do_3d_2d_mul(writer, entry_point_name, input0_shape, input1_shape, output_shape);
        }
        else
        {
-            do_dot_operation_error(inputA_shape, inputB_shape, output_shape);
+            do_dot_operation_error(input0_shape, input1_shape, output_shape);
        }
    }
    else
    {
-        do_dot_operation_error(inputA_shape, inputB_shape, output_shape);
+        do_dot_operation_error(input0_shape, input1_shape, output_shape);
    }
 
     const cldnn::custom_gpu_primitive op_dot(output_name,
-                                             {inputA_name, inputB_name},
+                                             {input0_name, input1_name},
                                              {writer.get_code()},
                                              entry_point_name,
                                              get_kernel_args(2, 1),
@@ -703,30 +731,21 @@ void runtime::intelgpu::do_slice_operation(cldnn::topology& topology,
     const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
     const string entry_point_name = "slice_" + output_name;
     codegen::CodeWriter writer;
+    vector<size_t> gws;
 
-    writer << "__kernel void " << entry_point_name << "(const __global float input"
-           << array_dims(input_shape) << ", __global float output" << array_dims(output_shape)
-           << ")\n";
+    runtime::intelgpu::gen_func_def(
+        writer, entry_point_name, {"float"}, {input_shape}, "float", output_shape);
 
     writer.block_begin();
     {
-        size_t var_idx = 0;
         // Main loops
-        for (auto const& i : output_shape)
-        {
-            writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
-                   << var_idx << ")\n";
-            writer.block_begin();
-            ++var_idx;
-        }
+        gws = runtime::intelgpu::generate_loops(writer, output_shape, true);
 
-        writer << "output" << access_dims(output_shape) << " = input"
-               << access_dims_strided(input_shape, lower_bounds, strides, false) << ";\n";
+        writer << "output" << access_dims(output_shape) << " = input0"
+               << access_dims_strided(input_shape, lower_bounds, strides, false) << ";\n";
 
         // Closing brackets for main loops
-        for (auto const& i : output_shape)
-        {
-            writer.block_end();
-        }
+        runtime::intelgpu::generate_loops(writer, output_shape, false);
     }
     writer.block_end();
@@ -737,7 +756,7 @@ void runtime::intelgpu::do_slice_operation(cldnn::topology& topology,
                                                get_kernel_args(1, 1),
                                                "",
                                                layout,
-                                               {1});
+                                               gws);
 
     topology.add(op_slice);
 }
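This hunk also shows why generate_loops() returns a vector<size_t>: the output-dimension extents double as the kernel's global work size, which now replaces the fixed {1} previously passed to cldnn::custom_gpu_primitive. The implementation of generate_loops() is not part of this diff; the following is only a plausible sketch, inferred from the hand-written loops it replaces here and in the select/logic/not hunks below. The production version presumably maps parallelized dimensions to get_global_id() rather than emitting plain loops, since otherwise every work item would traverse the whole tensor:

    vector<size_t> generate_loops_sketch(codegen::CodeWriter& writer,
                                         const Shape& shape,
                                         bool is_begin)
    {
        vector<size_t> gws;
        size_t var_idx = 0;
        for (auto const& i : shape)
        {
            if (is_begin)
            {
                writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i
                       << "; ++i" << var_idx << ")\n";
                writer.block_begin();
                gws.push_back(i); // extent becomes one global work-size dimension
            }
            else
            {
                writer.block_end();
            }
            ++var_idx;
        }
        return gws;
    }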
@@ -755,45 +774,26 @@ void runtime::intelgpu::do_select_operation(cldnn::topology& topology,
     const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
     const string entry_point_name = "select_" + output_name;
     codegen::CodeWriter writer;
+    vector<size_t> gws;
 
-    writer << "__kernel void " << entry_point_name << "(const __global char input0"
-           << array_dims(input0_shape) << ", const __global float input1"
-           << array_dims(input1_shape) << ", const __global float input2"
-           << array_dims(input2_shape) << ", __global float output" << array_dims(output_shape)
-           << ")\n";
+    runtime::intelgpu::gen_func_def(writer,
+                                    entry_point_name,
+                                    {"char", "float", "float"},
+                                    {input0_shape, input1_shape, input2_shape},
+                                    "float",
+                                    output_shape);
 
     writer.block_begin();
     {
-        size_t var_idx = 0;
         // Main loops
-        for (auto const& i : output_shape)
-        {
-            writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
-                   << var_idx << ")\n";
-            writer.block_begin();
-            ++var_idx;
-        }
+        gws = runtime::intelgpu::generate_loops(writer, output_shape, true);
 
-        writer << "if (input0" << access_dims(input0_shape) << " != 0)\n";
-        writer.block_begin();
-        {
-            writer << "output" << access_dims(output_shape) << " = input1"
-                   << access_dims(input1_shape) << ";\n";
-        }
-        writer.block_end();
-        writer << "else\n";
-        writer.block_begin();
-        {
-            writer << "output" << access_dims(output_shape) << " = input2"
-                   << access_dims(input2_shape) << ";\n";
-        }
-        writer.block_end();
+        writer << "output" << access_dims(output_shape) << " = input0" << access_dims(input0_shape)
+               << " ? input1" << access_dims(input1_shape) << " : input2"
+               << access_dims(input2_shape) << ";\n";
 
         // Closing brackets for main loops
-        for (auto const& i : output_shape)
-        {
-            writer.block_end();
-        }
+        runtime::intelgpu::generate_loops(writer, output_shape, false);
     }
     writer.block_end();
@@ -804,17 +804,17 @@ void runtime::intelgpu::do_select_operation(cldnn::topology& topology,
                                                 get_kernel_args(3, 1),
                                                 "",
                                                 layout,
-                                                {1});
+                                                gws);
 
     topology.add(op_select);
 }
 
 void runtime::intelgpu::do_logic_kernel(cldnn::topology& topology,
-                                        const string& inputA_name,
-                                        const Shape& inputA_shape,
-                                        const string& inputA_type,
-                                        const string& inputB_name,
-                                        const Shape& inputB_shape,
-                                        const string& inputB_type,
+                                        const string& input0_name,
+                                        const Shape& input0_shape,
+                                        const string& input0_type,
+                                        const string& input1_name,
+                                        const Shape& input1_shape,
+                                        const string& input1_type,
                                         const string& output_name,
                                         const Shape& output_shape,
                                         const element::Type& output_type,
@@ -823,55 +823,36 @@ void runtime::intelgpu::do_logic_kernel(cldnn::topology& topology,
     const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
     const string entry_point_name = "logic_" + output_name;
     codegen::CodeWriter writer;
+    vector<size_t> gws;
 
-    writer << "__kernel void " << entry_point_name << "(const __global " << inputA_type << " inputA"
-           << array_dims(inputA_shape) << ", const __global " << inputB_type << " inputB"
-           << array_dims(inputB_shape) << ", __global char output" << array_dims(output_shape)
-           << ")\n";
+    runtime::intelgpu::gen_func_def(writer,
+                                    entry_point_name,
+                                    {2, input0_type},
+                                    {input0_shape, input1_shape},
+                                    "char",
+                                    output_shape);
 
     writer.block_begin();
     {
-        size_t var_idx = 0;
         // Main loops
-        for (auto const& i : output_shape)
-        {
-            writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
-                   << var_idx << ")\n";
-            writer.block_begin();
-            ++var_idx;
-        }
+        gws = runtime::intelgpu::generate_loops(writer, output_shape, true);
 
-        writer << "if (inputA" << access_dims(inputA_shape) << operation << "inputB"
-               << access_dims(inputB_shape) << ")\n";
-        writer.block_begin();
-        {
-            writer << "output" << access_dims(output_shape) << " = 1;\n";
-        }
-        writer.block_end();
-        writer << "else\n";
-        writer.block_begin();
-        {
-            writer << "output" << access_dims(output_shape) << " = 0;\n";
-        }
-        writer.block_end();
+        writer << "output" << access_dims(output_shape) << " = input0" << access_dims(input0_shape)
+               << operation << "input1" << access_dims(input1_shape) << " ? 1 : 0;\n";
 
         // Closing brackets for main loops
-        for (auto const& i : output_shape)
-        {
-            writer.block_end();
-        }
+        runtime::intelgpu::generate_loops(writer, output_shape, false);
     }
     writer.block_end();
 
     const cldnn::custom_gpu_primitive op_logical(output_name,
-                                                 {inputA_name, inputB_name},
+                                                 {input0_name, input1_name},
                                                  {writer.get_code()},
                                                  entry_point_name,
                                                  get_kernel_args(2, 1),
                                                  "",
                                                  layout,
-                                                 {1});
+                                                 gws);
 
     topology.add(op_logical);
 }
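The select and logic kernels collapse a generated if/else into a single ternary assignment; behavior is unchanged because the branch only ever stored 1 or 0 (or one of two operands, for select). The equivalence, expressed directly in C++ with the same element types as the generated code (char output, float operands; the '>' comparison is chosen for illustration):

    #include <cassert>

    char old_form(float a, float b)
    {
        if (a > b) { return 1; }
        else { return 0; }
    }

    char new_form(float a, float b) { return a > b ? 1 : 0; }

    int main()
    {
        assert(old_form(2.0f, 1.0f) == new_form(2.0f, 1.0f));
        assert(old_form(1.0f, 2.0f) == new_form(1.0f, 2.0f));
        return 0;
    }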
@@ -888,15 +869,14 @@ void runtime::intelgpu::do_reverse_operation(cldnn::topology& topology,
     codegen::CodeWriter writer;
     vector<size_t> gws;
 
-    writer << "__kernel void " << entry_point_name << "(const __global float input"
-           << array_dims(input_shape) << ", __global float output" << array_dims(output_shape)
-           << ")\n";
+    runtime::intelgpu::gen_func_def(
+        writer, entry_point_name, {"float"}, {input_shape}, "float", output_shape);
 
     writer.block_begin();
     {
         gws = generate_loops(writer, output_shape, true);
 
-        writer << "output" << access_dims(output_shape) << " = input"
+        writer << "output" << access_dims(output_shape) << " = input0"
                << access_dims(output_shape, reversed_axes, true) << ";\n";
 
         generate_loops(writer, output_shape, false);
@@ -924,29 +904,19 @@ void runtime::intelgpu::do_not_operation(cldnn::topology& topology,
     const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
     const string entry_point_name = "logic_" + output_name;
     codegen::CodeWriter writer;
+    vector<size_t> gws;
 
-    writer << "__kernel void " << entry_point_name << "(const __global char input"
-           << array_dims(input_shape) << ", __global char output" << array_dims(output_shape)
-           << ")\n";
+    runtime::intelgpu::gen_func_def(
+        writer, entry_point_name, {"char"}, {input_shape}, "char", output_shape);
 
     writer.block_begin();
     {
-        size_t var_idx = 0;
-        for (auto const& i : output_shape)
-        {
-            writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
-                   << var_idx << ")\n";
-            writer.block_begin();
-            ++var_idx;
-        }
+        gws = runtime::intelgpu::generate_loops(writer, output_shape, true);
 
-        writer << "output" << access_dims(output_shape) << " = !input" << access_dims(input_shape)
+        writer << "output" << access_dims(output_shape) << " = !input0" << access_dims(input_shape)
                << ";\n";
 
-        for (auto const& i : output_shape)
-        {
-            writer.block_end();
-        }
+        runtime::intelgpu::generate_loops(writer, output_shape, false);
     }
     writer.block_end();
@@ -957,7 +927,7 @@ void runtime::intelgpu::do_not_operation(cldnn::topology& topology,
                                            get_kernel_args(1, 1),
                                            "",
                                            layout,
-                                           {1});
+                                           gws);
 
     topology.add(op_not);
 }
@@ -973,25 +943,21 @@ void runtime::intelgpu::do_one_hot_operation(cldnn::topology& topology,
     const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
     const string entry_point_name = "one_hot_" + output_name;
     codegen::CodeWriter writer;
+    vector<size_t> gws;
 
-    writer << "__kernel void " << entry_point_name << "(const __global "
-           << input_type.c_type_string() << " input" << array_dims(input_shape) << ", __global "
-           << output_type.c_type_string() << " output" << array_dims(output_shape) << ")\n";
+    runtime::intelgpu::gen_func_def(writer,
+                                    entry_point_name,
+                                    {input_type.c_type_string()},
+                                    {input_shape},
+                                    output_type.c_type_string(),
+                                    output_shape);
 
     writer.block_begin();
     {
-        size_t var_idx = 0;
-
         writer << "for (uint i = 0; i < " << output_shape.at(one_hot_axis) << "; ++i)\n";
         writer.block_begin();
         {
-            for (auto const& i : input_shape)
-            {
-                writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
-                       << var_idx << ")\n";
-                writer.block_begin();
-                ++var_idx;
-            }
+            gws = runtime::intelgpu::generate_loops(writer, input_shape, true);
 
             size_t current_input = 0;
             string buffer;
@@ -1009,13 +975,10 @@ void runtime::intelgpu::do_one_hot_operation(cldnn::topology& topology,
                }
            }
 
-            writer << "output" << buffer << " = input" << access_dims(input_shape)
+            writer << "output" << buffer << " = input0" << access_dims(input_shape)
                    << " == i ? 1 : 0;\n";
 
-            for (auto const& i : input_shape)
-            {
-                writer.block_end();
-            }
+            runtime::intelgpu::generate_loops(writer, input_shape, false);
        }
        writer.block_end();
    }
@@ -1028,7 +991,7 @@ void runtime::intelgpu::do_one_hot_operation(cldnn::topology& topology,
                                                 get_kernel_args(1, 1),
                                                 "",
                                                 layout,
-                                                {1});
+                                                gws);
 
     topology.add(op_one_hot);
 }
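In do_one_hot_operation(), generate_loops() now produces the inner loops over the input shape, nested inside the explicit loop over the one-hot axis; each output element is 1 exactly when the input value equals its position along that axis. A C++ analogue for a 1-D input with one_hot_axis == 0 (illustration only, not the generated OpenCL):

    #include <cstddef>
    #include <vector>

    std::vector<std::vector<int>> one_hot_axis0(const std::vector<int>& input, int axis_size)
    {
        std::vector<std::vector<int>> output(axis_size, std::vector<int>(input.size(), 0));
        for (int i = 0; i < axis_size; ++i) // explicit loop over the one-hot axis
        {
            for (std::size_t i0 = 0; i0 < input.size(); ++i0) // generate_loops() equivalent
            {
                output[i][i0] = (input[i0] == i) ? 1 : 0;
            }
        }
        return output;
    }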
@@ -1047,16 +1010,15 @@ void runtime::intelgpu::do_convert_operation(cldnn::topology& topology,
     codegen::CodeWriter writer;
     vector<size_t> gws;
 
-    writer << "__kernel void " << entry_point_name << "(const __global " << input_type_name
-           << " input" << array_dims(input_shape) << ", __global " << output_type_name << " output"
-           << array_dims(output_shape) << ")\n";
+    runtime::intelgpu::gen_func_def(
+        writer, entry_point_name, {input_type_name}, {input_shape}, output_type_name, output_shape);
 
     writer.block_begin();
     {
         gws = generate_loops(writer, output_shape, true);
 
         writer << "output" << access_dims(output_shape) << " = convert_" << output_type_name
-               << "(input" << access_dims(output_shape) << ");\n";
+               << "(input0" << access_dims(output_shape) << ");\n";
 
         generate_loops(writer, output_shape, false);
     }
...
@@ -136,6 +136,12 @@ namespace ngraph
                                bool is_reversed = false);
             std::vector<size_t>
                 generate_loops(codegen::CodeWriter& writer, const Shape& shape, bool is_begin);
+            void gen_func_def(codegen::CodeWriter& writer,
+                              const std::string& entry_point_name,
+                              const std::vector<std::string>& input_types,
+                              const std::vector<Shape>& input_shapes,
+                              const std::string& output_type,
+                              const Shape& output_shape);
         }
     }
 }
@@ -21,12 +21,9 @@ backwards_exp
 backwards_floor
 backwards_maxpool_n2_c1_hw5_3x3_str2_max
 backwards_maxpool_n4_c1_hw4_2x2_max
-backwards_minimum
 backwards_replace_slice
 backwards_reverse_sequence_n3_c2_h3
 backwards_reverse_sequence_n4d2c3h2w2
-backwards_select
-backwards_select_nested
 backwards_sigmoid
 backwards_sign
 backwards_slice
...