Commit 8ab89b29 authored by shssf, committed by Robert Kimball

IntelGPU backend: Code refactored. No algo changed. (#1328)

parent 2b26df18
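In short: the per-file copies of the static cldnn_arg tables (parameters_1inp_1out, parameters_2inp_1out, ...) and of the array_dims/access_dims string builders are deleted from the batch-norm and broadcast sources and replaced by shared helpers declared in intelgpu_op_custom_kernels.hpp; in addition, generated OpenCL entry-point names now embed the output tensor name (e.g. "dot_" + output_name instead of "dot_unknown"), so each custom_gpu_primitive gets a unique kernel name.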
src/ngraph/runtime/intelgpu/intelgpu_op_batchnorm.cpp

@@ -23,50 +23,13 @@
 #include "ngraph/runtime/intelgpu/code_writer.hpp"
 #include "ngraph/runtime/intelgpu/intelgpu_layout.hpp"
 #include "ngraph/runtime/intelgpu/intelgpu_op_batchnorm.hpp"
+#include "ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp"

 #include "ngraph/op/batch_norm.hpp"

 using namespace std;
 using namespace ngraph;

-static vector<cldnn_arg> parameters_1inp_1out = {{arg_input, 0}, {arg_output, 0}};
-static vector<cldnn_arg> parameters_2inp_1out = {{arg_input, 0}, {arg_input, 1}, {arg_output, 0}};
-static vector<cldnn_arg> parameters_5inp_1out = {{arg_input, 0},
-                                                 {arg_input, 1},
-                                                 {arg_input, 2},
-                                                 {arg_input, 3},
-                                                 {arg_input, 4},
-                                                 {arg_output, 0}};
-
-static string array_dims(const Shape& dimentions)
-{
-    string buffer;
-
-    for (auto const& dim : dimentions)
-    {
-        buffer += "[" + to_string(dim) + "]";
-    }
-
-    return buffer;
-}
-
-static string access_dims(const Shape& dimentions, const AxisSet& axis = {})
-{
-    size_t var_idx = 0;
-    string buffer;
-
-    for (auto const& i : dimentions)
-    {
-        if (axis.find(var_idx) == axis.end())
-        {
-            buffer += "[i" + to_string(var_idx) + "]";
-        }
-        ++var_idx;
-    }
-
-    return buffer;
-}
-
 void runtime::intelgpu::do_create_mean(cldnn::topology& topology,
                                        const string& output_name,
                                        const Shape& output_shape,
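For reference, the two string builders deleted above (now shared through intelgpu_op_custom_kernels.hpp) generate the C-style array suffixes pasted into the emitted OpenCL. A minimal standalone sketch, with plain std types standing in for ngraph::Shape and ngraph::AxisSet:

```cpp
#include <iostream>
#include <set>
#include <string>
#include <vector>

// Stand-ins for ngraph::Shape and ngraph::AxisSet, just for this sketch.
using Shape = std::vector<size_t>;
using AxisSet = std::set<size_t>;

// Emits one "[dim]" per axis: the declared extent of a C array.
static std::string array_dims(const Shape& dimentions)
{
    std::string buffer;
    for (auto const& dim : dimentions)
    {
        buffer += "[" + std::to_string(dim) + "]";
    }
    return buffer;
}

// Emits one "[iN]" index per axis, skipping any axes listed in `axis`.
static std::string access_dims(const Shape& dimentions, const AxisSet& axis = {})
{
    size_t var_idx = 0;
    std::string buffer;
    for (auto const& i : dimentions)
    {
        (void)i; // only the position matters, as in the original
        if (axis.find(var_idx) == axis.end())
        {
            buffer += "[i" + std::to_string(var_idx) + "]";
        }
        ++var_idx;
    }
    return buffer;
}

int main()
{
    const Shape shape{2, 3, 4};
    std::cout << array_dims(shape) << '\n';       // prints [2][3][4]
    std::cout << access_dims(shape) << '\n';      // prints [i0][i1][i2]
    std::cout << access_dims(shape, {1}) << '\n'; // prints [i0][i2]
}
```

The axis parameter is what lets reduction-style kernels drop the reduced dimensions from the generated index expression.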
@@ -138,7 +101,7 @@ void runtime::intelgpu::do_create_mean(cldnn::topology& topology,
                                          {input_name},
                                          {writer.get_code()},
                                          entry_point_name,
-                                         parameters_1inp_1out,
+                                         get_kernel_args(1, 1),
                                          "",
                                          layout,
                                          {1});
@@ -221,7 +184,7 @@ void runtime::intelgpu::do_create_variance(cldnn::topology& topology,
                                          {input_name, mean_name},
                                          {writer.get_code()},
                                          entry_point_name,
-                                         parameters_2inp_1out,
+                                         get_kernel_args(2, 1),
                                          "",
                                          layout,
                                          {1});
@@ -313,7 +276,7 @@ void runtime::intelgpu::do_batch_norm_operation(cldnn::topology& topology,
                                          inputs,
                                          {writer.get_code()},
                                          entry_point_name,
-                                         parameters_5inp_1out,
+                                         get_kernel_args(5, 1),
                                          "",
                                          layout,
                                          {1});
src/ngraph/runtime/intelgpu/intelgpu_op_broadcast.cpp

@@ -21,43 +21,13 @@
 #include "ngraph/runtime/intelgpu/code_writer.hpp"
 #include "ngraph/runtime/intelgpu/intelgpu_layout.hpp"
 #include "ngraph/runtime/intelgpu/intelgpu_op_broadcast.hpp"
+#include "ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp"
 #include "ngraph/util.hpp"

 using namespace std;
 using namespace ngraph;

-static vector<cldnn_arg> parameters_1inp_1out = {{arg_input, 0}, {arg_output, 0}};
-
-static string array_dims(const Shape& dimentions)
-{
-    string buffer;
-
-    for (auto const& dim : dimentions)
-    {
-        buffer += "[" + to_string(dim) + "]";
-    }
-
-    return buffer;
-}
-
-static string access_dims(const Shape& dimentions, const AxisSet& axis = {})
-{
-    size_t var_idx = 0;
-    string buffer;
-
-    for (auto const& i : dimentions)
-    {
-        if (axis.find(var_idx) == axis.end())
-        {
-            buffer += "[i" + to_string(var_idx) + "]";
-        }
-        ++var_idx;
-    }
-
-    return buffer;
-}
-
 void runtime::intelgpu::do_bcast_sum_operation_scalar(cldnn::topology& topology,
                                                       const string& input_name,
                                                       const Shape& input_shape,
@@ -66,7 +36,8 @@ void runtime::intelgpu::do_bcast_sum_operation_scalar(cldnn::topology& topology,
                                                       const element::Type& output_type,
                                                       bool is_bcast)
 {
-    const string function_name = is_bcast ? "broadcast_scalar" : "sum_scalar";
+    string function_name = is_bcast ? "broadcast_scalar" : "sum_scalar";
+    function_name += output_name;
     const size_t input_count =
         is_bcast ? shape_size<Shape>(output_shape) : shape_size<Shape>(input_shape);
     codegen::CodeWriter writer;
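The two added lines above are worth a second look: previously every scalar broadcast/sum primitive shared one entry-point name, so appending the (unique) output tensor name presumably avoids kernel-name clashes when several such primitives land in one topology. A small runnable sketch of the resulting scheme, with hypothetical tensor names:

```cpp
#include <iostream>
#include <string>

// Mirrors the naming change in this hunk: the entry point now embeds
// the output tensor name, making it unique per primitive.
std::string make_entry_point(bool is_bcast, const std::string& output_name)
{
    std::string function_name = is_bcast ? "broadcast_scalar" : "sum_scalar";
    function_name += output_name; // the line added by this commit
    return function_name;
}

int main()
{
    // "Broadcast_5" and "Sum_7" are hypothetical tensor names.
    std::cout << make_entry_point(true, "Broadcast_5") << '\n'; // broadcast_scalarBroadcast_5
    std::cout << make_entry_point(false, "Sum_7") << '\n';      // sum_scalarSum_7
}
```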
@@ -98,7 +69,7 @@ void runtime::intelgpu::do_bcast_sum_operation_scalar(cldnn::topology& topology,
                                          {input_name},
                                          {writer.get_code()},
                                          function_name,
-                                         parameters_1inp_1out,
+                                         get_kernel_args(1, 1),
                                          string("-DCOUNT=" + to_string(input_count)),
                                          layout,
                                          {1});
@@ -170,7 +141,7 @@ void runtime::intelgpu::do_bcast_sum_operation(cldnn::topology& topology,
                                          {input_name},
                                          {writer.get_code()},
                                          function_name,
-                                         parameters_1inp_1out,
+                                         get_kernel_args(1, 1),
                                          "",
                                          layout,
                                          {1});
src/ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.cpp

@@ -27,12 +27,24 @@
 using namespace std;
 using namespace ngraph;

-static vector<cldnn_arg> parameters_1inp_1out = {{arg_input, 0}, {arg_output, 0}};
-static vector<cldnn_arg> parameters_2inp_1out = {{arg_input, 0}, {arg_input, 1}, {arg_output, 0}};
-static vector<cldnn_arg> parameters_3inp_1out = {
-    {arg_input, 0}, {arg_input, 1}, {arg_input, 2}, {arg_output, 0}};
+vector<cldnn_arg> runtime::intelgpu::get_kernel_args(size_t input, size_t output)
+{
+    vector<cldnn_arg> result;
+
+    for (cldnn_arg_index i = 0; i < input; ++i)
+    {
+        result.push_back({arg_input, i});
+    }
+
+    for (cldnn_arg_index i = 0; i < output; ++i)
+    {
+        result.push_back({arg_output, i});
+    }
+
+    return result;
+}

-static string array_dims(const Shape& dimentions)
+string runtime::intelgpu::array_dims(const Shape& dimentions)
 {
     string buffer;
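A quick sanity check that the new helper reproduces the literal tables it replaces. The cldnn_arg/cldnn_arg_index definitions below are stand-ins mimicking the clDNN C API so the sketch compiles on its own, not the real headers:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Stand-ins for the clDNN C API types used by get_kernel_args.
enum cldnn_arg_type { arg_input, arg_output };
using cldnn_arg_index = unsigned;
struct cldnn_arg
{
    cldnn_arg_type arg_type;
    cldnn_arg_index index;
};

// Same body as the helper introduced by this commit.
std::vector<cldnn_arg> get_kernel_args(size_t input, size_t output)
{
    std::vector<cldnn_arg> result;
    for (cldnn_arg_index i = 0; i < input; ++i)
    {
        result.push_back({arg_input, i});
    }
    for (cldnn_arg_index i = 0; i < output; ++i)
    {
        result.push_back({arg_output, i});
    }
    return result;
}

int main()
{
    // Equivalent to the removed parameters_2inp_1out table:
    // {{arg_input, 0}, {arg_input, 1}, {arg_output, 0}}
    const auto args = get_kernel_args(2, 1);
    assert(args.size() == 3);
    assert(args[0].arg_type == arg_input && args[0].index == 0);
    assert(args[1].arg_type == arg_input && args[1].index == 1);
    assert(args[2].arg_type == arg_output && args[2].index == 0);
}
```

The loop form also covers argument counts (such as the 5-input batch-norm case) that previously each needed a hand-written table.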
@@ -44,7 +56,7 @@ static string array_dims(const Shape& dimentions)
     return buffer;
 }

-static string access_dims(const Shape& dimentions, const AxisSet& axis = {})
+string runtime::intelgpu::access_dims(const Shape& dimentions, const AxisSet& axis)
 {
     size_t var_idx = 0;
     string buffer;
@@ -85,9 +97,10 @@ static string access_dims_strided(const Shape& dimentions,
 static void do_dot_operation_error(const Shape& shapeA, const Shape& shapeB, const Shape& shapeZ)
 {
-    throw invalid_argument("IntelGPU Dot operation. Conbination ShapeA" + array_dims(shapeA) +
-                           ", ShapeB" + array_dims(shapeB) + ", ShapeOutput" + array_dims(shapeZ) +
-                           " is not supported.");
+    throw invalid_argument("IntelGPU Dot operation. Conbination ShapeA" +
+                           runtime::intelgpu::array_dims(shapeA) + ", ShapeB" +
+                           runtime::intelgpu::array_dims(shapeB) + ", ShapeOutput" +
+                           runtime::intelgpu::array_dims(shapeZ) + " is not supported.");
 }

 void runtime::intelgpu::do_pad_operation(cldnn::topology& topology,
@@ -100,7 +113,7 @@ void runtime::intelgpu::do_pad_operation(cldnn::topology& topology,
                                          const Shape& pad_below,
                                          const Shape& pad_interior)
 {
-    const string entry_point_name = "op_pad_kernel";
+    const string entry_point_name = "op_pad_kernel_" + output_name;
     codegen::CodeWriter writer;

     // The kernel name and parameters
@@ -156,7 +169,7 @@ void runtime::intelgpu::do_pad_operation(cldnn::topology& topology,
                                          {input_name, scalar_name},
                                          {writer.get_code()},
                                          entry_point_name,
-                                         parameters_2inp_1out,
+                                         get_kernel_args(2, 1),
                                          "",
                                          layout,
                                          {1});
@@ -171,7 +184,7 @@ static void do_1d_scalar_mul(codegen::CodeWriter& writer,
     const size_t countA = shapeA.empty() ? 0 : shape_size<Shape>(shapeA);
     const size_t countB = shapeB.empty() ? 0 : shape_size<Shape>(shapeB);
     const size_t countZ = max(countA, countB);
-    kernel_name = "do_1d_scalar_mul";
+    kernel_name += "_do_1d_scalar_mul";

     writer << "__kernel void " << kernel_name << "(const __global float* inputA"
            << ", const __global float* inputB, __global float* output)\n";
@@ -196,11 +209,12 @@ static void do_2d_2d_mul(codegen::CodeWriter& writer,
     const size_t rows = shapeA.at(0);
     const size_t colrow = shapeA.at(1);
     const size_t cols = shapeB.back();
-    kernel_name = "do_2d_2d_mul";
+    kernel_name += "_do_2d_2d_mul";

     writer << "__kernel void " << kernel_name << "(const __global float inputA"
-           << array_dims(shapeA) << ", const __global float inputB" << array_dims(shapeB)
-           << ", __global float output" << array_dims({rows, cols}) << ")\n";
+           << runtime::intelgpu::array_dims(shapeA) << ", const __global float inputB"
+           << runtime::intelgpu::array_dims(shapeB) << ", __global float output"
+           << runtime::intelgpu::array_dims({rows, cols}) << ")\n";
     writer.block_begin();
     {
         size_t var_idx = 0;
@@ -239,11 +253,12 @@ static void do_3d_3d_mul(codegen::CodeWriter& writer,
                          const Shape& shapeZ)
 {
     const size_t colrow = shapeA.back();
-    kernel_name = "do_3d_3d_mul";
+    kernel_name += "_do_3d_3d_mul";

     writer << "__kernel void " << kernel_name << "(const __global float inputA"
-           << array_dims(shapeA) << ", const __global float inputB" << array_dims(shapeB)
-           << ", __global float output" << array_dims(shapeZ) << ")\n";
+           << runtime::intelgpu::array_dims(shapeA) << ", const __global float inputB"
+           << runtime::intelgpu::array_dims(shapeB) << ", __global float output"
+           << runtime::intelgpu::array_dims(shapeZ) << ")\n";
     writer.block_begin();
     {
         size_t var_idx = 0;
@@ -282,11 +297,12 @@ static void do_3d_2d_mul(codegen::CodeWriter& writer,
                          const Shape& shapeZ)
 {
     const size_t colrow = shapeA.back();
-    kernel_name = "do_3d_2d_mul";
+    kernel_name += "_do_3d_2d_mul";

     writer << "__kernel void " << kernel_name << "(const __global float inputA"
-           << array_dims(shapeA) << ", const __global float inputB" << array_dims(shapeB)
-           << ", __global float output" << array_dims(shapeZ) << ")\n";
+           << runtime::intelgpu::array_dims(shapeA) << ", const __global float inputB"
+           << runtime::intelgpu::array_dims(shapeB) << ", __global float output"
+           << runtime::intelgpu::array_dims(shapeZ) << ")\n";
     writer.block_begin();
     {
         size_t var_idx = 0;
@@ -325,11 +341,12 @@ static void do_2d_1d_mul(codegen::CodeWriter& writer,
 {
     const size_t rows = shapeA.at(0);
     const size_t colrow = shapeA.at(1);
-    kernel_name = "do_2d_1d_mul";
+    kernel_name += "_do_2d_1d_mul";

     writer << "__kernel void " << kernel_name << "(const __global float inputA"
-           << array_dims(shapeA) << ", const __global float inputB" << array_dims(shapeB)
-           << ", __global float output" << array_dims({rows}) << ")\n";
+           << runtime::intelgpu::array_dims(shapeA) << ", const __global float inputB"
+           << runtime::intelgpu::array_dims(shapeB) << ", __global float output"
+           << runtime::intelgpu::array_dims({rows}) << ")\n";
     writer.block_begin();
     {
         writer << "for (uint i0 = 0; i0 < " << rows << "; ++i0)\n";
@@ -351,7 +368,7 @@ static void do_2d_1d_mul(codegen::CodeWriter& writer,
 static void do_scalar_scalar_mul(codegen::CodeWriter& writer, string& kernel_name)
 {
-    kernel_name = "scalar_scalar_mul";
+    kernel_name += "_scalar_scalar_mul";

     writer << "__kernel void " << kernel_name << "(const __global float inputA[1]"
            << ", const __global float inputB[1], __global float output[1])\n";
@@ -366,15 +383,16 @@ static void do_1d_1d_mul(codegen::CodeWriter& writer, string& kernel_name, const
 {
     if (shape.size() > 1)
     {
-        throw invalid_argument("do_1d_1d_mul: Shape" + array_dims(shape) + " must be 1D");
+        throw invalid_argument("do_1d_1d_mul: Shape" + runtime::intelgpu::array_dims(shape) +
+                               " must be 1D");
     }

     const size_t& size = shape.front();
-    kernel_name = "do_1d_1d_mul";
+    kernel_name += "_do_1d_1d_mul";

-    writer << "__kernel void " << kernel_name << "(const __global float inputA" << array_dims(shape)
-           << ", const __global float inputB" << array_dims(shape)
-           << ", __global float output[1])\n";
+    writer << "__kernel void " << kernel_name << "(const __global float inputA"
+           << runtime::intelgpu::array_dims(shape) << ", const __global float inputB"
+           << runtime::intelgpu::array_dims(shape) << ", __global float output[1])\n";
     writer.block_begin();
     {
         writer << "float sum = 0.0f;\n"
@@ -399,7 +417,7 @@ void runtime::intelgpu::do_dot_operation(cldnn::topology& topology,
                                          const element::Type& output_type)
 {
     const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
-    string entry_point_name = "dot_unknown";
+    string entry_point_name = "dot_" + output_name;
     codegen::CodeWriter writer;

     const bool A_is_scalar = inputA_shape.empty();
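Note how the two renamings in this file compose: do_dot_operation now seeds entry_point_name with "dot_" + output_name, and each shape-specific emitter appends its own suffix with += instead of overwriting the name. A hypothetical illustration of the composition, outside the real CodeWriter plumbing:

```cpp
#include <iostream>
#include <string>

// Mirrors the kernel_name += "_do_2d_2d_mul" pattern from this commit;
// the real emitter also generates the OpenCL body via codegen::CodeWriter.
static void do_2d_2d_mul(std::string& kernel_name)
{
    kernel_name += "_do_2d_2d_mul";
}

int main()
{
    const std::string output_name = "Dot_3"; // hypothetical tensor name
    std::string entry_point_name = "dot_" + output_name;
    do_2d_2d_mul(entry_point_name);
    std::cout << entry_point_name << '\n'; // prints dot_Dot_3_do_2d_2d_mul
}
```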
@@ -451,7 +469,7 @@ void runtime::intelgpu::do_dot_operation(cldnn::topology& topology,
                                          {inputA_name, inputB_name},
                                          {writer.get_code()},
                                          entry_point_name,
-                                         parameters_2inp_1out,
+                                         get_kernel_args(2, 1),
                                          "",
                                          layout,
                                          {1});
@@ -469,7 +487,7 @@ void runtime::intelgpu::do_slice_operation(cldnn::topology& topology,
                                            const Strides& strides)
 {
     const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
-    string entry_point_name = "slice_unknown";
+    const string entry_point_name = "slice_" + output_name;
     codegen::CodeWriter writer;

     writer << "__kernel void " << entry_point_name << "(const __global float input"
@@ -502,7 +520,7 @@ void runtime::intelgpu::do_slice_operation(cldnn::topology& topology,
                                          {input_name},
                                          {writer.get_code()},
                                          entry_point_name,
-                                         parameters_1inp_1out,
+                                         get_kernel_args(1, 1),
                                          "",
                                          layout,
                                          {1});
@@ -521,7 +539,7 @@ void runtime::intelgpu::do_select_operation(cldnn::topology& topology,
                                             const element::Type& output_type)
 {
     const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
-    string entry_point_name = "select" + output_name;
+    const string entry_point_name = "select_" + output_name;
     codegen::CodeWriter writer;

     writer << "__kernel void " << entry_point_name << "(const __global char input0"
@@ -569,7 +587,7 @@ void runtime::intelgpu::do_select_operation(cldnn::topology& topology,
                                          {input0_name, input1_name, input2_name},
                                          {writer.get_code()},
                                          entry_point_name,
-                                         parameters_3inp_1out,
+                                         get_kernel_args(3, 1),
                                          "",
                                          layout,
                                          {1});
src/ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp

@@ -69,6 +69,11 @@ namespace ngraph
                                    const std::string& output_name,
                                    const Shape& output_shape,
                                    const element::Type& output_type);
+
+            // Helper functions used in cldnn::custom_gpu_primitive kernels
+            std::vector<cldnn_arg> get_kernel_args(size_t input, size_t output);
+            std::string array_dims(const Shape& dimentions);
+            std::string access_dims(const Shape& dimentions, const AxisSet& axis = {});
         }
     }
 }