Commit 5927bbe4 authored by shssf, committed by Robert Kimball

IntelGPU backend: Dot operation (partially implemented) (#1275)

* IntelGPU backend: Dot operation (partially implemented)

* PR1275. Debug output deleted.

* PR1275. Comments addressed
parent c007740b
@@ -219,6 +219,28 @@ bool runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
const cldnn::data op_const(output_name, mem);
topology.add(op_const);
}
else if ("Dot" == op->description())
{
arguments_check(op, 2, 1);
const string& inputA_name = op->get_inputs().at(0).get_tensor().get_name();
const Shape& inputA_shape = op->get_inputs().at(0).get_shape();
const string& inputB_name = op->get_inputs().at(1).get_tensor().get_name();
const Shape& inputB_shape = op->get_inputs().at(1).get_shape();
const string& output_name = op->get_outputs().begin()->get_tensor().get_name();
const Shape& output_shape = op->get_outputs().begin()->get_shape();
const element::Type& output_type =
op->get_outputs().begin()->get_tensor().get_element_type();
do_dot_operation(topology,
inputA_name,
inputA_shape,
inputB_name,
inputB_shape,
output_name,
output_shape,
output_type);
}
else if ("MaxPool" == op->description())
{
arguments_check(op, 1, 1);
@@ -349,15 +371,15 @@ bool runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
const Shape& pad_below = pad->get_padding_below();
const Shape& pad_interior = pad->get_padding_interior();
do_pad_kernel(topology,
input_name,
input_shape,
scalar_name,
output_name,
output_shape,
output_type,
pad_below,
pad_interior);
do_pad_operation(topology,
input_name,
input_shape,
scalar_name,
output_name,
output_shape,
output_type,
pad_below,
pad_interior);
}
else if ("BatchNorm" == op->description())
{
......
@@ -14,6 +14,7 @@
* limitations under the License.
*******************************************************************************/
#include <CPP/concatenation.hpp>
#include <CPP/custom_gpu_primitive.hpp>
#include <CPP/reshape.hpp>
@@ -45,12 +46,13 @@ static string access_dims(const Shape& dimentions, const AxisSet& axis = {})
size_t var_idx = 0;
string buffer;
for (auto i = dimentions.cbegin(); i != dimentions.cend(); ++i, ++var_idx)
for (auto const& i : dimentions)
{
if (axis.find(var_idx) == axis.end())
{
buffer += "[i" + to_string(var_idx) + "]";
}
++var_idx;
}
return buffer;
@@ -62,26 +64,33 @@ static string
string buffer;
size_t var_idx = 0;
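// Each input index i along an axis maps to padded-output index
// i * (pad_interior + 1) + pad_below for that axis, as emitted below.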
for (auto i = dimentions.cbegin(); i != dimentions.cend(); ++i, ++var_idx)
for (auto const& i : dimentions)
{
buffer += "[i" + to_string(var_idx) + " * (" + to_string(pad_interior.at(var_idx)) +
" + 1) + " + to_string(pad_below.at(var_idx)) + "]";
++var_idx;
}
return buffer;
}
void runtime::intelgpu::do_pad_kernel(cldnn::topology& topology,
const string& input_name,
const Shape& input_shape,
const string& scalar_name,
const string& output_name,
const Shape& output_shape,
const element::Type& output_type,
const Shape& pad_below,
const Shape& pad_interior)
static void do_dot_operation_error(const Shape& shapeA, const Shape& shapeB, const Shape& shapeZ)
{
throw invalid_argument("IntelGPU Dot operation. Combination ShapeA" + array_dims(shapeA) +
", ShapeB" + array_dims(shapeB) + ", ShapeOutput" + array_dims(shapeZ) +
" is not supported.");
}
void runtime::intelgpu::do_pad_operation(cldnn::topology& topology,
const string& input_name,
const Shape& input_shape,
const string& scalar_name,
const string& output_name,
const Shape& output_shape,
const element::Type& output_type,
const Shape& pad_below,
const Shape& pad_interior)
{
const size_t input_count = shape_size<Shape>(output_shape);
const string entry_point_name = "op_pad_kernel";
codegen::CodeWriter writer;
@@ -94,11 +103,12 @@ void runtime::intelgpu::do_pad_kernel(cldnn::topology& topology,
{
// Loop to broadcast the scalar over the full output tensor
size_t var_idx = 0;
for (auto i = output_shape.cbegin(); i != output_shape.cend(); ++i, ++var_idx)
for (auto const& i : output_shape)
{
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << *i << "; ++i"
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
<< var_idx << ")\n";
writer.block_begin();
++var_idx;
}
writer << "output" << access_dims(output_shape) << " = scalar[0];\n";
@@ -112,11 +122,12 @@ void runtime::intelgpu::do_pad_kernel(cldnn::topology& topology,
// Loop to copy the input matrix into the output matrix with padding.
// Padding includes "pad_below" and "pad_interior" according to the nGraph documentation.
var_idx = 0;
for (auto i = input_shape.cbegin(); i != input_shape.cend(); ++i, ++var_idx)
for (auto const& i : input_shape)
{
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << *i << "; ++i"
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
<< var_idx << ")\n";
writer.block_begin();
++var_idx;
}
writer << "output" << access_dims_strided(input_shape, pad_below, pad_interior)
@@ -138,7 +149,300 @@ void runtime::intelgpu::do_pad_kernel(cldnn::topology& topology,
entry_point_name,
parameters_2inp_1out,
"",
layout,
{1});
layout);
topology.add(op_scalar);
}
static void do_1d_scalar_mul(codegen::CodeWriter& writer,
string& kernel_name,
const Shape& shapeA,
const Shape& shapeB)
{
const size_t countA = shapeA.empty() ? 0 : shape_size<Shape>(shapeA);
const size_t countB = shapeB.empty() ? 0 : shape_size<Shape>(shapeB);
const size_t countZ = max(countA, countB);
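// A count of 0 marks the scalar argument: it is indexed with a constant [0] in the
// generated kernel, while the tensor argument is indexed with the loop variable i1.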
kernel_name = "do_1d_scalar_mul";
writer << "__kernel void " << kernel_name << "(const __global float* inputA"
<< ", const __global float* inputB, __global float* output)\n";
writer.block_begin();
{
writer << "for (uint i1 = 0; i1 < " << countZ << "; ++i1)\n";
writer.block_begin();
{
writer << "output[i1] = inputA[" << (countA > 0 ? "i1" : "0") << "] * inputB["
<< (countB > 0 ? "i1" : "0") << "];\n";
}
writer.block_end();
}
writer.block_end();
}
static void do_2d_2d_mul(codegen::CodeWriter& writer,
string& kernel_name,
const Shape& shapeA,
const Shape& shapeB)
{
const size_t rows = shapeA.at(0);
const size_t colrow = shapeA.at(1);
const size_t cols = shapeB.back();
kernel_name = "do_2d_2d_mul";
writer << "__kernel void " << kernel_name << "(const __global float inputA"
<< array_dims(shapeA) << ", const __global float inputB" << array_dims(shapeB)
<< ", __global float output" << array_dims({rows, cols}) << ")\n";
writer.block_begin();
{
size_t var_idx = 0;
// Main loops over the output dimensions {rows, cols}
for (auto const& i : Shape{rows, cols})
{
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
<< var_idx << ")\n";
writer.block_begin();
++var_idx;
}
// Inner loop
writer << "float sum = 0.0f;\n";
writer << "for (uint i2 = 0; i2 < " << colrow << "; ++i2)\n";
writer.block_begin();
{
writer << "sum += inputA[i0][i2] * inputB[i2][i1];\n";
}
writer.block_end();
writer << "output[i0][i1] = sum;\n";
// Closing brackets for main loops
for (auto const& i : Shape{rows, cols})
{
writer.block_end();
}
}
writer.block_end();
}
static void do_3d_3d_mul(codegen::CodeWriter& writer,
string& kernel_name,
const Shape& shapeA,
const Shape& shapeB,
const Shape& shapeZ)
{
const size_t colrow = shapeA.back();
kernel_name = "do_3d_3d_mul";
writer << "__kernel void " << kernel_name << "(const __global float inputA"
<< array_dims(shapeA) << ", const __global float inputB" << array_dims(shapeB)
<< ", __global float output" << array_dims(shapeZ) << ")\n";
writer.block_begin();
{
size_t var_idx = 0;
// Main loops
for (auto const& i : shapeZ)
{
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
<< var_idx << ")\n";
writer.block_begin();
++var_idx;
}
// Inner loop
writer << "float sum = 0.0f;\n";
writer << "for (uint i4 = 0; i4 < " << colrow << "; ++i4)\n";
writer.block_begin();
{
writer << "sum += inputA[i0][i1][i4] * inputB[i4][i2][i3];\n";
}
writer.block_end();
writer << "output[i0][i1][i2][i3] = sum;\n";
// Closing brackets for main loops
for (auto const& i : shapeZ)
{
writer.block_end();
}
}
writer.block_end();
}
static void do_3d_2d_mul(codegen::CodeWriter& writer,
string& kernel_name,
const Shape& shapeA,
const Shape& shapeB,
const Shape& shapeZ)
{
const size_t colrow = shapeA.back();
kernel_name = "do_3d_2d_mul";
writer << "__kernel void " << kernel_name << "(const __global float inputA"
<< array_dims(shapeA) << ", const __global float inputB" << array_dims(shapeB)
<< ", __global float output" << array_dims(shapeZ) << ")\n";
writer.block_begin();
{
size_t var_idx = 0;
// Main loops
for (auto const& i : shapeZ)
{
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
<< var_idx << ")\n";
writer.block_begin();
++var_idx;
}
// Inner loop
writer << "float sum = 0.0f;\n";
writer << "for (uint i3 = 0; i3 < " << colrow << "; ++i3)\n";
writer.block_begin();
{
writer << "sum += inputA[i0][i1][i3] * inputB[i3][i2];\n";
}
writer.block_end();
writer << "output[i0][i1][i2] = sum;\n";
// Closing brackets for main loops
for (auto const& i : shapeZ)
{
writer.block_end();
}
}
writer.block_end();
}
static void do_2d_1d_mul(codegen::CodeWriter& writer,
string& kernel_name,
const Shape& shapeA,
const Shape& shapeB)
{
const size_t rows = shapeA.at(0);
const size_t colrow = shapeA.at(1);
kernel_name = "do_2d_1d_mul";
writer << "__kernel void " << kernel_name << "(const __global float inputA"
<< array_dims(shapeA) << ", const __global float inputB" << array_dims(shapeB)
<< ", __global float output" << array_dims({rows}) << ")\n";
writer.block_begin();
{
writer << "for (uint i0 = 0; i0 < " << rows << "; ++i0)\n";
writer.block_begin();
{
writer << "float sum = 0.0f;\n";
writer << "for (uint i1 = 0; i1 < " << colrow << "; ++i1)\n";
writer.block_begin();
{
writer << "sum += inputA[i0][i1] * inputB[i1];\n";
}
writer.block_end();
writer << "output[i0] = sum;\n";
}
writer.block_end();
}
writer.block_end();
}
static void do_scalar_scalar_mul(codegen::CodeWriter& writer, string& kernel_name)
{
kernel_name = "scalar_scalar_mul";
writer << "__kernel void " << kernel_name << "(const __global float inputA[1]"
<< ", const __global float inputB[1], __global float output[1])\n";
writer.block_begin();
{
writer << "output[0] = inputA[0] * inputB[0];\n";
}
writer.block_end();
}
static void do_1d_1d_mul(codegen::CodeWriter& writer, string& kernel_name, const Shape& shape)
{
if (shape.size() > 1)
{
throw invalid_argument("do_1d_1d_mul: Shape" + array_dims(shape) + " must be 1D");
}
const size_t& size = shape.front();
kernel_name = "do_1d_1d_mul";
writer << "__kernel void " << kernel_name << "(const __global float inputA" << array_dims(shape)
<< ", const __global float inputB" << array_dims(shape)
<< ", __global float output[1])\n";
writer.block_begin();
{
writer << "float sum = 0.0f;\n"
<< "for (uint i = 0; i < " << size << "; ++i)\n";
writer.block_begin();
{
writer << "sum += inputA[i] * inputB[i];\n";
}
writer.block_end();
writer << "output[0] = sum;\n";
}
writer.block_end();
}
void runtime::intelgpu::do_dot_operation(cldnn::topology& topology,
const string& inputA_name,
const Shape& inputA_shape,
const string& inputB_name,
const Shape& inputB_shape,
const string& output_name,
const Shape& output_shape,
const element::Type& output_type)
{
const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
string entry_point_name = "dot_unknown";
codegen::CodeWriter writer;
const bool A_is_scalar = inputA_shape.empty();
const bool B_is_scalar = inputB_shape.empty();
const bool Z_is_scalar = output_shape.empty();
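// Dispatch on the argument ranks. Only these combinations are currently handled:
// scalar*scalar, scalar*tensor, vector.vector, matrix.vector, matrix.matrix,
// 3D.2D and 3D.3D; every other combination goes to do_dot_operation_error().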
if (A_is_scalar && B_is_scalar && Z_is_scalar)
{
do_scalar_scalar_mul(writer, entry_point_name);
}
else if (((A_is_scalar && !B_is_scalar) || (!A_is_scalar && B_is_scalar)) && !Z_is_scalar)
{
do_1d_scalar_mul(writer, entry_point_name, inputA_shape, inputB_shape);
}
else if (!A_is_scalar && !B_is_scalar && Z_is_scalar)
{
do_1d_1d_mul(writer, entry_point_name, inputB_shape);
}
else if (!A_is_scalar && !B_is_scalar && !Z_is_scalar)
{
if (inputA_shape.size() == 2 && inputB_shape.size() == 1)
{
do_2d_1d_mul(writer, entry_point_name, inputA_shape, inputB_shape);
}
else if (inputA_shape.size() == 2 && inputB_shape.size() == 2)
{
do_2d_2d_mul(writer, entry_point_name, inputA_shape, inputB_shape);
}
else if (inputA_shape.size() == 3 && inputB_shape.size() == 3)
{
do_3d_3d_mul(writer, entry_point_name, inputA_shape, inputB_shape, output_shape);
}
else if (inputA_shape.size() == 3 && inputB_shape.size() == 2)
{
do_3d_2d_mul(writer, entry_point_name, inputA_shape, inputB_shape, output_shape);
}
else
{
do_dot_operation_error(inputA_shape, inputB_shape, output_shape);
}
}
else
{
do_dot_operation_error(inputA_shape, inputB_shape, output_shape);
}
//cout << writer.get_code() << endl;
const cldnn::custom_gpu_primitive op_dot(output_name,
{inputA_name, inputB_name},
{writer.get_code()},
entry_point_name,
parameters_2inp_1out,
"",
layout);
topology.add(op_dot);
}
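For illustration, the OpenCL-style kernel text that do_2d_1d_mul emits for a hypothetical matrix-vector case with inputA_shape {3, 4} and inputB_shape {4} (example shapes, not taken from this commit) looks roughly like the following, with whitespace normalized:

__kernel void do_2d_1d_mul(const __global float inputA[3][4],
                           const __global float inputB[4],
                           __global float output[3])
{
    // Plain serial loops: one row of inputA times inputB per output element
    for (uint i0 = 0; i0 < 3; ++i0)
    {
        float sum = 0.0f;
        for (uint i1 = 0; i1 < 4; ++i1)
        {
            sum += inputA[i0][i1] * inputB[i1];
        }
        output[i0] = sum;
    }
}

The generated source and entry point name are handed to cldnn::custom_gpu_primitive, so clDNN compiles and runs the kernel as an ordinary primitive in the topology.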
@@ -28,15 +28,24 @@ namespace ngraph
{
namespace intelgpu
{
void do_pad_kernel(cldnn::topology& topology,
const std::string& input_name,
const Shape& input_shape,
const std::string& scalar_name,
const std::string& output_name,
const Shape& output_shape,
const element::Type& output_type,
const Shape& pad_below,
const Shape& pad_interior);
void do_pad_operation(cldnn::topology& topology,
const std::string& input_name,
const Shape& input_shape,
const std::string& scalar_name,
const std::string& output_name,
const Shape& output_shape,
const element::Type& output_type,
const Shape& pad_below,
const Shape& pad_interior);
void do_dot_operation(cldnn::topology& topology,
const std::string& inputA_name,
const Shape& inputA_shape,
const std::string& inputB_name,
const Shape& inputB_shape,
const std::string& output_name,
const Shape& output_shape,
const element::Type& output_type);
}
}
}
@@ -34,13 +34,11 @@ backwards_concat_axis_1
backwards_concat_vector
backwards_cos
backwards_cosh
backwards_dot_scalar_scalar
backwards_dot_scalar_tensor
backwards_dot_tensor2_tensor2
backwards_dot_tensor3_tensor3
backwards_dot_tensor_scalar
backwards_dot_tensor_vector
backwards_dot_vector_vector
backwards_exp
backwards_floor
backwards_log
@@ -139,21 +137,12 @@ cos
cosh
divide_by_zero_int32
dot_0_0
dot1d
dot2d
dot_2x0_0
dot3d_2d
dot3d_3d
dot_matrix_0x2_2x0
dot_matrix_2x0_0x2
dot_matrix_3x2_2x0
dot_matrix_vector
dot_matrix_vector_4_3
dot_matrix_vector_int64
dot_scalar_0x2
dot_scalar_scalar
dot_scalar_tensor_arg0
dot_scalar_tensor_arg1
equal
exp
floor
@@ -216,14 +205,9 @@ one_hot_vector_1_barely_oob
one_hot_vector_1_far_oob
one_hot_vector_1_fp
one_hot_vector_1_fp_nonint
pad_exterior_1d
pad_exterior_2d_0x0
pad_exterior_2d_0x3
pad_exterior_2d_3x0
pad_exterior_4d_1x2x2x2
pad_interior_1d
pad_interior_exterior_1d
pad_interior_exterior_2d
pad_interior_exterior_4d_2x0x3x2
power
product_3d_eliminate_zero_dim
......