Commit 5927bbe4 authored by shssf, committed by Robert Kimball

IntelGPU backend: Dot operation (partially implemented) (#1275)

* IntelGPU backend: Dot operation (partially implemented)

* PR1275. Debug output deleted.

* PR1275. Comments addressed
parent c007740b
@@ -219,6 +219,28 @@ bool runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
const cldnn::data op_const(output_name, mem);
topology.add(op_const);
}
else if ("Dot" == op->description())
{
arguments_check(op, 2, 1);
const string& inputA_name = op->get_inputs().at(0).get_tensor().get_name();
const Shape& inputA_shape = op->get_inputs().at(0).get_shape();
const string& inputB_name = op->get_inputs().at(1).get_tensor().get_name();
const Shape& inputB_shape = op->get_inputs().at(1).get_shape();
const string& output_name = op->get_outputs().begin()->get_tensor().get_name();
const Shape& output_shape = op->get_outputs().begin()->get_shape();
const element::Type& output_type =
op->get_outputs().begin()->get_tensor().get_element_type();
do_dot_operation(topology,
inputA_name,
inputA_shape,
inputB_name,
inputB_shape,
output_name,
output_shape,
output_type);
}
else if ("MaxPool" == op->description())
{
arguments_check(op, 1, 1);
@@ -349,15 +371,15 @@ bool runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
const Shape& pad_below = pad->get_padding_below();
const Shape& pad_interior = pad->get_padding_interior();
do_pad_kernel(topology,
input_name,
input_shape,
scalar_name,
output_name,
output_shape,
output_type,
pad_below,
pad_interior);
do_pad_operation(topology,
input_name,
input_shape,
scalar_name,
output_name,
output_shape,
output_type,
pad_below,
pad_interior);
}
else if ("BatchNorm" == op->description())
{
......
@@ -14,6 +14,7 @@
* limitations under the License.
*******************************************************************************/
#include <CPP/concatenation.hpp>
#include <CPP/custom_gpu_primitive.hpp>
#include <CPP/reshape.hpp>
@@ -45,12 +46,13 @@ static string access_dims(const Shape& dimentions, const AxisSet& axis = {})
size_t var_idx = 0;
string buffer;
for (auto i = dimentions.cbegin(); i != dimentions.cend(); ++i, ++var_idx)
for (auto const& i : dimentions)
{
if (axis.find(var_idx) == axis.end())
{
buffer += "[i" + to_string(var_idx) + "]";
}
++var_idx;
}
return buffer;
@@ -62,26 +64,33 @@ static string
string buffer;
size_t var_idx = 0;
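// Each input index i along an axis maps to padded-output index
// i * (pad_interior + 1) + pad_below for that axis, as emitted below.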
for (auto i = dimentions.cbegin(); i != dimentions.cend(); ++i, ++var_idx)
for (auto const& i : dimentions)
{
buffer += "[i" + to_string(var_idx) + " * (" + to_string(pad_interior.at(var_idx)) +
" + 1) + " + to_string(pad_below.at(var_idx)) + "]";
++var_idx;
}
return buffer;
}
void runtime::intelgpu::do_pad_kernel(cldnn::topology& topology,
const string& input_name,
const Shape& input_shape,
const string& scalar_name,
const string& output_name,
const Shape& output_shape,
const element::Type& output_type,
const Shape& pad_below,
const Shape& pad_interior)
static void do_dot_operation_error(const Shape& shapeA, const Shape& shapeB, const Shape& shapeZ)
{
throw invalid_argument("IntelGPU Dot operation. Combination ShapeA" + array_dims(shapeA) +
", ShapeB" + array_dims(shapeB) + ", ShapeOutput" + array_dims(shapeZ) +
" is not supported.");
}
void runtime::intelgpu::do_pad_operation(cldnn::topology& topology,
const string& input_name,
const Shape& input_shape,
const string& scalar_name,
const string& output_name,
const Shape& output_shape,
const element::Type& output_type,
const Shape& pad_below,
const Shape& pad_interior)
{
const size_t input_count = shape_size<Shape>(output_shape);
const string entry_point_name = "op_pad_kernel";
codegen::CodeWriter writer;
@@ -94,11 +103,12 @@ void runtime::intelgpu::do_pad_kernel(cldnn::topology& topology,
{
// Loop to broadcast the scalar over the full output tensor
size_t var_idx = 0;
for (auto i = output_shape.cbegin(); i != output_shape.cend(); ++i, ++var_idx)
for (auto const& i : output_shape)
{
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << *i << "; ++i"
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
<< var_idx << ")\n";
writer.block_begin();
++var_idx;
}
writer << "output" << access_dims(output_shape) << " = scalar[0];\n";
@@ -112,11 +122,12 @@ void runtime::intelgpu::do_pad_kernel(cldnn::topology& topology,
// Loop to copy the input matrix into the output matrix with padding.
// Padding includes "pad_below" and "pad_interior" according to the nGraph documentation.
var_idx = 0;
for (auto i = input_shape.cbegin(); i != input_shape.cend(); ++i, ++var_idx)
for (auto const& i : input_shape)
{
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << *i << "; ++i"
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
<< var_idx << ")\n";
writer.block_begin();
++var_idx;
}
writer << "output" << access_dims_strided(input_shape, pad_below, pad_interior)
@@ -138,7 +149,300 @@ void runtime::intelgpu::do_pad_kernel(cldnn::topology& topology,
entry_point_name,
parameters_2inp_1out,
"",
layout,
{1});
layout);
topology.add(op_scalar);
}
static void do_1d_scalar_mul(codegen::CodeWriter& writer,
string& kernel_name,
const Shape& shapeA,
const Shape& shapeB)
{
const size_t countA = shapeA.empty() ? 0 : shape_size<Shape>(shapeA);
const size_t countB = shapeB.empty() ? 0 : shape_size<Shape>(shapeB);
const size_t countZ = max(countA, countB);
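// A count of 0 marks the scalar argument: it is indexed with a constant [0] in the
// generated kernel, while the tensor argument is indexed with the loop variable i1.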
kernel_name = "do_1d_scalar_mul";
writer << "__kernel void " << kernel_name << "(const __global float* inputA"
<< ", const __global float* inputB, __global float* output)\n";
writer.block_begin();
{
writer << "for (uint i1 = 0; i1 < " << countZ << "; ++i1)\n";
writer.block_begin();
{
writer << "output[i1] = inputA[" << (countA > 0 ? "i1" : "0") << "] * inputB["
<< (countB > 0 ? "i1" : "0") << "];\n";
}
writer.block_end();
}
writer.block_end();
}
static void do_2d_2d_mul(codegen::CodeWriter& writer,
string& kernel_name,
const Shape& shapeA,
const Shape& shapeB)
{
const size_t rows = shapeA.at(0);
const size_t colrow = shapeA.at(1);
const size_t cols = shapeB.back();
kernel_name = "do_2d_2d_mul";
writer << "__kernel void " << kernel_name << "(const __global float inputA"
<< array_dims(shapeA) << ", const __global float inputB" << array_dims(shapeB)
<< ", __global float output" << array_dims({rows, cols}) << ")\n";
writer.block_begin();
{
size_t var_idx = 0;
// Main loops over the output dimensions {rows, cols}
for (auto const& i : Shape{rows, cols})
{
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
<< var_idx << ")\n";
writer.block_begin();
++var_idx;
}
// Inner loop
writer << "float sum = 0.0f;\n";
writer << "for (uint i2 = 0; i2 < " << colrow << "; ++i2)\n";
writer.block_begin();
{
writer << "sum += inputA[i0][i2] * inputB[i2][i1];\n";
}
writer.block_end();
writer << "output[i0][i1] = sum;\n";
// Closing brackets for main loops
for (auto const& i : Shape{rows, cols})
{
writer.block_end();
}
}
writer.block_end();
}
static void do_3d_3d_mul(codegen::CodeWriter& writer,
string& kernel_name,
const Shape& shapeA,
const Shape& shapeB,
const Shape& shapeZ)
{
const size_t colrow = shapeA.back();
kernel_name = "do_3d_3d_mul";
writer << "__kernel void " << kernel_name << "(const __global float inputA"
<< array_dims(shapeA) << ", const __global float inputB" << array_dims(shapeB)
<< ", __global float output" << array_dims(shapeZ) << ")\n";
writer.block_begin();
{
size_t var_idx = 0;
// Main loops
for (auto const& i : shapeZ)
{
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
<< var_idx << ")\n";
writer.block_begin();
++var_idx;
}
// Inner loop
writer << "float sum = 0.0f;\n";
writer << "for (uint i4 = 0; i4 < " << colrow << "; ++i4)\n";
writer.block_begin();
{
writer << "sum += inputA[i0][i1][i4] * inputB[i4][i2][i3];\n";
}
writer.block_end();
writer << "output[i0][i1][i2][i3] = sum;\n";
// Closing brackets for main loops
for (auto const& i : shapeZ)
{
writer.block_end();
}
}
writer.block_end();
}
static void do_3d_2d_mul(codegen::CodeWriter& writer,
string& kernel_name,
const Shape& shapeA,
const Shape& shapeB,
const Shape& shapeZ)
{
const size_t colrow = shapeA.back();
kernel_name = "do_3d_2d_mul";
writer << "__kernel void " << kernel_name << "(const __global float inputA"
<< array_dims(shapeA) << ", const __global float inputB" << array_dims(shapeB)
<< ", __global float output" << array_dims(shapeZ) << ")\n";
writer.block_begin();
{
size_t var_idx = 0;
// Main loops
for (auto const& i : shapeZ)
{
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
<< var_idx << ")\n";
writer.block_begin();
++var_idx;
}
// Inner loop
writer << "float sum = 0.0f;\n";
writer << "for (uint i3 = 0; i3 < " << colrow << "; ++i3)\n";
writer.block_begin();
{
writer << "sum += inputA[i0][i1][i3] * inputB[i3][i2];\n";
}
writer.block_end();
writer << "output[i0][i1][i2] = sum;\n";
// Closing brackets for main loops
for (auto const& i : shapeZ)
{
writer.block_end();
}
}
writer.block_end();
}
static void do_2d_1d_mul(codegen::CodeWriter& writer,
string& kernel_name,
const Shape& shapeA,
const Shape& shapeB)
{
const size_t rows = shapeA.at(0);
const size_t colrow = shapeA.at(1);
kernel_name = "do_2d_1d_mul";
writer << "__kernel void " << kernel_name << "(const __global float inputA"
<< array_dims(shapeA) << ", const __global float inputB" << array_dims(shapeB)
<< ", __global float output" << array_dims({rows}) << ")\n";
writer.block_begin();
{
writer << "for (uint i0 = 0; i0 < " << rows << "; ++i0)\n";
writer.block_begin();
{
writer << "float sum = 0.0f;\n";
writer << "for (uint i1 = 0; i1 < " << colrow << "; ++i1)\n";
writer.block_begin();
{
writer << "sum += inputA[i0][i1] * inputB[i1];\n";
}
writer.block_end();
writer << "output[i0] = sum;\n";
}
writer.block_end();
}
writer.block_end();
}
static void do_scalar_scalar_mul(codegen::CodeWriter& writer, string& kernel_name)
{
kernel_name = "scalar_scalar_mul";
writer << "__kernel void " << kernel_name << "(const __global float inputA[1]"
<< ", const __global float inputB[1], __global float output[1])\n";
writer.block_begin();
{
writer << "output[0] = inputA[0] * inputB[0];\n";
}
writer.block_end();
}
static void do_1d_1d_mul(codegen::CodeWriter& writer, string& kernel_name, const Shape& shape)
{
if (shape.size() > 1)
{
throw invalid_argument("do_1d_1d_mul: Shape" + array_dims(shape) + " must be 1D");
}
const size_t& size = shape.front();
kernel_name = "do_1d_1d_mul";
writer << "__kernel void " << kernel_name << "(const __global float inputA" << array_dims(shape)
<< ", const __global float inputB" << array_dims(shape)
<< ", __global float output[1])\n";
writer.block_begin();
{
writer << "float sum = 0.0f;\n"
<< "for (uint i = 0; i < " << size << "; ++i)\n";
writer.block_begin();
{
writer << "sum += inputA[i] * inputB[i];\n";
}
writer.block_end();
writer << "output[0] = sum;\n";
}
writer.block_end();
}
void runtime::intelgpu::do_dot_operation(cldnn::topology& topology,
const string& inputA_name,
const Shape& inputA_shape,
const string& inputB_name,
const Shape& inputB_shape,
const string& output_name,
const Shape& output_shape,
const element::Type& output_type)
{
const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
string entry_point_name = "dot_unknown";
codegen::CodeWriter writer;
const bool A_is_scalar = inputA_shape.empty();
const bool B_is_scalar = inputB_shape.empty();
const bool Z_is_scalar = output_shape.empty();
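// Dispatch on the argument ranks. Only these combinations are currently handled:
// scalar*scalar, scalar*tensor, vector.vector, matrix.vector, matrix.matrix,
// 3D.2D and 3D.3D; every other combination goes to do_dot_operation_error().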
if (A_is_scalar && B_is_scalar && Z_is_scalar)
{
do_scalar_scalar_mul(writer, entry_point_name);
}
else if (((A_is_scalar && !B_is_scalar) || (!A_is_scalar && B_is_scalar)) && !Z_is_scalar)
{
do_1d_scalar_mul(writer, entry_point_name, inputA_shape, inputB_shape);
}
else if (!A_is_scalar && !B_is_scalar && Z_is_scalar)
{
do_1d_1d_mul(writer, entry_point_name, inputB_shape);
}
else if (!A_is_scalar && !B_is_scalar && !Z_is_scalar)
{
if (inputA_shape.size() == 2 && inputB_shape.size() == 1)
{
do_2d_1d_mul(writer, entry_point_name, inputA_shape, inputB_shape);
}
else if (inputA_shape.size() == 2 && inputB_shape.size() == 2)
{
do_2d_2d_mul(writer, entry_point_name, inputA_shape, inputB_shape);
}
else if (inputA_shape.size() == 3 && inputB_shape.size() == 3)
{
do_3d_3d_mul(writer, entry_point_name, inputA_shape, inputB_shape, output_shape);
}
else if (inputA_shape.size() == 3 && inputB_shape.size() == 2)
{
do_3d_2d_mul(writer, entry_point_name, inputA_shape, inputB_shape, output_shape);
}
else
{
do_dot_operation_error(inputA_shape, inputB_shape, output_shape);
}
}
else
{
do_dot_operation_error(inputA_shape, inputB_shape, output_shape);
}
//cout << writer.get_code() << endl;
const cldnn::custom_gpu_primitive op_dot(output_name,
{inputA_name, inputB_name},
{writer.get_code()},
entry_point_name,
parameters_2inp_1out,
"",
layout);
topology.add(op_dot);
}
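For illustration, the OpenCL-style kernel text that do_2d_1d_mul emits for a hypothetical matrix-vector case with inputA_shape {3, 4} and inputB_shape {4} (example shapes, not taken from this commit) looks roughly like the following, with whitespace normalized:

__kernel void do_2d_1d_mul(const __global float inputA[3][4],
                           const __global float inputB[4],
                           __global float output[3])
{
    // Plain serial loops: one row of inputA times inputB per output element
    for (uint i0 = 0; i0 < 3; ++i0)
    {
        float sum = 0.0f;
        for (uint i1 = 0; i1 < 4; ++i1)
        {
            sum += inputA[i0][i1] * inputB[i1];
        }
        output[i0] = sum;
    }
}

The generated source and entry point name are handed to cldnn::custom_gpu_primitive, so clDNN compiles and runs the kernel as an ordinary primitive in the topology.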
@@ -28,15 +28,24 @@ namespace ngraph
{
namespace intelgpu
{
void do_pad_kernel(cldnn::topology& topology,
const std::string& input_name,
const Shape& input_shape,
const std::string& scalar_name,
const std::string& output_name,
const Shape& output_shape,
const element::Type& output_type,
const Shape& pad_below,
const Shape& pad_interior);
void do_pad_operation(cldnn::topology& topology,
const std::string& input_name,
const Shape& input_shape,
const std::string& scalar_name,
const std::string& output_name,
const Shape& output_shape,
const element::Type& output_type,
const Shape& pad_below,
const Shape& pad_interior);
void do_dot_operation(cldnn::topology& topology,
const std::string& inputA_name,
const Shape& inputA_shape,
const std::string& inputB_name,
const Shape& inputB_shape,
const std::string& output_name,
const Shape& output_shape,
const element::Type& output_type);
}
}
}
@@ -34,13 +34,11 @@ backwards_concat_axis_1
backwards_concat_vector
backwards_cos
backwards_cosh
backwards_dot_scalar_scalar
backwards_dot_scalar_tensor
backwards_dot_tensor2_tensor2
backwards_dot_tensor3_tensor3
backwards_dot_tensor_scalar
backwards_dot_tensor_vector
backwards_dot_vector_vector
backwards_exp
backwards_floor
backwards_log
@@ -139,21 +137,12 @@ cos
cosh
divide_by_zero_int32
dot_0_0
dot1d
dot2d
dot_2x0_0
dot3d_2d
dot3d_3d
dot_matrix_0x2_2x0
dot_matrix_2x0_0x2
dot_matrix_3x2_2x0
dot_matrix_vector
dot_matrix_vector_4_3
dot_matrix_vector_int64
dot_scalar_0x2
dot_scalar_scalar
dot_scalar_tensor_arg0
dot_scalar_tensor_arg1
equal
exp
floor
@@ -216,14 +205,9 @@ one_hot_vector_1_barely_oob
one_hot_vector_1_far_oob
one_hot_vector_1_fp
one_hot_vector_1_fp_nonint
pad_exterior_1d
pad_exterior_2d_0x0
pad_exterior_2d_0x3
pad_exterior_2d_3x0
pad_exterior_4d_1x2x2x2
pad_interior_1d
pad_interior_exterior_1d
pad_interior_exterior_2d
pad_interior_exterior_4d_2x0x3x2
power
product_3d_eliminate_zero_dim
......