Commit 3e0f87bf authored by Anna Alberska's avatar Anna Alberska Committed by Robert Kimball

IntelGPU backend: Gemm operation (#2904)

* add gemm operation

* style apply

* erase if statement

* fix the test

* enable tests
parent c67892af
......@@ -805,6 +805,53 @@ shared_ptr<runtime::Executable>
}
break;
}
case OP_TYPEID::Gemm:
{
    arguments_check(op, 3, 1);

    // Gemm computes alpha * op(A) * op(B) + beta * C, where op() optionally
    // transposes its argument.
    const shared_ptr<op::Gemm> gemm_op = static_pointer_cast<op::Gemm>(op);
    const double alpha = gemm_op->get_alpha();
    const double beta = gemm_op->get_beta();
    const bool transA = gemm_op->get_transA();
    const bool transB = gemm_op->get_transB();

    if (op->get_input_element_type(0) == element::f32 &&
        op->get_input_element_type(1) == element::f32 &&
        op->get_input_element_type(2) == element::f32 &&
        op->get_output_element_type(0) == element::f32)
    {
        // Fast path: the clDNN gemm primitive handles f32 directly.
        // Named gemm_prim (not gemm_op) to avoid shadowing the ngraph op above.
        const cldnn::gemm gemm_prim(op->get_output_tensor_name(0),
                                    op->get_input_tensor_name(0),
                                    op->get_input_tensor_name(1),
                                    op->get_input_tensor_name(2),
                                    transA,
                                    transB,
                                    static_cast<float>(alpha),
                                    static_cast<float>(beta));
        topology.add(gemm_prim);
    }
    else if (alpha == 1.0 && beta == 0.0 && !transA && !transB)
    {
        // With default attributes Gemm degenerates to a plain dot product.
        do_dot_operation(topology,
                         op->get_input_tensor_name(0),
                         op->get_input_shape(0),
                         op->get_input_tensor_name(1),
                         op->get_input_shape(1),
                         op->get_output_tensor_name(0),
                         op->get_output_shape(0),
                         op->get_output_element_type(0),
                         0);
    }
    else
    {
        // General case for non-f32 types: emit a custom OpenCL kernel.
        kern.emit<op::Gemm>(gemm_op);
    }
    break;
}
case OP_TYPEID::MaxPool:
{
arguments_check(op, 1, 1);
......@@ -1990,7 +2037,6 @@ shared_ptr<runtime::Executable>
case OP_TYPEID::Erf:
case OP_TYPEID::Gather:
case OP_TYPEID::GatherND:
case OP_TYPEID::Gemm:
case OP_TYPEID::GenerateMask:
case OP_TYPEID::HardSigmoid:
case OP_TYPEID::MVN:
......
......@@ -31,6 +31,7 @@
#include "ngraph/op/convolution.hpp"
#include "ngraph/op/equal.hpp"
#include "ngraph/op/fused/conv_fused.hpp"
#include "ngraph/op/fused/gemm.hpp"
#include "ngraph/op/fused/group_conv.hpp"
#include "ngraph/op/greater.hpp"
#include "ngraph/op/greater_eq.hpp"
......@@ -134,6 +135,7 @@ private:
krnl_info build_krnl(const std::shared_ptr<op::ConvolutionBiasAdd>& op) const;
krnl_info build_krnl(const std::shared_ptr<op::ConvolutionBiasBackpropFiltersBias>& op) const;
krnl_info build_krnl(const std::shared_ptr<op::Equal>& op) const;
krnl_info build_krnl(const std::shared_ptr<op::Gemm>& op) const;
krnl_info build_krnl(const std::shared_ptr<op::Greater>& op) const;
krnl_info build_krnl(const std::shared_ptr<op::GreaterEq>& op) const;
krnl_info build_krnl(const std::shared_ptr<op::Less>& op) const;
......
......@@ -1044,6 +1044,111 @@ void runtime::intelgpu::do_dot_operation(cldnn::topology& topology,
topology.add(op_dot);
}
CustomKernels::krnl_info CustomKernels::build_krnl(const shared_ptr<op::Gemm>& op) const
{
    // Generates an OpenCL kernel computing
    //   output = alpha * op(A) * op(B) + beta * C
    // where op() optionally transposes, per the Gemm attributes.
    // input0 = A, input1 = B, input2 = C (C may be scalar, 1-D, or 2-D and is
    // broadcast accordingly). One work item per output element (2-D NDRange).
    const string& input0_name = op->get_input_tensor_name(0);
    const Shape& input0_shape = op->get_input_shape(0);
    const string& input1_name = op->get_input_tensor_name(1);
    const Shape& input1_shape = op->get_input_shape(1);
    const string& input2_name = op->get_input_tensor_name(2);
    const Shape& input2_shape = op->get_input_shape(2);
    const string& output_name = op->get_output_tensor_name(0);
    const Shape& output_shape = op->get_output_shape(0);
    const element::Type& output_type = op->get_output_element_type(0);
    const double alpha = op->get_alpha();
    const double beta = op->get_beta();
    const bool transA = op->get_transA();
    const bool transB = op->get_transB();
    const string entry_point_name = "gemm_" + output_name;
    const string type_name = get_opencl_type_name(output_type);
    CodeWriter writer;
    vector<size_t> gws;

    gen_func_def(writer,
                 entry_point_name,
                 {3, type_name},
                 {input0_shape, input1_shape, input2_shape},
                 type_name,
                 output_shape);

    writer.block_begin();
    {
        // Stage 1: pre-compute beta * C into a scratch array, broadcasting C
        // over the output shape. Every work item executes this full loop nest;
        // the cost is accepted here for simplicity of the generated code.
        writer << type_name << " temp[" << output_shape.at(0) << "][" << output_shape.at(1)
               << "];\n";
        writer << "for(uint i0 = 0; i0 < " << output_shape.at(0) << "; ++i0)\n";
        writer.block_begin();
        {
            writer << "for(uint i1 = 0; i1 < " << output_shape.at(1) << "; ++i1)\n";
            writer.block_begin();
            {
                // Pick C's indexing by rank: scalar, row vector, or full matrix.
                string input2_coords;
                if (input2_shape.empty())
                {
                    input2_coords = "[0]";
                }
                else if (input2_shape.size() == 1)
                {
                    input2_coords = "[i1]";
                }
                else
                {
                    input2_coords = "[i0][i1]";
                }
                writer << "temp[i0][i1] = input2" << input2_coords << " * " << beta << ";\n";
            }
            writer.block_end();
        }
        writer.block_end();

        // Stage 2: each work item (i0, i1) computes one output element.
        writer << "const uint i0 = get_global_id(0);";
        gws.push_back(output_shape.at(0));
        writer << "/*trip count " << output_shape.at(0) << "*/\n";
        writer.block_begin();
        {
            writer << "const uint i1 = get_global_id(1);";
            gws.push_back(output_shape.at(1));
            writer << "/*trip count " << output_shape.at(1) << "*/\n";
            writer.block_begin();
            {
                // Zero literal must match the OpenCL scalar type.
                string acc;
                if (type_name == "float")
                {
                    acc = "0.0f";
                }
                else
                {
                    acc = "0.0";
                }
                writer << type_name << " acc = " << acc << ";\n";

                // Reduction length: rows of A if transposed, else columns of A.
                const size_t k_coord = transA ? input0_shape.at(0) : input0_shape.at(1);
                writer << "for (uint k=0; k < " << k_coord << "; ++k)\n";
                writer.block_begin();
                {
                    // Swap subscripts to read A/B transposed without copying.
                    const string input0_coord = transA ? "[k][i0]" : "[i0][k]";
                    const string input1_coord = transB ? "[i1][k]" : "[k][i1]";
                    writer << "acc += input0" << input0_coord << " * input1" << input1_coord
                           << ";\n";
                }
                writer.block_end();
                writer << "output[i0][i1] = acc * " << alpha << " + temp[i0][i1];\n";
            }
            writer.block_end();
        }
        writer.block_end();
    }
    writer.block_end();

    const CustomKernelInfo krn_ret(output_name,
                                   output_shape,
                                   output_type,
                                   {input0_name, input1_name, input2_name},
                                   {writer.get_code()},
                                   entry_point_name,
                                   gws);
    return {krn_ret};
}
CustomKernels::krnl_info CustomKernels::build_krnl(const shared_ptr<op::Slice>& op) const
{
const string& input_name = op->get_input_tensor_name(0);
......
......@@ -72,8 +72,6 @@ normalize_across_hw_w_scale
normalize_invalid_input_tensor_rank
normalize_invalid_scale_rank
normalize
gemm
gemm_broadcast_input_C
hardsigmoid
mvn_mean_normalization
mvn_mean_normalization_split_channels
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment