Commit 134b0ae2 authored by shssf, committed by Scott Cyphers

IntelGPU backend: BatchNorm, Dot, Pad operations optimization (#1393)

parent 9c1c5b59
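
This change replaces the hand-rolled index-loop emission in the IntelGPU custom OpenCL kernels for BatchNorm, Pad, and Dot with a shared `generate_loops` helper. Besides shrinking the emitters, the helper returns the kernel's global work size (`gws`), which is now passed to `cldnn::custom_gpu_primitive` in place of the previous hard-coded `{1}`, so the work is spread across work items instead of a single one.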
@@ -216,6 +216,7 @@ void runtime::intelgpu::do_batch_norm_operation(cldnn::topology& topology,
     const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
     const string entry_point_name = "batch_norm_" + output_name;
     codegen::CodeWriter writer;
+    vector<size_t> gws;
 
     writer << "__kernel void " << entry_point_name << "( const __global float input"
            << array_dims(input_shape) << ", const __global float gamma" << array_dims(gamma_shape)
@@ -227,45 +228,17 @@ void runtime::intelgpu::do_batch_norm_operation(cldnn::topology& topology,
     writer.block_begin();
     { // Main function body
-        // Loop for Channel axis 1
-        writer << "for (uint i" << channel_axis << " = 0; i" << channel_axis << " < "
-               << output_shape.at(channel_axis) << "; ++i" << channel_axis << ")\n";
-        writer.block_begin();
-        {
-            size_t var_idx = 0;
-            // Main loops
-            for (auto const& i : output_shape)
-            {
-                if (var_idx != channel_axis)
-                {
-                    writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i
-                           << "; ++i" << var_idx << ")\n";
-                    writer.block_begin();
-                }
-                ++var_idx;
-            }
+        gws = generate_loops(writer, output_shape, true);
 
         writer << "float normalized = (input" << access_dims(input_shape) << " - mean[i"
                << channel_axis << "]) / ("
                << "sqrt(variance[i" << channel_axis << "] + " << eps << ")"
                << ");\n";
 
-        writer << "output" << access_dims(output_shape) << " = normalized * gamma[i"
-               << channel_axis << "] + beta[i" << channel_axis << "];\n";
+        writer << "output" << access_dims(output_shape) << " = normalized * gamma[i" << channel_axis
+               << "] + beta[i" << channel_axis << "];\n";
 
-            var_idx = 0;
-            // Closing brackets for main loops
-            for (auto const& i : output_shape)
-            {
-                if (var_idx != channel_axis)
-                {
-                    writer.block_end();
-                }
-                ++var_idx;
-            }
-        } // Closing brackets for Channel axis loop
-        writer.block_end();
+        generate_loops(writer, output_shape, false);
     } // Main function body
     writer.block_end();
@@ -279,6 +252,6 @@ void runtime::intelgpu::do_batch_norm_operation(cldnn::topology& topology,
                                                     get_kernel_args(5, 1),
                                                     "",
                                                     layout,
-                                                    {1});
+                                                    gws);
     topology.add(op_batch_norm);
 }
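
To make the effect concrete, here is a plausible shape of the kernel the refactored emitter produces, for a hypothetical `Shape{1, 2, 3, 4}` with channel axis 1, eps 0.001, and output name `BN_0`. The parameter order after `gamma` and the exact work-item/loop split are assumptions, since the body of `generate_loops` is outside this diff (see the reconstruction further down):

```cpp
// Illustrative only: a guess at the emitted OpenCL source, assuming the
// helper maps the first three dimensions to get_global_id and emits a
// plain loop for the fourth. Names, parameter order, and eps are hypothetical.
const char* emitted_batch_norm = R"cl(
__kernel void batch_norm_BN_0(const __global float input[1][2][3][4],
                              const __global float gamma[2],
                              const __global float beta[2],
                              const __global float mean[2],
                              const __global float variance[2],
                              __global float output[1][2][3][4])
{
    const uint i0 = get_global_id(0);
    const uint i1 = get_global_id(1);
    const uint i2 = get_global_id(2);
    for (uint i3 = 0; i3 < 4; ++i3)
    {
        float normalized = (input[i0][i1][i2][i3] - mean[i1]) / (sqrt(variance[i1] + 0.001));
        output[i0][i1][i2][i3] = normalized * gamma[i1] + beta[i1];
    }
}
)cl";
```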
@@ -18,7 +18,6 @@
 #include <CPP/custom_gpu_primitive.hpp>
 #include <CPP/reshape.hpp>
 
-#include "ngraph/runtime/intelgpu/code_writer.hpp"
 #include "ngraph/runtime/intelgpu/intelgpu_layout.hpp"
 #include "ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp"
 
@@ -88,7 +87,9 @@ string
     return buffer;
 }
 
-static vector<size_t> generate_loops(codegen::CodeWriter& writer, const Shape& shape, bool is_begin)
+vector<size_t> runtime::intelgpu::generate_loops(codegen::CodeWriter& writer,
+                                                 const Shape& shape,
+                                                 bool is_begin)
 {
     const size_t cldnn_gws_lim = 3;
     vector<size_t> gws;
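
The helper's body falls outside the visible hunk. From its signature, the `cldnn_gws_lim = 3` constant, and the paired begin/end call sites, a plausible reconstruction is the sketch below: the first (at most three) dimensions become OpenCL work-item indices and feed the returned global work size, while any remaining dimensions are emitted as ordinary `for` loops that the `is_begin == false` pass closes.

```cpp
// Plausible reconstruction of the helper; the actual body is not shown in this diff.
// Assumes the file's existing context (using namespace std and ngraph, codegen includes).
vector<size_t> runtime::intelgpu::generate_loops(codegen::CodeWriter& writer,
                                                 const Shape& shape,
                                                 bool is_begin)
{
    const size_t cldnn_gws_lim = 3; // clDNN accepts at most three global dimensions
    vector<size_t> gws;
    size_t var_idx = 0;

    for (auto const& i : shape)
    {
        if (var_idx < cldnn_gws_lim)
        {
            if (is_begin)
            {
                // Leading dimensions: one work item per index, no loop emitted
                writer << "const uint i" << var_idx << " = get_global_id(" << var_idx << ");\n";
                gws.push_back(i);
            }
        }
        else if (is_begin)
        {
            // Remaining dimensions stay as ordinary loops inside the kernel
            writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
                   << var_idx << ")\n";
            writer.block_begin();
        }
        else
        {
            writer.block_end(); // the is_begin == false pass closes them
        }
        ++var_idx;
    }

    if (gws.empty())
    {
        gws.push_back(1); // scalar shape: a single work item
    }
    return gws;
}
```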
@@ -170,6 +171,7 @@ void runtime::intelgpu::do_pad_operation(cldnn::topology& topology,
 {
     const string entry_point_name = "op_pad_kernel_" + output_name;
     codegen::CodeWriter writer;
+    vector<size_t> gws;
 
     // The kernel name and parameters
     writer << "__kernel void " << entry_point_name << "(const __global float input"
@@ -179,26 +181,16 @@ void runtime::intelgpu::do_pad_operation(cldnn::topology& topology,
     writer.block_begin();
     {
         // Loop for Broadcast scalar over full output tensor
-        size_t var_idx = 0;
-        for (auto const& i : output_shape)
-        {
-            writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
-                   << var_idx << ")\n";
-            writer.block_begin();
-            ++var_idx;
-        }
+        gws = runtime::intelgpu::generate_loops(writer, output_shape, true);
 
         writer << "output" << access_dims(output_shape) << " = scalar[0];\n";
 
         // Closing brackets for Broadcast loop
-        for (auto const& i : output_shape)
-        {
-            writer.block_end();
-        }
+        runtime::intelgpu::generate_loops(writer, output_shape, false);
 
         // Loop for Copy input matrix into output matrix with padding.
         // Padding include "pad_below" and "pad_interior" according nGraph documentation
-        var_idx = 0;
+        size_t var_idx = 0;
         for (auto const& i : input_shape)
         {
             writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
@@ -220,15 +212,15 @@ void runtime::intelgpu::do_pad_operation(cldnn::topology& topology,
     writer.block_end();
 
     const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
-    const cldnn::custom_gpu_primitive op_scalar(output_name,
-                                                {input_name, scalar_name},
-                                                {writer.get_code()},
-                                                entry_point_name,
-                                                get_kernel_args(2, 1),
-                                                "",
-                                                layout,
-                                                {1});
-    topology.add(op_scalar);
+    const cldnn::custom_gpu_primitive op_pad(output_name,
+                                             {input_name, scalar_name},
+                                             {writer.get_code()},
+                                             entry_point_name,
+                                             get_kernel_args(2, 1),
+                                             "",
+                                             layout,
+                                             gws);
+    topology.add(op_pad);
 }
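
Two incidental fixes ride along in the pad emitter: the primitive variable is renamed from `op_scalar` to `op_pad` to match the operation it builds, and the broadcast loops now supply the kernel's global work size instead of the hard-coded `{1}`. Note that only the broadcast nest over `output_shape` is converted; the copy nest over `input_shape` that follows is still emitted as plain `for` loops.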
@@ -256,7 +248,7 @@ static void do_1d_scalar_mul(codegen::CodeWriter& writer,
     writer.block_end();
 }
 
-static void do_2d_2d_mul(codegen::CodeWriter& writer,
+static vector<size_t> do_2d_2d_mul(codegen::CodeWriter& writer,
                                    string& kernel_name,
                                    const Shape& shapeA,
                                    const Shape& shapeB,
@@ -264,6 +256,7 @@ static void do_2d_2d_mul(codegen::CodeWriter& writer,
 {
     const size_t colrow = shapeA.at(1);
     kernel_name += "_do_2d_2d_mul";
+    vector<size_t> gws;
 
     writer << "__kernel void " << kernel_name << "(const __global float inputA"
            << runtime::intelgpu::array_dims(shapeA) << ", const __global float inputB"
@@ -273,13 +266,7 @@ static void do_2d_2d_mul(codegen::CodeWriter& writer,
     {
         size_t var_idx = 0;
         // Main loops
-        for (auto const& i : shapeZ)
-        {
-            writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
-                   << var_idx << ")\n";
-            writer.block_begin();
-            ++var_idx;
-        }
+        gws = runtime::intelgpu::generate_loops(writer, shapeZ, true);
 
         // Inner loop
         writer << "float sum = 0.0f;\n";
 
@@ -292,15 +279,14 @@ static void do_2d_2d_mul(codegen::CodeWriter& writer,
         writer << "output[i0][i1] = sum;\n";
 
         // Closing brackets for main loops
-        for (auto const& i : shapeZ)
-        {
-            writer.block_end();
-        }
+        runtime::intelgpu::generate_loops(writer, shapeZ, false);
     }
     writer.block_end();
+
+    return gws;
 }
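
The same three-part change (a `vector<size_t>` return type, a local `gws`, and paired `generate_loops` calls replacing the hand-written loop nests) is applied verbatim to `do_3d_3d_mul` and `do_3d_2d_mul` below; only the output indexing differs.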
 
-static void do_3d_3d_mul(codegen::CodeWriter& writer,
+static vector<size_t> do_3d_3d_mul(codegen::CodeWriter& writer,
                                    string& kernel_name,
                                    const Shape& shapeA,
                                    const Shape& shapeB,
@@ -308,6 +294,7 @@ static void do_3d_3d_mul(codegen::CodeWriter& writer,
 {
     const size_t colrow = shapeA.back();
     kernel_name += "_do_3d_3d_mul";
+    vector<size_t> gws;
 
     writer << "__kernel void " << kernel_name << "(const __global float inputA"
            << runtime::intelgpu::array_dims(shapeA) << ", const __global float inputB"
@@ -317,13 +304,7 @@ static void do_3d_3d_mul(codegen::CodeWriter& writer,
     {
         size_t var_idx = 0;
         // Main loops
-        for (auto const& i : shapeZ)
-        {
-            writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
-                   << var_idx << ")\n";
-            writer.block_begin();
-            ++var_idx;
-        }
+        gws = runtime::intelgpu::generate_loops(writer, shapeZ, true);
 
         // Inner loop
         writer << "float sum = 0.0f;\n";
 
@@ -336,15 +317,14 @@ static void do_3d_3d_mul(codegen::CodeWriter& writer,
         writer << "output[i0][i1][i2][i3] = sum;\n";
 
         // Closing brackets for main loops
-        for (auto const& i : shapeZ)
-        {
-            writer.block_end();
-        }
+        runtime::intelgpu::generate_loops(writer, shapeZ, false);
     }
     writer.block_end();
+
+    return gws;
 }
 
-static void do_3d_2d_mul(codegen::CodeWriter& writer,
+static vector<size_t> do_3d_2d_mul(codegen::CodeWriter& writer,
                                    string& kernel_name,
                                    const Shape& shapeA,
                                    const Shape& shapeB,
@@ -352,6 +332,7 @@ static void do_3d_2d_mul(codegen::CodeWriter& writer,
 {
     const size_t colrow = shapeA.back();
     kernel_name += "_do_3d_2d_mul";
+    vector<size_t> gws;
 
     writer << "__kernel void " << kernel_name << "(const __global float inputA"
            << runtime::intelgpu::array_dims(shapeA) << ", const __global float inputB"
@@ -361,13 +342,7 @@ static void do_3d_2d_mul(codegen::CodeWriter& writer,
     {
         size_t var_idx = 0;
         // Main loops
-        for (auto const& i : shapeZ)
-        {
-            writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i << "; ++i"
-                   << var_idx << ")\n";
-            writer.block_begin();
-            ++var_idx;
-        }
+        gws = runtime::intelgpu::generate_loops(writer, shapeZ, true);
 
         // Inner loop
         writer << "float sum = 0.0f;\n";
 
@@ -380,33 +355,34 @@ static void do_3d_2d_mul(codegen::CodeWriter& writer,
         writer << "output[i0][i1][i2] = sum;\n";
 
         // Closing brackets for main loops
-        for (auto const& i : shapeZ)
-        {
-            writer.block_end();
-        }
+        runtime::intelgpu::generate_loops(writer, shapeZ, false);
     }
     writer.block_end();
+
+    return gws;
 }
 
-static void do_2d_1d_mul(codegen::CodeWriter& writer,
-                         string& kernel_name,
-                         const Shape& shapeA,
-                         const Shape& shapeB)
+static vector<size_t> do_2d_1d_mul(codegen::CodeWriter& writer,
+                                   string& kernel_name,
+                                   const Shape& shapeA,
+                                   const Shape& shapeB,
+                                   const Shape& shapeZ)
 {
-    const size_t rows = shapeA.at(0);
     const size_t colrow = shapeA.at(1);
     kernel_name += "_do_2d_1d_mul";
+    vector<size_t> gws;
 
     writer << "__kernel void " << kernel_name << "(const __global float inputA"
            << runtime::intelgpu::array_dims(shapeA) << ", const __global float inputB"
            << runtime::intelgpu::array_dims(shapeB) << ", __global float output"
-           << runtime::intelgpu::array_dims({rows}) << ")\n";
+           << runtime::intelgpu::array_dims(shapeZ) << ")\n";
-    writer.block_begin();
-    {
-        writer << "for (uint i0 = 0; i0 < " << rows << "; ++i0)\n";
     writer.block_begin();
     {
+        // Main loops
+        gws = runtime::intelgpu::generate_loops(writer, shapeZ, true);
+
         writer << "float sum = 0.0f;\n";
+        // Inner loop
         writer << "for (uint i1 = 0; i1 < " << colrow << "; ++i1)\n";
         writer.block_begin();
         {
 
@@ -414,10 +390,13 @@ static void do_2d_1d_mul(codegen::CodeWriter& writer,
         }
         writer.block_end();
 
         writer << "output[i0] = sum;\n";
+
+        // Closing brackets for main loops
+        runtime::intelgpu::generate_loops(writer, shapeZ, false);
     }
     writer.block_end();
-    }
-    writer.block_end();
+
+    return gws;
 }
 
 static void do_scalar_scalar_mul(codegen::CodeWriter& writer, string& kernel_name)
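
`do_2d_1d_mul` gains a `shapeZ` parameter so its loop bounds and output declaration come from the actual output shape rather than a locally computed `rows`. For a hypothetical `shapeA = {4, 8}` and `shapeB = {8}` (so `shapeZ = {4}`), it would plausibly emit something like the sketch below, with the single output dimension mapped to a work item and the reduction kept as an inner loop; the multiply-accumulate line sits in an elided hunk and is inferred:

```cpp
// Illustrative only: a guess at the emitted OpenCL for output name "out".
const char* emitted_dot_2d_1d = R"cl(
__kernel void dot_out_do_2d_1d_mul(const __global float inputA[4][8],
                                   const __global float inputB[8],
                                   __global float output[4])
{
    const uint i0 = get_global_id(0); // one work item per output element
    float sum = 0.0f;
    for (uint i1 = 0; i1 < 8; ++i1)
    {
        sum += inputA[i0][i1] * inputB[i1];
    }
    output[i0] = sum;
}
)cl";
```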
@@ -473,6 +452,7 @@ void runtime::intelgpu::do_dot_operation(cldnn::topology& topology,
     const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
     string entry_point_name = "dot_" + output_name;
     codegen::CodeWriter writer;
+    vector<size_t> gws = {1};
 
     const bool A_is_scalar = inputA_shape.empty();
     const bool B_is_scalar = inputB_shape.empty();
@@ -494,19 +474,19 @@ void runtime::intelgpu::do_dot_operation(cldnn::topology& topology,
     {
         if (inputA_shape.size() == 2 && inputB_shape.size() == 1)
         {
-            do_2d_1d_mul(writer, entry_point_name, inputA_shape, inputB_shape);
+            gws = do_2d_1d_mul(writer, entry_point_name, inputA_shape, inputB_shape, output_shape);
         }
         else if (inputA_shape.size() == 2 && inputB_shape.size() == 2)
         {
-            do_2d_2d_mul(writer, entry_point_name, inputA_shape, inputB_shape, output_shape);
+            gws = do_2d_2d_mul(writer, entry_point_name, inputA_shape, inputB_shape, output_shape);
         }
         else if (inputA_shape.size() == 3 && inputB_shape.size() == 3)
         {
-            do_3d_3d_mul(writer, entry_point_name, inputA_shape, inputB_shape, output_shape);
+            gws = do_3d_3d_mul(writer, entry_point_name, inputA_shape, inputB_shape, output_shape);
         }
         else if (inputA_shape.size() == 3 && inputB_shape.size() == 2)
         {
-            do_3d_2d_mul(writer, entry_point_name, inputA_shape, inputB_shape, output_shape);
+            gws = do_3d_2d_mul(writer, entry_point_name, inputA_shape, inputB_shape, output_shape);
         }
         else
         {
@@ -518,7 +498,6 @@ void runtime::intelgpu::do_dot_operation(cldnn::topology& topology,
         do_dot_operation_error(inputA_shape, inputB_shape, output_shape);
     }
 
-    //cout << writer.get_code() << endl;
     const cldnn::custom_gpu_primitive op_dot(output_name,
                                              {inputA_name, inputB_name},
                                              {writer.get_code()},
@@ -526,7 +505,7 @@ void runtime::intelgpu::do_dot_operation(cldnn::topology& topology,
                                              get_kernel_args(2, 1),
                                              "",
                                              layout,
-                                             {1});
+                                             gws);
     topology.add(op_dot);
 }
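
`gws` starts as `{1}` because the scalar paths (`do_1d_scalar_mul`, `do_scalar_scalar_mul`) still emit single-work-item kernels and leave it untouched; the matrix paths overwrite it with the work size returned by their helper. A leftover debug `cout` of the generated source is also removed.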
@@ -18,6 +18,8 @@
 #include <CPP/topology.hpp>
 
+#include "ngraph/runtime/intelgpu/code_writer.hpp"
+
 #include "ngraph/axis_set.hpp"
 #include "ngraph/coordinate.hpp"
 #include "ngraph/shape.hpp"
@@ -96,6 +98,8 @@ namespace ngraph
             std::string access_dims(const Shape& dimentions,
                                     const AxisSet& axis = {},
                                     bool is_reversed = false);
+            std::vector<size_t>
+                generate_loops(codegen::CodeWriter& writer, const Shape& shape, bool is_begin);
         }
     }
 }
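
Since the header now declares `generate_loops` with a `codegen::CodeWriter&` parameter, the `code_writer.hpp` include moves from the .cpp file (removed earlier in this diff) into the header.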