Commit 45b50d06 authored by shssf, committed by Robert Kimball

IntelGPU backend: BatchNorm operation completely redeveloped (#1318)

parent 39278e7d
@@ -533,35 +533,58 @@ bool runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
            }

            const string& output_name = op->get_outputs().begin()->get_tensor().get_name();
            const Shape& output_shape = op->get_outputs().begin()->get_shape();
            const element::Type& output_type =
                op->get_outputs().begin()->get_tensor().get_element_type();
            const string& gamma_name = op->get_inputs().at(0).get_tensor().get_name();
            const Shape& gamma_shape = op->get_inputs().at(0).get_shape();
            const string& beta_name = op->get_inputs().at(1).get_tensor().get_name();
            const string& input_name = op->get_inputs().at(2).get_tensor().get_name();
            const Shape& input_shape = op->get_inputs().at(2).get_shape();

            string mean_name;
            string variance_name;

            if (op->get_outputs().size() == 3)
            {
                arguments_check(op, 3, 3);

                mean_name = op->get_outputs().at(1).get_tensor().get_name();
                variance_name = op->get_outputs().at(2).get_tensor().get_name();

                do_create_mean(
                    topology, mean_name, gamma_shape, output_type, input_name, input_shape);
                do_create_variance(topology,
                                   variance_name,
                                   gamma_shape,
                                   output_type,
                                   input_name,
                                   input_shape,
                                   mean_name);
            }

            if (op->get_outputs().size() == 1 || op->get_outputs().size() == 3)
            {
                if (mean_name.empty() || variance_name.empty())
                {
                    arguments_check(op, 5, 1);

                    mean_name = op->get_inputs().at(3).get_tensor().get_name();
                    variance_name = op->get_inputs().at(4).get_tensor().get_name();
                }

                do_batch_norm_operation(topology,
                                        output_name,
                                        output_shape,
                                        output_type,
                                        eps,
                                        input_name,
                                        input_shape,
                                        gamma_name,
                                        gamma_shape,
                                        beta_name,
                                        mean_name,
                                        variance_name);
            }
            else
            {
                arguments_check(op, 5, 1); // throws an exception in this case
            }
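
For orientation (not part of the diff): the block above dispatches on the number of op outputs. A minimal standalone C++ sketch of that contract, with hypothetical names for the op metadata:

// Three outputs -> training mode: mean/variance are produced from the input.
// One output    -> inference mode: mean/variance arrive as inputs 3 and 4.
#include <cstddef>
#include <stdexcept>
#include <string>

struct BatchNormPlan
{
    std::string mean_name;     // tensor that feeds normalization
    std::string variance_name; // tensor that feeds normalization
    bool compute_stats;        // true when mean/variance kernels must be emitted
};

// "output_count" and the tensor-name parameters stand in for op metadata.
BatchNormPlan plan_batch_norm(std::size_t output_count,
                              const std::string& mean_out,
                              const std::string& variance_out,
                              const std::string& mean_in,
                              const std::string& variance_in)
{
    if (output_count == 3)
    {
        return {mean_out, variance_out, true}; // training: stats are outputs 1 and 2
    }
    if (output_count == 1)
    {
        return {mean_in, variance_in, false}; // inference: stats are inputs 3 and 4
    }
    throw std::invalid_argument("BatchNorm: unexpected number of outputs");
}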
@@ -16,9 +16,11 @@
#include <CPP/custom_gpu_primitive.hpp>

#include "ngraph/runtime/intelgpu/code_writer.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_layout.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_batchnorm.hpp"
@@ -27,109 +29,293 @@
using namespace std;
using namespace ngraph;

static vector<cldnn_arg> parameters_1inp_1out = {{arg_input, 0}, {arg_output, 0}};
static vector<cldnn_arg> parameters_2inp_1out = {{arg_input, 0}, {arg_input, 1}, {arg_output, 0}};
static vector<cldnn_arg> parameters_5inp_1out = {{arg_input, 0},
                                                 {arg_input, 1},
                                                 {arg_input, 2},
                                                 {arg_input, 3},
                                                 {arg_input, 4},
                                                 {arg_output, 0}};

// Builds an array declaration suffix, e.g. "[2][3][4]" for Shape{2, 3, 4}
static string array_dims(const Shape& dimensions)
{
    string buffer;

    for (auto const& dim : dimensions)
    {
        buffer += "[" + to_string(dim) + "]";
    }

    return buffer;
}

// Builds an element access suffix, e.g. "[i0][i1][i2]"; axes listed in "axis" are omitted
static string access_dims(const Shape& dimensions, const AxisSet& axis = {})
{
    size_t var_idx = 0;
    string buffer;

    for (auto const& i : dimensions)
    {
        if (axis.find(var_idx) == axis.end())
        {
            buffer += "[i" + to_string(var_idx) + "]";
        }
        ++var_idx;
    }

    return buffer;
}
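
For illustration (not in the diff), the expansions of the two helpers for a concrete shape; a minimal self-check, assuming it sits in the same translation unit as the static helpers and includes <cassert>:

// Expansions for Shape{2, 3, 4}:
//   array_dims({2, 3, 4})              -> "[2][3][4]"    (array declaration suffix)
//   access_dims({2, 3, 4})             -> "[i0][i1][i2]" (element access suffix)
//   access_dims({2, 3, 4}, AxisSet{1}) -> "[i0][i2]"     (channel axis 1 omitted)
static void self_check_dims_helpers()
{
    assert(array_dims(Shape{2, 3, 4}) == "[2][3][4]");
    assert(access_dims(Shape{2, 3, 4}) == "[i0][i1][i2]");
    assert(access_dims(Shape{2, 3, 4}, AxisSet{1}) == "[i0][i2]");
}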
void runtime::intelgpu::do_create_mean(cldnn::topology& topology,
                                       const string& output_name,
                                       const Shape& output_shape,
                                       const element::Type& output_type,
                                       const string& input_name,
                                       const Shape& input_shape)
{
    if (input_shape.size() < 2 || input_shape.size() > 4)
    {
        throw invalid_argument("intelgpu::do_create_mean() wrong input shape.");
    }

    // According to the documentation, the input data channel is always axis 1,
    // i.e. the second dimension from the left: {0, 1, 0, 0} or {0, 1}.
    // The input data must therefore be at least a 2D array.
    const size_t channel_axis = 1;
    const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
    const string entry_point_name = "create_mean_" + output_name;
    const size_t output_counts = shape_size<Shape>(input_shape) / input_shape.at(channel_axis);
    codegen::CodeWriter writer;

    writer << "__kernel void " << entry_point_name << "( const __global float input"
           << array_dims(input_shape) << ", __global float output" << array_dims(output_shape)
           << ")\n";

    writer.block_begin();
    { // Main function body

        // Loop over channel axis 1
        writer << "for (uint i" << channel_axis << " = 0; i" << channel_axis << " < "
               << input_shape.at(channel_axis) << "; ++i" << channel_axis << ")\n";
        writer.block_begin();
        {
            writer << "float sum = 0.0f;\n";
            size_t var_idx = 0;
            // Main loops
            for (auto const& i : input_shape)
            {
                if (var_idx != channel_axis)
                {
                    writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i
                           << "; ++i" << var_idx << ")\n";
                    writer.block_begin();
                }
                ++var_idx;
            }

            writer << "sum += input" << access_dims(input_shape) << ";\n";

            var_idx = 0;
            // Closing brackets for main loops
            for (auto const& i : input_shape)
            {
                if (var_idx != channel_axis)
                {
                    writer.block_end();
                }
                ++var_idx;
            }

            writer << "output[i" << channel_axis << "] = sum / " << output_counts << ";\n";
        } // Closing bracket for channel axis loop
        writer.block_end();
    } // Main function body
    writer.block_end();

    const cldnn::custom_gpu_primitive op_mean(output_name,
                                              {input_name},
                                              {writer.get_code()},
                                              entry_point_name,
                                              parameters_1inp_1out,
                                              "",
                                              layout,
                                              {1});
    topology.add(op_mean);
}
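
For a concrete picture (not in the diff): with input_shape {2, 3, 4} and a hypothetical output tensor named out, the generator above should emit, up to whitespace, the following OpenCL kernel:

__kernel void create_mean_out( const __global float input[2][3][4], __global float output[3])
{
    for (uint i1 = 0; i1 < 3; ++i1)
    {
        float sum = 0.0f;
        for (uint i0 = 0; i0 < 2; ++i0)
        {
            for (uint i2 = 0; i2 < 4; ++i2)
            {
                sum += input[i0][i1][i2];
            }
        }
        // 8 = shape_size({2, 3, 4}) / channels = 24 / 3
        output[i1] = sum / 8;
    }
}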
void runtime::intelgpu::do_create_variance(cldnn::topology& topology,
                                           const string& output_name,
                                           const Shape& output_shape,
                                           const element::Type& output_type,
                                           const string& input_name,
                                           const Shape& input_shape,
                                           const std::string& mean_name)
{
    if (input_shape.size() < 2 || input_shape.size() > 4)
    {
        throw invalid_argument("intelgpu::do_create_variance() wrong input shape.");
    }

    // As above: the input data channel is always axis 1 and the input
    // must be at least a 2D array.
    const size_t channel_axis = 1;
    const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
    const string entry_point_name = "create_variance_" + output_name;
    const size_t output_counts = shape_size<Shape>(input_shape) / input_shape.at(channel_axis);
    codegen::CodeWriter writer;

    writer << "__kernel void " << entry_point_name << "( const __global float input"
           << array_dims(input_shape) << ", const __global float mean" << array_dims(output_shape)
           << ", __global float output" << array_dims(output_shape) << ")\n";

    writer.block_begin();
    { // Main function body

        // Loop over channel axis 1
        writer << "for (uint i" << channel_axis << " = 0; i" << channel_axis << " < "
               << input_shape.at(channel_axis) << "; ++i" << channel_axis << ")\n";
        writer.block_begin();
        {
            writer << "float sum = 0.0f;\n";
            size_t var_idx = 0;
            // Main loops
            for (auto const& i : input_shape)
            {
                if (var_idx != channel_axis)
                {
                    writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i
                           << "; ++i" << var_idx << ")\n";
                    writer.block_begin();
                }
                ++var_idx;
            }

            writer << "float mean_diff = input" << access_dims(input_shape) << " - mean[i"
                   << channel_axis << "];\n";
            writer << "sum += mean_diff * mean_diff;\n";

            var_idx = 0;
            // Closing brackets for main loops
            for (auto const& i : input_shape)
            {
                if (var_idx != channel_axis)
                {
                    writer.block_end();
                }
                ++var_idx;
            }

            writer << "output[i" << channel_axis << "] = sum / " << output_counts << ";\n";
        } // Closing bracket for channel axis loop
        writer.block_end();
    } // Main function body
    writer.block_end();

    const cldnn::custom_gpu_primitive op_variance(output_name,
                                                  {input_name, mean_name},
                                                  {writer.get_code()},
                                                  entry_point_name,
                                                  parameters_2inp_1out,
                                                  "",
                                                  layout,
                                                  {1});
    topology.add(op_variance);
}
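
Both generators reduce over every axis except the channel axis. In formula form, for an NCHW input x with C channels and N = |x| / C elements per channel (note the biased estimator, matching the division by output_counts above):

\[
\mathrm{mean}_c = \frac{1}{N}\sum_{n,h,w} x_{n,c,h,w},
\qquad
\mathrm{var}_c = \frac{1}{N}\sum_{n,h,w}\bigl(x_{n,c,h,w} - \mathrm{mean}_c\bigr)^2
\]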
void runtime::intelgpu::do_batch_norm_operation(cldnn::topology& topology,
                                                const string& output_name,
                                                const Shape& output_shape,
                                                const element::Type& output_type,
                                                double eps,
                                                const string& input_name,
                                                const Shape& input_shape,
                                                const string& gamma_name,
                                                const Shape& gamma_shape,
                                                const string& beta_name,
                                                const string& mean_name,
                                                const string& variance_name)
{
    if (input_shape.size() < 2 || input_shape.size() > 4)
    {
        throw invalid_argument("intelgpu::do_batch_norm_operation() wrong input shape.");
    }

    // As above: the input data channel is always axis 1 and the input
    // must be at least a 2D array.
    const size_t channel_axis = 1;
    const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
    const string entry_point_name = "batch_norm_" + output_name;
    codegen::CodeWriter writer;

    writer << "__kernel void " << entry_point_name << "( const __global float input"
           << array_dims(input_shape) << ", const __global float gamma" << array_dims(gamma_shape)
           << ", const __global float beta" << array_dims(gamma_shape)
           << ", const __global float mean" << array_dims(gamma_shape)
           << ", const __global float variance" << array_dims(gamma_shape)
           << ", __global float output" << array_dims(output_shape) << ")\n";

    writer.block_begin();
    { // Main function body

        // Loop over channel axis 1
        writer << "for (uint i" << channel_axis << " = 0; i" << channel_axis << " < "
               << output_shape.at(channel_axis) << "; ++i" << channel_axis << ")\n";
        writer.block_begin();
        {
            size_t var_idx = 0;
            // Main loops
            for (auto const& i : output_shape)
            {
                if (var_idx != channel_axis)
                {
                    writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i
                           << "; ++i" << var_idx << ")\n";
                    writer.block_begin();
                }
                ++var_idx;
            }

            writer << "float normalized = (input" << access_dims(input_shape) << " - mean[i"
                   << channel_axis << "]) / ("
                   << "sqrt(variance[i" << channel_axis << "] + " << eps << ")"
                   << ");\n";

            writer << "output" << access_dims(output_shape) << " = normalized * gamma[i"
                   << channel_axis << "] + beta[i" << channel_axis << "];\n";

            var_idx = 0;
            // Closing brackets for main loops
            for (auto const& i : output_shape)
            {
                if (var_idx != channel_axis)
                {
                    writer.block_end();
                }
                ++var_idx;
            }
        } // Closing bracket for channel axis loop
        writer.block_end();
    } // Main function body
    writer.block_end();

    const vector<cldnn::primitive_id>& inputs = {
        input_name, gamma_name, beta_name, mean_name, variance_name};
    const cldnn::custom_gpu_primitive op_batch_norm(output_name,
                                                    inputs,
                                                    {writer.get_code()},
                                                    entry_point_name,
                                                    parameters_5inp_1out,
                                                    "",
                                                    layout,
                                                    {1});
    topology.add(op_batch_norm);
}
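
Likewise (not in the diff): with input and output shape {2, 3, 4}, eps = 0.001 and a hypothetical output tensor named out, the generator above should emit, up to whitespace, a single fused kernel:

__kernel void batch_norm_out( const __global float input[2][3][4],
                              const __global float gamma[3], const __global float beta[3],
                              const __global float mean[3], const __global float variance[3],
                              __global float output[2][3][4])
{
    for (uint i1 = 0; i1 < 3; ++i1)
    {
        for (uint i0 = 0; i0 < 2; ++i0)
        {
            for (uint i2 = 0; i2 < 4; ++i2)
            {
                float normalized = (input[i0][i1][i2] - mean[i1]) / (sqrt(variance[i1] + 0.001));
                output[i0][i1][i2] = normalized * gamma[i1] + beta[i1];
            }
        }
    }
}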
@@ -19,6 +19,7 @@
#include <CPP/topology.hpp>

#include "ngraph/shape.hpp"
#include "ngraph/type/element_type.hpp"

namespace ngraph
{
@@ -27,22 +28,36 @@ namespace ngraph
        namespace intelgpu
        {
            // This implements the BatchNorm nGraph operation.
            // nGraph uses channels in this operation but clDNN uses the full
            // input data at once, so the kernels are generated by hand here.
            void do_batch_norm_operation(cldnn::topology& topology,
                                         const std::string& output_name,
                                         const Shape& output_shape,
                                         const element::Type& output_type,
                                         double eps,
                                         const std::string& input_name,
                                         const Shape& input_shape,
                                         const std::string& gamma_name,
                                         const Shape& gamma_shape,
                                         const std::string& beta_name,
                                         const std::string& mean_name,
                                         const std::string& variance_name);

            // Computes the per-channel mean of the input matrix
            // (reduction over all axes except the channel axis)
            void do_create_mean(cldnn::topology& topology,
                                const std::string& output_name,
                                const Shape& output_shape,
                                const element::Type& output_type,
                                const std::string& input_name,
                                const Shape& input_shape);

            // Computes the per-channel variance of the input matrix
            // (reduction over all axes except the channel axis)
            void do_create_variance(cldnn::topology& topology,
                                    const std::string& output_name,
                                    const Shape& output_shape,
                                    const element::Type& output_type,
                                    const std::string& input_name,
                                    const Shape& input_shape,
                                    const std::string& mean_name);
        }
    }
}
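
For illustration (not part of the diff), a minimal sketch of how the backend chains these three entry points in training mode, with hypothetical tensor names; compare the compile() hunk above:

// Assumes this header and <CPP/topology.hpp> are included; "topology" is a live
// cldnn::topology owned by the backend.
void emit_batch_norm_training(cldnn::topology& topology, double eps)
{
    using namespace ngraph;
    using namespace ngraph::runtime::intelgpu;

    const Shape input_shape{8, 16, 28, 28}; // NCHW, channel axis 1
    const Shape channel_shape{16};          // per-channel mean/variance/gamma/beta
    const element::Type& type = element::f32;

    // 1. Reduce the input over all non-channel axes to get the per-channel mean
    do_create_mean(topology, "bn_mean", channel_shape, type, "bn_input", input_shape);

    // 2. Reduce squared differences against that mean to get the per-channel variance
    do_create_variance(
        topology, "bn_variance", channel_shape, type, "bn_input", input_shape, "bn_mean");

    // 3. Normalize, then scale by gamma and shift by beta in one fused kernel
    do_batch_norm_operation(topology,
                            "bn_output",
                            input_shape,
                            type,
                            eps,
                            "bn_input",
                            input_shape,
                            "bn_gamma",
                            channel_shape,
                            "bn_beta",
                            "bn_mean",
                            "bn_variance");
}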