Commit da352aa1 authored by shssf, committed by Scott Cyphers

IntelGPU backend: BatchNormBackprop operation (#1443)

* IntelGPU backend: BatchNormBackprop operation

* PR1443. Requested refactoring done
parent 40ddf45a
......@@ -767,6 +767,45 @@ bool runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
pad_below,
pad_interior);
}
else if ("BatchNormBackprop" == op->description())
{
arguments_check(op, 6, 3);
const shared_ptr<op::BatchNormBackprop> batch_norm =
static_pointer_cast<op::BatchNormBackprop>(op);
const double eps = batch_norm->get_eps_value();
do_create_mean(topology,
get_output_name(op, 2), // d_beta
get_output_type(op, 2),
get_input_name(op, 5), // delta
get_input_shape(op, 5),
true);
do_create_variance_back(topology,
get_output_name(op, 1), // d_gamma
get_output_type(op, 1),
eps,
get_input_name(op, 2), // input
get_input_shape(op, 2),
get_input_name(op, 3), // gamma
get_input_name(op, 4), // beta
get_input_name(op, 5)); // delta
do_batch_norm_backprop_operation(topology,
get_input_shape(op, 2),
get_input_type(op, 2),
get_input_name(op, 0),
get_input_name(op, 1),
get_input_name(op, 2),
get_input_name(op, 3),
get_input_name(op, 4),
get_input_name(op, 5),
eps,
get_output_name(op, 0),
get_output_name(op, 1),
get_output_name(op, 2));
}
else if ("BatchNorm" == op->description())
{
const shared_ptr<op::BatchNorm> batch_norm = static_pointer_cast<op::BatchNorm>(op);
......@@ -788,14 +827,13 @@ bool runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
do_create_mean(topology,
mean_name,
get_input_shape(op),
get_output_type(op),
get_input_name(op, 2),
get_input_shape(op, 2));
get_input_shape(op, 2),
false);
do_create_variance(topology,
variance_name,
get_input_shape(op),
get_output_type(op),
get_input_name(op, 2),
get_input_shape(op, 2),
......@@ -814,13 +852,11 @@ bool runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
do_batch_norm_operation(topology,
get_output_name(op),
get_output_shape(op),
get_output_type(op),
eps,
get_input_name(op, 2),
get_input_shape(op, 2),
get_input_name(op, 0),
get_input_shape(op, 0),
get_input_name(op, 1),
mean_name,
variance_name);
......
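For reference, the BatchNormBackprop dispatch above splits the gradient computation across the three helper kernels, which together implement the standard batch-norm backprop formulas (a sketch; the sums run over every axis except the channel axis c, and N is the element count per channel, i.e. shape_size(input_shape) / channels):

$$\hat{x} = \frac{x - \mu_c}{\sqrt{\sigma_c^2 + \epsilon}},\qquad \frac{\partial L}{\partial \beta_c} = \sum \delta,\qquad \frac{\partial L}{\partial \gamma_c} = \sum \hat{x}\,\delta,\qquad \frac{\partial L}{\partial x} = \frac{\gamma_c}{\sqrt{\sigma_c^2 + \epsilon}}\Bigl(\delta - \frac{\hat{x}\,\partial L/\partial \gamma_c + \partial L/\partial \beta_c}{N}\Bigr)$$

do_create_mean with backward == true produces d_beta, do_create_variance_back produces d_gamma, and do_batch_norm_backprop_operation combines both into d_input.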
......@@ -30,29 +30,39 @@
using namespace std;
using namespace ngraph;
// According to the documentation, the input data channel is always axis 1,
// i.e. the second dimension from the left. Example: {0, 1, 0, 0} or {0, 1}
// Also, the input data must be at least a 2D array
static const size_t channel_axis = 1;
static Shape get_channel_shape(const Shape& shape, const string& function_name)
{
if (shape.size() < channel_axis + 1)
{
const string err = "intelgpu::" + function_name + "() input_shape" +
runtime::intelgpu::array_dims(shape) + " should be at least " +
to_string(channel_axis + 1) + "D.";
throw invalid_argument(err);
}
return {shape.at(channel_axis)};
}
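A minimal illustration of the new helper (a hypothetical call, not part of the diff):

    // channel_axis is 1, so a 4D input of shape {4, 3, 2, 8} has 3 channels
    const Shape ch = get_channel_shape(Shape{4, 3, 2, 8}, "example"); // ch == Shape{3}
    // anything shorter than 2D throws invalid_argument via the check above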
void runtime::intelgpu::do_create_mean(cldnn::topology& topology,
const string& output_name,
const Shape& output_shape,
const element::Type& output_type,
const string& input_name,
const Shape& input_shape)
const Shape& input_shape,
bool backward)
{
if (input_shape.size() < 2 || input_shape.size() > 4)
{
throw invalid_argument("intelgpu::do_create_mean_variance() wrong input shapes.");
}
// According to the documentation, input data channel is always being axis 1
// Assumed the second dimension from the left. Example {0, 1, 0, 0} or {0, 1}
// Also, input data must be at least 2D array
const size_t channel_axis = 1;
const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
const Shape channel_shape = get_channel_shape(input_shape, "create_mean");
const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, channel_shape);
const string entry_point_name = "create_mean_" + output_name;
const size_t output_counts = shape_size<Shape>(input_shape) / input_shape.at(channel_axis);
codegen::CodeWriter writer;
writer << "__kernel void " << entry_point_name << "( const __global float input"
<< array_dims(input_shape) << ", __global float output" << array_dims(output_shape)
<< array_dims(input_shape) << ", __global float output" << array_dims(channel_shape)
<< ")\n";
writer.block_begin();
......@@ -89,7 +99,12 @@ void runtime::intelgpu::do_create_mean(cldnn::topology& topology,
}
++var_idx;
}
writer << "output[i" << channel_axis << "] = sum / " << output_counts << ";\n";
writer << "output[i" << channel_axis << "] = sum";
if (!backward)
{
writer << " / " << output_counts;
}
writer << ";\n";
} // Closing brackets for Channel axis loop
writer.block_end();
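The new backward flag lets one generator emit both reductions: with backward == false the accumulated sum is divided by output_counts and the kernel yields the per-channel mean, while with backward == true the division is skipped and the kernel yields the plain per-channel sum of delta, which is exactly the beta gradient:

$$\text{forward: } \mu_c = \frac{1}{N}\sum x \qquad\qquad \text{backward: } \frac{\partial L}{\partial \beta_c} = \sum \delta$$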
......@@ -110,29 +125,20 @@ void runtime::intelgpu::do_create_mean(cldnn::topology& topology,
void runtime::intelgpu::do_create_variance(cldnn::topology& topology,
const string& output_name,
const Shape& output_shape,
const element::Type& output_type,
const string& input_name,
const Shape& input_shape,
const std::string& mean_name)
{
if (input_shape.size() < 2 || input_shape.size() > 4)
{
throw invalid_argument("intelgpu::do_create_mean_variance() wrong input shapes.");
}
// According to the documentation, input data channel is always being axis 1
// Assumed the second dimension from the left. Example {0, 1, 0, 0} or {0, 1}
// Also, input data must be at least 2D array
const size_t channel_axis = 1;
const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
const Shape channel_shape = get_channel_shape(input_shape, "create_variance");
const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, channel_shape);
const string entry_point_name = "create_variance_" + output_name;
const size_t output_counts = shape_size<Shape>(input_shape) / input_shape.at(channel_axis);
codegen::CodeWriter writer;
writer << "__kernel void " << entry_point_name << "( const __global float input"
<< array_dims(input_shape) << ", const __global float mean" << array_dims(output_shape)
<< ", __global float output" << array_dims(output_shape) << ")\n";
<< array_dims(input_shape) << ", const __global float mean" << array_dims(channel_shape)
<< ", __global float output" << array_dims(channel_shape) << ")\n";
writer.block_begin();
{ // Main function body
......@@ -193,52 +199,42 @@ void runtime::intelgpu::do_create_variance(cldnn::topology& topology,
void runtime::intelgpu::do_batch_norm_operation(cldnn::topology& topology,
const string& output_name,
const Shape& output_shape,
const element::Type& output_type,
double eps,
const string& input_name,
const Shape& input_shape,
const string& gamma_name,
const Shape& gamma_shape,
const string& beta_name,
const string& mean_name_inp,
const string& variance_name_inp)
{
if (input_shape.size() < 2 || input_shape.size() > 4)
{
throw invalid_argument("intelgpu::do_batch_norm_operation() wrong input shapes.");
}
// According to the documentation, input data channel is always being axis 1
// Assumed the second dimension from the left. Example {0, 1, 0, 0} or {0, 1}
// Also, input data must be at least 2D array
const size_t channel_axis = 1;
const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
const Shape channel_shape = get_channel_shape(input_shape, "batch_norm");
const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, input_shape);
const string entry_point_name = "batch_norm_" + output_name;
codegen::CodeWriter writer;
vector<size_t> gws;
writer << "__kernel void " << entry_point_name << "( const __global float input"
<< array_dims(input_shape) << ", const __global float gamma" << array_dims(gamma_shape)
<< ", const __global float beta" << array_dims(gamma_shape)
<< ", const __global float mean" << array_dims(gamma_shape)
<< ", const __global float variance" << array_dims(gamma_shape)
<< ", __global float output" << array_dims(output_shape) << ")\n";
writer << "__kernel void " << entry_point_name << "(const __global float input"
<< array_dims(input_shape) << ", const __global float gamma" << array_dims(channel_shape)
<< ", const __global float beta" << array_dims(channel_shape)
<< ", const __global float mean" << array_dims(channel_shape)
<< ", const __global float variance" << array_dims(channel_shape)
<< ", __global float output" << array_dims(input_shape) << ")\n";
writer.block_begin();
{ // Main function body
gws = generate_loops(writer, output_shape, true);
gws = generate_loops(writer, input_shape, true);
writer << "float normalized = (input" << access_dims(input_shape) << " - mean[i"
<< channel_axis << "]) / ("
<< "sqrt(variance[i" << channel_axis << "] + " << eps << ")"
<< ");\n";
writer << "output" << access_dims(output_shape) << " = normalized * gamma[i" << channel_axis
writer << "output" << access_dims(input_shape) << " = normalized * gamma[i" << channel_axis
<< "] + beta[i" << channel_axis << "];\n";
generate_loops(writer, output_shape, false);
generate_loops(writer, input_shape, false);
} // Main function body
writer.block_end();
......@@ -255,3 +251,154 @@ void runtime::intelgpu::do_batch_norm_operation(cldnn::topology& topology,
gws);
topology.add(op_batch_norm);
}
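Per output element, the generated forward kernel applies the usual batch-norm transform:

$$y = \gamma_c\,\frac{x - \mu_c}{\sqrt{\sigma_c^2 + \epsilon}} + \beta_c$$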
void runtime::intelgpu::do_create_variance_back(cldnn::topology& topology,
const string& output_name,
const element::Type& output_type,
double eps,
const string& input_name,
const Shape& input_shape,
const string& mean_name,
const string& variance_name,
const string& delta_name)
{
const Shape channel_shape = get_channel_shape(input_shape, "create_variance_back");
const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, channel_shape);
const string entry_point_name = "create_variance_back_" + output_name;
codegen::CodeWriter writer;
writer << "__kernel void " << entry_point_name << "(const __global float input"
<< array_dims(input_shape) << ", const __global float delta" << array_dims(input_shape)
<< ", const __global float mean" << array_dims(channel_shape)
<< ", const __global float variance" << array_dims(channel_shape)
<< ", __global float output" << array_dims(channel_shape) << ")\n";
writer.block_begin();
{ // Main function body
// Loop for Channel axis 1
writer << "for (uint i" << channel_axis << " = 0; i" << channel_axis << " < "
<< input_shape.at(channel_axis) << "; ++i" << channel_axis << ")\n";
writer.block_begin();
{
writer << "float sum = 0.0f;\n";
size_t var_idx = 0;
// Main loops
for (auto const& i : input_shape)
{
if (var_idx != channel_axis)
{
writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << i
<< "; ++i" << var_idx << ")\n";
writer.block_begin();
}
++var_idx;
}
writer << "float normalized = (input" << access_dims(input_shape) << " - mean[i"
<< channel_axis << "]) / ("
<< "sqrt(variance[i" << channel_axis << "] + " << eps << ")"
<< ");\n";
writer << "sum += normalized * delta" << access_dims(input_shape) << ";\n";
var_idx = 0;
// Closing brackets for main loops
for (auto const& i : input_shape)
{
if (var_idx != channel_axis)
{
writer.block_end();
}
++var_idx;
}
writer << "output[i" << channel_axis << "] = sum;\n";
} // Closing brackets for Channel axis loop
writer.block_end();
} // Main function body
writer.block_end();
const vector<cldnn::primitive_id>& inputs = {input_name, delta_name, mean_name, variance_name};
const cldnn::custom_gpu_primitive op_create_variance_back(output_name,
inputs,
{writer.get_code()},
entry_point_name,
get_kernel_args(4, 1),
"",
layout,
{1});
topology.add(op_create_variance_back);
}
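Despite its name, do_create_variance_back does not compute a variance: the kernel it generates accumulates the normalized input times delta over all non-channel axes, which is the gamma gradient:

$$\frac{\partial L}{\partial \gamma_c} = \sum \frac{x - \mu_c}{\sqrt{\sigma_c^2 + \epsilon}}\,\delta$$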
void runtime::intelgpu::do_batch_norm_backprop_operation(cldnn::topology& topology,
const Shape& shape,
const element::Type& type,
const string& gamma_name,
const string& beta_name,
const string& input_name,
const string& mean_name,
const string& variance_name,
const string& delta_name,
double eps,
const string& output_name,
const string& output_gamma_name,
const string& output_beta_name)
{
const Shape channel_shape = get_channel_shape(shape, "batch_norm_backprop");
const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(type, shape);
const string entry_point_name = "batch_norm_backprop_" + output_name;
const size_t r_axes_size = shape_size(shape) / shape_size(channel_shape);
codegen::CodeWriter writer;
vector<size_t> gws;
writer << "__kernel void " << entry_point_name << "(const __global float input"
<< array_dims(shape) << ", const __global float delta" << array_dims(shape)
<< ", const __global float mean" << array_dims(channel_shape)
<< ", const __global float variance" << array_dims(channel_shape)
<< ", const __global float gamma" << array_dims(channel_shape)
<< ", const __global float gamma_backprop" << array_dims(channel_shape)
<< ", const __global float beta_backprop" << array_dims(channel_shape)
<< ", __global float output" << array_dims(shape) << ")\n";
writer.block_begin();
{ // Main function body
// Main loops
gws = runtime::intelgpu::generate_loops(writer, shape, true);
writer << "float stddev = sqrt(variance[i" << channel_axis << "] + " << eps << ");\n";
writer << "float xhat = (input" << access_dims(shape) << " - mean[i" << channel_axis
<< "]) / stddev;\n";
writer << "float norma = gamma[i" << channel_axis << "] / stddev;\n";
writer << "output" << access_dims(shape) << " = norma * (delta" << access_dims(shape)
<< " - (xhat * gamma_backprop[i" << channel_axis << "] + beta_backprop[i"
<< channel_axis << "]) / " << r_axes_size << ");\n";
// Closing brackets for main loops
runtime::intelgpu::generate_loops(writer, shape, false);
} // Main function body
writer.block_end();
const vector<cldnn::primitive_id>& inputs = {input_name,
delta_name,
mean_name,
variance_name,
gamma_name,
output_gamma_name,
output_beta_name};
const cldnn::custom_gpu_primitive op_batch_norm_backprop(output_name,
inputs,
{writer.get_code()},
entry_point_name,
get_kernel_args(7, 1),
"",
layout,
gws);
topology.add(op_batch_norm_backprop);
}
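To make the generator above concrete, this is roughly the kernel text it emits for a hypothetical 2D input of shape {2, 3} with eps = 0.001 (the out0 suffix stands in for the real output primitive id, and r_axes_size is 6 / 3 = 2):

    /* sketch of generated code; the output name and shapes are illustrative */
    __kernel void batch_norm_backprop_out0(const __global float input[2][3],
                                           const __global float delta[2][3],
                                           const __global float mean[3],
                                           const __global float variance[3],
                                           const __global float gamma[3],
                                           const __global float gamma_backprop[3],
                                           const __global float beta_backprop[3],
                                           __global float output[2][3])
    {
        const unsigned i0 = get_global_id(0); /*trip count 2*/
        const unsigned i1 = get_global_id(1); /*trip count 3*/
        float stddev = sqrt(variance[i1] + 0.001);
        float xhat = (input[i0][i1] - mean[i1]) / stddev;
        float norma = gamma[i1] / stddev;
        output[i0][i1] = norma * (delta[i0][i1]
                                  - (xhat * gamma_backprop[i1] + beta_backprop[i1]) / 2);
    }

The kernel is then registered as a cldnn::custom_gpu_primitive with seven inputs and a global work size of {2, 3}.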
......@@ -31,13 +31,11 @@ namespace ngraph
// nGraph uses channel-shaped tensors in this operation, but clDNN uses the full input data
void do_batch_norm_operation(cldnn::topology& topology,
const std::string& output_name,
const Shape& output_shape,
const element::Type& output_type,
double eps,
const std::string& input_name,
const Shape& input_shape,
const std::string& gamma_name,
const Shape& gamma_shape,
const std::string& beta_name,
const std::string& mean_name,
const std::string& variance_name);
......@@ -45,19 +43,46 @@ namespace ngraph
// This creates mean of the input matrix by Channel axis
void do_create_mean(cldnn::topology& topology,
const std::string& output_name,
const Shape& output_shape,
const element::Type& output_type,
const std::string& input_name,
const Shape& input_shape);
const Shape& input_shape,
bool backward);
// This creates mean of the input matrix by Channel axis
// This creates variance of the input matrix by Channel axis
void do_create_variance(cldnn::topology& topology,
const std::string& output_name,
const Shape& output_shape,
const element::Type& output_type,
const std::string& input_name,
const Shape& input_shape,
const std::string& mean_name);
// This creates variance backprop of the input matrix by Channel axis
void do_create_variance_back(cldnn::topology& topology,
const std::string& output_name,
const element::Type& output_type,
double eps,
const std::string& input_name,
const Shape& input_shape,
const std::string& mean_name,
const std::string& variance_name,
const std::string& delta_name);
// This function uses the "shape" parameter as the input and output Shape,
// while the shapes of all other tensors are derived from its channel axis
// Example: output[ 4, 3, 2, 8 ] means out_gamma[ 3 ]
void do_batch_norm_backprop_operation(cldnn::topology& topology,
const Shape& shape,
const element::Type& type,
const std::string& gamma_name,
const std::string& beta_name,
const std::string& input_name,
const std::string& mean_name,
const std::string& variance_name,
const std::string& delta_name,
double eps,
const std::string& output_name,
const std::string& output_gamma_name,
const std::string& output_beta_name);
}
}
}
......@@ -128,7 +128,8 @@ vector<size_t> runtime::intelgpu::generate_loops(codegen::CodeWriter& writer,
{
if (is_begin)
{
writer << "const unsigned i" << var_idx << " = get_global_id(" << var_idx << ");\n";
writer << "const unsigned i" << var_idx << " = get_global_id(" << var_idx
<< "); /*trip count " << i << "*/\n";
gws.push_back(i);
}
}
......
......@@ -28,7 +28,7 @@ backwards_sigmoid
backwards_sign
backwards_slice
backwards_tan
batchnorm_bprop_n4c3h2w2
backwards_tanh
batch_norm_one_output
batch_norm_three_outputs
broadcast_vector_rowwise_int64
......@@ -108,7 +108,6 @@ zero_sized_maximum
zero_sized_minimum
zero_sized_multiply
zero_sized_negative
zero_sized_not
zero_sized_not_equal
zero_sized_power
zero_sized_sign
......