submodule / ngraph / Commits / f8632ea0

Commit f8632ea0, authored Feb 23, 2019 by Sergey Shalnov, committed Feb 23, 2019 by Sang Ik Lee.
IntelGPU backend: Max and Avg pool fix (#2482)
Parent: fd0ed37c
Showing 4 changed files with 267 additions and 35 deletions:

  src/ngraph/runtime/intelgpu/intelgpu_backend.cpp            +71  -17
  src/ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.cpp  +183 -12
  src/ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp  +13  -0
  src/ngraph/runtime/intelgpu/unit_test.manifest              +0   -6
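The fix routes pooling ops to a generated OpenCL kernel whenever they fall outside what the clDNN pooling primitive supports. As background, the pooling semantics both paths must agree on can be sketched in scalar C++ (a hypothetical single-channel 2D reference, not code from this commit; it assumes every window overlaps the input so the average divisor is never zero):

#include <algorithm>
#include <cfloat>
#include <cstddef>
#include <vector>

// Hypothetical scalar reference: one channel, 2D, symmetric window/stride/
// padding. Out-of-bounds window positions are skipped, and for average
// pooling the divisor is the number of elements actually visited -- the
// same "element_count" idea the generated kernel below uses.
std::vector<std::vector<float>> pool2d(const std::vector<std::vector<float>>& in,
                                       std::size_t win,
                                       std::size_t stride,
                                       std::size_t pad,
                                       bool is_max)
{
    const std::size_t in_h = in.size();
    const std::size_t in_w = in[0].size();
    // Simplified output-shape formula; real shape inference also handles
    // distinct padding-above/-below and uneven cases.
    const std::size_t out_h = (in_h + 2 * pad - win) / stride + 1;
    const std::size_t out_w = (in_w + 2 * pad - win) / stride + 1;
    std::vector<std::vector<float>> out(out_h, std::vector<float>(out_w, 0.0f));
    for (std::size_t oy = 0; oy < out_h; ++oy)
    {
        for (std::size_t ox = 0; ox < out_w; ++ox)
        {
            float acc = is_max ? -FLT_MAX : 0.0f;
            std::size_t count = 0;
            for (std::size_t fy = 0; fy < win; ++fy)
            {
                for (std::size_t fx = 0; fx < win; ++fx)
                {
                    const long y = static_cast<long>(oy * stride + fy) - static_cast<long>(pad);
                    const long x = static_cast<long>(ox * stride + fx) - static_cast<long>(pad);
                    if (y < 0 || y >= static_cast<long>(in_h) || x < 0 ||
                        x >= static_cast<long>(in_w))
                    {
                        continue; // padded position: excluded from the reduction here
                    }
                    acc = is_max ? std::max(acc, in[y][x]) : acc + in[y][x];
                    ++count;
                }
            }
            out[oy][ox] = is_max ? acc : acc / count;
        }
    }
    return out;
}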
src/ngraph/runtime/intelgpu/intelgpu_backend.cpp
...
@@ -765,14 +765,40 @@ shared_ptr<runtime::Executable>
         }
         case OP_TYPEID::MaxPool:
         {
             arguments_check(op, 1, 1);
 
             const shared_ptr<op::MaxPool> max_pool = static_pointer_cast<op::MaxPool>(op);
 
-            do_pooling_operation(topology,
-                                 op,
-                                 max_pool->get_window_shape(),
-                                 max_pool->get_window_movement_strides(),
-                                 max_pool->get_padding_below(),
-                                 cldnn::pooling_mode::max);
+            if ((get_input_shape(op).size() > 4) || (get_output_type(op) != element::f32) ||
+                !max_pool->get_padding_below().empty() ||
+                !max_pool->get_padding_above().empty())
+            {
+                const shared_ptr<Node> def_val = max_pool->get_default_value();
+                const shared_ptr<op::Constant> def_const =
+                    static_pointer_cast<op::Constant>(def_val);
+                const vector<std::string>& values = def_const->get_value_strings();
+
+                do_max_avg_pool_operation(topology,
+                                          get_input_name(op),
+                                          get_input_shape(op),
+                                          get_output_name(op),
+                                          get_output_shape(op),
+                                          get_output_type(op),
+                                          max_pool->get_window_shape(),
+                                          max_pool->get_window_movement_strides(),
+                                          max_pool->get_padding_below(),
+                                          false,
+                                          values.at(0),
+                                          true);
+            }
+            else
+            {
+                do_pooling_operation(topology,
+                                     op,
+                                     max_pool->get_window_shape(),
+                                     max_pool->get_window_movement_strides(),
+                                     max_pool->get_padding_below(),
+                                     cldnn::pooling_mode::max);
+            }
             break;
         }
         case OP_TYPEID::MaxPoolBackprop:
...
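The new predicate sends MaxPool to the generated kernel whenever clDNN's pooling primitive cannot express the op. A standalone restatement of the dispatch condition (a hypothetical helper for illustration, not code from the commit):

// Hypothetical helper: true means the op is lowered to the generated
// OpenCL kernel instead of the cldnn::pooling primitive.
static bool needs_custom_pool_kernel(const Shape& input_shape,
                                     const element::Type& output_type,
                                     const Shape& pad_below,
                                     const Shape& pad_above)
{
    return (input_shape.size() > 4)         // clDNN pooling handles at most 4D here
           || (output_type != element::f32) // or only f32 element types
           || !pad_below.empty()            // any explicit below-padding
           || !pad_above.empty();           // or above-padding
}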
@@ -804,17 +830,45 @@ shared_ptr<runtime::Executable>
         }
         case OP_TYPEID::AvgPool:
         {
             arguments_check(op, 1, 1);
 
             const shared_ptr<op::AvgPool> avg_pool = static_pointer_cast<op::AvgPool>(op);
 
-            const cldnn::pooling_mode mode = avg_pool->get_include_padding_in_avg_computation()
-                                                 ? cldnn::pooling_mode::average
-                                                 : cldnn::pooling_mode::average_no_padding;
-
-            do_pooling_operation(topology,
-                                 op,
-                                 avg_pool->get_window_shape(),
-                                 avg_pool->get_window_movement_strides(),
-                                 avg_pool->get_padding_below(),
-                                 mode);
+            if ((get_input_shape(op).size() > 4) || (get_output_type(op) != element::f32) ||
+                avg_pool->get_include_padding_in_avg_computation() ||
+                !avg_pool->get_padding_below().empty() ||
+                !avg_pool->get_padding_above().empty())
+            {
+                const shared_ptr<Node> def_val = avg_pool->get_default_value();
+                const shared_ptr<op::Constant> def_const =
+                    static_pointer_cast<op::Constant>(def_val);
+                const vector<std::string>& values = def_const->get_value_strings();
+
+                do_max_avg_pool_operation(topology,
+                                          get_input_name(op),
+                                          get_input_shape(op),
+                                          get_output_name(op),
+                                          get_output_shape(op),
+                                          get_output_type(op),
+                                          avg_pool->get_window_shape(),
+                                          avg_pool->get_window_movement_strides(),
+                                          avg_pool->get_padding_below(),
+                                          avg_pool->get_include_padding_in_avg_computation(),
+                                          values.at(0),
+                                          false);
+            }
+            else
+            {
+                const cldnn::pooling_mode mode =
+                    avg_pool->get_include_padding_in_avg_computation()
+                        ? cldnn::pooling_mode::average
+                        : cldnn::pooling_mode::average_no_padding;
+
+                do_pooling_operation(topology,
+                                     op,
+                                     avg_pool->get_window_shape(),
+                                     avg_pool->get_window_movement_strides(),
+                                     avg_pool->get_padding_below(),
+                                     mode);
+            }
             break;
         }
         case OP_TYPEID::AvgPoolBackprop:
...
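AvgPool additionally falls back whenever padding must be included in the average, since including padding changes the divisor. A small self-contained illustration of the two divisors (invented numbers):

#include <iostream>

int main()
{
    // Invented 1D example: a 3-wide window covering one padded position
    // (default value 0) plus the real inputs {4.0f, 8.0f}.
    const float sum_real = 4.0f + 8.0f;
    const float pad_default = 0.0f;

    // pooling_mode::average_no_padding: divide by real elements only.
    std::cout << "exclude padding: " << sum_real / 2 << "\n"; // 6
    // include_padding_in_avg_computation: padded slots contribute the
    // default value and count toward the divisor (the kernel's element_count).
    std::cout << "include padding: " << (sum_real + pad_default) / 3 << "\n"; // 4
}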
@@ -825,8 +879,8 @@ shared_ptr<runtime::Executable>
                 static_pointer_cast<op::AvgPoolBackprop>(op);
 
             do_avg_pool_backprop_operation(topology,
-                                           get_input_name(op, 0),
-                                           get_input_shape(op, 0),
+                                           get_input_name(op),
+                                           get_input_shape(op),
                                            get_output_name(op),
                                            get_output_shape(op),
                                            get_output_type(op),
...
src/ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.cpp
...
@@ -471,26 +471,31 @@ void runtime::intelgpu::do_max_pool_backprop_operation(cldnn::topology& topology
                                                        const Shape& pad_below)
 {
     const string entry_point_name = "op_max_pool_backprop_" + output_name;
+    const string type_name = get_opencl_type_name(output_type);
     const Shape delta_data(delta_shape.cbegin() + 2, delta_shape.cend());
     const Shape output_data(output_shape.cbegin() + 2, output_shape.cend());
     codegen::CodeWriter writer;
     vector<size_t> gws;
 
     // The kernel name and parameters
-    gen_func_def(writer, entry_point_name, {2, "float"}, {input_shape, delta_shape},
-                 "float", output_shape);
+    gen_func_def(writer, entry_point_name, {2, type_name}, {input_shape, delta_shape},
+                 type_name, output_shape);
 
     writer.block_begin();
     {
         // Main loop over delta input array
         writer << "const uint i0 = get_global_id(0);";
         gws.push_back(delta_shape.at(0));
-        writer << "// for (uint i0 = 0; i0 < " << delta_shape.at(0) << "; ++i0)\n";
+        writer << "/*trip count " << delta_shape.at(0) << "*/\n";
         writer.block_begin();
         {
             writer << "const uint i1 = get_global_id(1);";
             gws.push_back(delta_shape.at(1));
-            writer << "// for (uint i1 = 0; i1 < " << delta_shape.at(1) << "; ++i1)\n";
+            writer << "/*trip count " << delta_shape.at(1) << "*/\n";
             writer.block_begin();
             {
                 // Initialization output
...
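For readers new to this file: the codegen::CodeWriter pattern above interleaves emitted OpenCL text with block_begin()/block_end() calls that print braces and track nesting. A minimal stand-in (hypothetical, not nGraph's actual implementation) shows the idea:

#include <cstddef>
#include <iostream>
#include <sstream>
#include <string>

// Minimal stand-in for codegen::CodeWriter, illustrative only: streams
// arbitrary text and emits indented braces for nested blocks.
class MiniWriter
{
public:
    void block_begin()
    {
        stream_ << indent() << "{\n";
        ++depth_;
    }
    void block_end()
    {
        --depth_;
        stream_ << indent() << "}\n";
    }
    template <typename T>
    MiniWriter& operator<<(const T& v)
    {
        stream_ << v;
        return *this;
    }
    std::string get_code() const { return stream_.str(); }

private:
    std::string indent() const { return std::string(4 * depth_, ' '); }
    std::ostringstream stream_;
    std::size_t depth_ = 0;
};

int main()
{
    MiniWriter writer;
    writer << "if (element_count)\n";
    writer.block_begin();
    writer << "    output = accumulator;\n";
    writer.block_end();
    std::cout << writer.get_code(); // prints the brace-wrapped snippet
}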
@@ -509,7 +514,7 @@ void runtime::intelgpu::do_max_pool_backprop_operation(cldnn::topology& topology
                 {
                     writer << "[j" << i << "]";
                 }
-                writer << " = 0.0f;\n";
+                writer << " = 0;\n";
 
                 // Closing brackets for Initialization loop
                 for (auto const& i : output_data)
...
@@ -533,7 +538,9 @@ void runtime::intelgpu::do_max_pool_backprop_operation(cldnn::topology& topology
                 {
                     writer << "uint save_i" << i + 2 << " = 0;\n";
                 }
-                writer << "float max_elem = FLT_MIN;\n"
+                writer << type_name << " max_elem = "
+                       << get_opencl_type_min_max_value(output_type, true) << ";\n"
                        << "uint elem_exists = 0;\n";
 
                 // Loop over window shape
...
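Besides generalizing the element type, this hunk fixes a latent seed bug: FLT_MIN is the smallest positive normal float, not the most negative float, so seeding a max-reduction with it silently clamps all-negative windows. A minimal standalone demonstration:

#include <algorithm>
#include <cfloat>
#include <initializer_list>
#include <iostream>

int main()
{
    float bad = FLT_MIN;   // smallest positive normal float (~1.18e-38)
    float good = -FLT_MAX; // truly "less than every float"
    for (float v : {-3.0f, -1.5f, -2.0f})
    {
        bad = std::max(bad, v);
        good = std::max(good, v);
    }
    std::cout << bad << "\n";  // 1.17549e-38: wrong, no input is positive
    std::cout << good << "\n"; // -1.5: the actual maximum
}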
@@ -541,7 +548,7 @@ void runtime::intelgpu::do_max_pool_backprop_operation(cldnn::topology& topology
                 gen_window_loop(writer, output_shape, win_shape, win_stride, pad_below, true);
 
                 {
-                    writer << "const float max_local = input0[i0][i1]";
+                    writer << "const " << type_name << " max_local = input0[i0][i1]";
 
                     // additional dimensions for input
                     for (size_t i = 0; i < win_shape.size(); ++i)
                     {
...
@@ -606,6 +613,169 @@ void runtime::intelgpu::do_max_pool_backprop_operation(cldnn::topology& topology
     topology.add(op_max_pool_backprop);
 }
 
+void runtime::intelgpu::do_max_avg_pool_operation(cldnn::topology& topology,
+                                                  const string& input_name,
+                                                  const Shape& input_shape,
+                                                  const string& output_name,
+                                                  const Shape& output_shape,
+                                                  const element::Type& output_type,
+                                                  const Shape& win_shape,
+                                                  const Shape& win_stride,
+                                                  const Shape& pad_below,
+                                                  bool include_padding,
+                                                  const string& def_val,
+                                                  bool is_max_pool)
+{
+    const string entry_point_name = "op_pool_" + to_string(is_max_pool) + "_" + output_name;
+    const string type_name = get_opencl_type_name(output_type);
+    const string init_accumulator = is_max_pool ? "-FLT_MAX" : def_val;
+    codegen::CodeWriter writer;
+    vector<size_t> gws;
+    const Shape input_data(input_shape.cbegin() + 2, input_shape.cend());
+    const Shape output_data(output_shape.cbegin() + 2, output_shape.cend());
+
+    // The kernel name and parameters
+    gen_func_def(writer, entry_point_name, {type_name}, {input_shape}, type_name, output_shape);
+
+    writer.block_begin();
+    { // Main function body
+        writer << "//Window:" << win_shape << " Stride: " << win_stride << "\n"
+               << "//padding included:" << include_padding << "\n"
+               << "//init value:" << def_val << "\n\n";
+
+        writer << "const uint N_dim = get_global_id(0);/*trip count " << input_shape.at(0)
+               << "*/\n";
+        gws.push_back(output_shape.at(0));
+        writer << "const uint C_dim = get_global_id(1);/*trip count " << input_shape.at(1)
+               << "*/\n";
+        gws.push_back(output_shape.at(1));
+
+        // Loops over output dimensions
+        size_t var_idx = 0;
+        for (auto i = output_data.begin(); i != output_data.end(); ++i)
+        {
+            writer << "for (uint i" << var_idx << " = 0; i" << var_idx << " < " << *i
+                   << "; ++i" << var_idx << ")\n";
+            writer.block_begin();
+            ++var_idx;
+        }
+
+        writer << type_name << " accumulator = " << init_accumulator << ";\n"
+               << "uint element_count = 0;\n\n";
+
+        // Loop over window
+        writer << "// Over window iterations\n";
+        var_idx = 0;
+        for (auto const i : win_shape)
+        {
+            writer << "for (uint f" << var_idx << " = 0; f" << var_idx << " < " << i
+                   << "; ++f" << var_idx << ")\n";
+            writer.block_begin();
+            writer << "uint input_idx" << var_idx << " = (i" << var_idx << " * "
+                   << win_stride.at(var_idx) << " /*win_stride*/"
+                   << ") + (f" << var_idx << ")"
+                   << " - " << pad_below.at(var_idx) << " /*pad_below*/;\n";
+            ++var_idx;
+        }
+
+        // Generate conditionals
+        writer << "if (";
+        var_idx = 0;
+        for (auto const& i : input_data)
+        {
+            if (var_idx)
+            {
+                writer << " && ";
+            }
+            writer << "(input_idx" << var_idx << " < " << i << ")";
+            ++var_idx;
+        }
+        writer << ")\n";
+        writer.block_begin();
+        { // Output element calculation
+            if (is_max_pool)
+            {
+                writer << "accumulator = max(accumulator, input0[N_dim][C_dim]"
+                       << access_dims(win_shape, "input_idx") << ");\n";
+            }
+            else
+            {
+                writer << "accumulator += input0[N_dim][C_dim]"
+                       << access_dims(win_shape, "input_idx") << ";\n";
+            }
+            writer << "++element_count;\n";
+        }
+        writer.block_end();
+
+        if (include_padding)
+        {
+            writer << "else\n";
+            writer.block_begin();
+            { // Output element calculation
+                writer << "accumulator += " << def_val << ";\n"
+                       << "++element_count;\n";
+            }
+            writer.block_end();
+        }
+        // End of conditional generation
+
+        // Closing brackets for window loop
+        for (auto const& i : win_shape)
+        {
+            writer.block_end();
+        }
+
+        writer << "\nif (element_count)\n";
+        writer.block_begin();
+        {
+            writer << "output[N_dim][C_dim]" << access_dims(output_data) << " = accumulator";
+            if (!is_max_pool)
+            {
+                writer << " / element_count";
+            }
+            writer << ";\n";
+        }
+        writer.block_end();
+
+        writer << "else\n";
+        writer.block_begin();
+        {
+            writer << "output[N_dim][C_dim]" << access_dims(output_data) << " = "
+                   << init_accumulator << ";\n";
+        }
+        writer.block_end();
+
+        // Closing brackets for output dimensions
+        for (const auto i : output_data)
+        {
+            writer.block_end();
+        }
+    } // Main function body
+    writer.block_end();
+
+    const cldnn::layout layout = IntelGPULayout::create_cldnn_layout(output_type, output_shape);
+    const cldnn::custom_gpu_primitive op_avg_pool(output_name,
+                                                  {input_name},
+                                                  {writer.get_code()},
+                                                  entry_point_name,
+                                                  get_kernel_args(1, 1),
+                                                  "",
+                                                  layout,
+                                                  gws);
+    topology.add(op_avg_pool);
+}
+
 void runtime::intelgpu::do_avg_pool_backprop_operation(cldnn::topology& topology,
                                                        const string& delta_name,
                                                        const Shape& delta_shape,
...
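To make the generator above concrete, here is roughly what do_max_avg_pool_operation would emit for a hypothetical f32 MaxPool with input Shape{1, 1, 4, 4}, window {2, 2}, stride {2, 2}, and no padding. The exact text depends on gen_func_def and access_dims, so treat this as an illustrative sketch rather than literal output:

// Illustrative approximation (not literal generator output) of the kernel
// emitted for entry point "op_pool_1_out0" under the assumptions above.
const char* approx_kernel = R"CL(
__kernel void op_pool_1_out0(const __global float input0[1][1][4][4],
                             __global float output[1][1][2][2])
{
    //Window:Shape{2, 2} Stride: Shape{2, 2}
    //padding included:0

    const uint N_dim = get_global_id(0);/*trip count 1*/
    const uint C_dim = get_global_id(1);/*trip count 1*/
    for (uint i0 = 0; i0 < 2; ++i0)
    {
        for (uint i1 = 0; i1 < 2; ++i1)
        {
            float accumulator = -FLT_MAX;
            uint element_count = 0;
            // Over window iterations
            for (uint f0 = 0; f0 < 2; ++f0)
            {
                for (uint f1 = 0; f1 < 2; ++f1)
                {
                    uint input_idx0 = (i0 * 2 /*win_stride*/) + (f0) - 0 /*pad_below*/;
                    uint input_idx1 = (i1 * 2 /*win_stride*/) + (f1) - 0 /*pad_below*/;
                    if ((input_idx0 < 4) && (input_idx1 < 4))
                    {
                        accumulator =
                            max(accumulator, input0[N_dim][C_dim][input_idx0][input_idx1]);
                        ++element_count;
                    }
                }
            }
            if (element_count)
            {
                output[N_dim][C_dim][i0][i1] = accumulator;
            }
            else
            {
                output[N_dim][C_dim][i0][i1] = -FLT_MAX;
            }
        }
    }
}
)CL";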
@@ -618,6 +788,7 @@ void runtime::intelgpu::do_avg_pool_backprop_operation(cldnn::topology& topology
                                                        const bool include_padding)
 {
     const string entry_point_name = "op_avg_pool_backprop_" + output_name;
+    const string type_name = get_opencl_type_name(output_type);
     codegen::CodeWriter writer;
     vector<size_t> gws;
...
@@ -627,22 +798,22 @@ void runtime::intelgpu::do_avg_pool_backprop_operation(cldnn::topology& topology
     size_t win_elems_size = shape_size<Shape>(win_shape);
 
     // The kernel name and parameters
-    gen_func_def(writer, entry_point_name, {"float"}, {delta_shape}, "float", output_shape);
+    gen_func_def(writer, entry_point_name, {type_name}, {delta_shape}, type_name, output_shape);
 
     writer.block_begin();
     {
         writer << "size_t win_elems_size = " << win_elems_size << ";\n";
-        writer << "float computed_val = 0.0f;\n";
+        writer << type_name << " computed_val = 0.0;\n";
 
         // Main loop over delta input array
         writer << "const uint i0 = get_global_id(0);";
         gws.push_back(delta_shape.at(0));
-        writer << "// for (uint i0 = 0; i0 < " << delta_shape.at(0) << "; ++i0)\n";
+        writer << "/*trip count " << delta_shape.at(0) << "*/\n";
         writer.block_begin();
         {
             writer << "const uint i1 = get_global_id(1);";
             gws.push_back(delta_shape.at(1));
-            writer << "// for (uint i1 = 0; i1 < " << delta_shape.at(1) << "; ++i1)\n";
+            writer << "/*trip count " << delta_shape.at(1) << "*/\n";
             writer.block_begin();
             {
                 // Initialization output
...
@@ -661,7 +832,7 @@ void runtime::intelgpu::do_avg_pool_backprop_operation(cldnn::topology& topology
                 {
                     writer << "[j" << i << "]";
                 }
-                writer << " = 0.0f;\n";
+                writer << " = 0;\n";
 
                 // Closing brackets for Initialization loop
                 for (auto const& i : output_data)
...
src/ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp
...
@@ -58,6 +58,19 @@ namespace ngraph
                                   const Shape& win_stride,
                                   const Shape& pad_below);
 
+        void do_max_avg_pool_operation(cldnn::topology& topology,
+                                       const std::string& input_name,
+                                       const Shape& input_shape,
+                                       const std::string& output_name,
+                                       const Shape& output_shape,
+                                       const element::Type& output_type,
+                                       const Shape& win_shape,
+                                       const Shape& win_stride,
+                                       const Shape& pad_below,
+                                       bool include_padding,
+                                       const std::string& def_val,
+                                       bool is_max_pool);
+
         void do_avg_pool_backprop_operation(cldnn::topology& topology,
                                             const std::string& delta_name,
                                             const Shape& delta_shape,
...
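A hedged sketch of a call to the new entry point, mirroring the MaxPool fallback site in intelgpu_backend.cpp (all names and shapes below are invented for illustration):

// Invented values, illustration only; in the backend these come from the
// op being compiled.
cldnn::topology topology;
runtime::intelgpu::do_max_avg_pool_operation(
    topology,
    "input0",             // input_name
    Shape{2, 3, 4, 4, 4}, // rank-5 input -> takes the custom kernel path
    "maxpool_out",        // output_name
    Shape{2, 3, 2, 2, 2},
    element::f32,
    Shape{2, 2, 2},       // win_shape
    Shape{2, 2, 2},       // win_stride
    Shape{0, 0, 0},       // pad_below
    false,                // include_padding
    "-FLT_MAX",           // def_val (effectively unused when is_max_pool is true)
    true);                // is_max_pool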
src/ngraph/runtime/intelgpu/unit_test.manifest
 all_2x2x3_eliminate_dims_0_1
-avg_pool_2d_2channel_2image_padded_only_above_do_not_include_in_computation
-avg_pool_2d_2channel_2image_padded_only_above_include_in_computation
-avg_pool_3d_uneven_strided_padded
 backwards_batch_norm_training
 backwards_dot_scalar_tensor
 backwards_dot_tensor_scalar
 backwards_dot_tensor_vector
-backwards_maxpool_n2_c1_hw5_3x3_str2_max
-backwards_maxpool_n4_c1_hw4_2x2_max
 backwards_replace_slice
 backwards_reverse_sequence_n3_c2_h3
 backwards_reverse_sequence_n4d2c3h2w2
...
@@ -18,7 +13,6 @@ embedding_lookup_10x1_arbitrary
 embedding_lookup_10x1_arbitrary_index_type_int
 embedding_lookup_4x5_reverse
 generate_mask
-max_pool_3d
 replace_slice_3d
 replace_slice_3d_strided
 replace_slice_3d_strided_different_strides
...