Commit 00b4453d authored by Jayaram Bobba, committed by Robert Kimball

IAT: More convolution folding optimizations (#1712)

* Check output shape when setting memory layout for slice op.

* Miscellaneous fusion and other optimizations for inception-resnetv2
- ConvBias Batchnorm folding
- ConvBias Affine folding
- Check if MKLDNN can slice a given layout and select layouts
  appropriately

* Fixed unit test and bug in conv bias pattern

* Addressed PR feedback

* Addressed PR feedback
parent c6bb0cf4
@@ -741,7 +741,6 @@ void ngraph::runtime::cpu::pass::CPUFusion::construct_batch_norm_relu()
auto beta_shape = Shape{2};
auto beta = std::make_shared<pattern::op::Label>(element::f32, beta_shape);
double eps = 0.001;
auto shape_r = Shape{1, 2, 2, 2};
auto bn = std::make_shared<op::BatchNorm>(eps, gamma, beta, input);
auto goe = std::make_shared<op::GetOutputElement>(bn, 0);
auto prelu = std::make_shared<op::Relu>(goe);
@@ -811,7 +810,6 @@ void ngraph::runtime::cpu::pass::CPUFusion::construct_batch_norm_relu_global_stats()
auto beta_shape = Shape{2};
auto beta = std::make_shared<pattern::op::Label>(element::f32, beta_shape);
double eps = 0.001;
auto shape_r = Shape{1, 2, 2, 2};
auto bn = std::make_shared<op::BatchNorm>(eps, gamma, beta, input, mean, var);
auto prelu = std::make_shared<op::Relu>(bn);
@@ -1403,3 +1401,210 @@ void ngraph::runtime::cpu::pass::CPUFusion::construct_bounded_relu()
auto m = std::make_shared<pattern::Matcher>(min, callback);
this->add_matcher(m);
}
void ngraph::runtime::cpu::pass::CPUFusion::construct_conv_bias_folded_batch_norm()
{
auto input = std::make_shared<pattern::op::Label>(element::f32, Shape{2, 2, 1, 1});
auto filters = std::make_shared<pattern::op::Label>(element::f32, Shape{2, 2, 1, 1});
auto bias = std::make_shared<pattern::op::Label>(element::f32, Shape{2});
auto pconv = std::make_shared<op::ConvolutionBias>(input,
filters,
bias,
Strides{1, 1},
Strides{1, 1},
CoordinateDiff{0, 0},
CoordinateDiff{0, 0},
Strides{1, 1});
auto mean = std::make_shared<pattern::op::Label>(element::f32, Shape{2});
auto var = std::make_shared<pattern::op::Label>(element::f32, Shape{2});
auto gamma = std::make_shared<pattern::op::Label>(element::f32, Shape{2});
auto beta = std::make_shared<pattern::op::Label>(element::f32, Shape{2});
double eps = 0.001;
auto bn = std::make_shared<op::BatchNorm>(eps, gamma, beta, pconv, mean, var);
ngraph::pattern::graph_rewrite_callback callback =
[input, filters, bias, mean, var, gamma, beta](pattern::Matcher& m) {
NGRAPH_DEBUG << "In callback for folded batch norm against node = "
<< m.get_match_root()->get_name();
auto pattern_map = m.get_pattern_map();
auto m_bn = std::dynamic_pointer_cast<op::BatchNorm>(m.get_match_root());
auto m_conv = std::dynamic_pointer_cast<op::ConvolutionBias>(m_bn->get_argument(2));
if (m_conv->get_users().size() > 1)
{
return false;
}
if (m_conv->get_shape().size() != 4)
{
return false;
}
// new weights = old weights * gamma / sqrt(variance + epsilon)
// new biases = (old_bias-mean) * gamma / sqrt(variance + epsilon) + beta
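// Folding is valid because convolution is linear in its filters: scaling an
// output channel's filters by gamma / sqrt(variance + epsilon) scales that
// channel of the convolution output by the same factor, so the BatchNorm
// affine transform can be absorbed into the ConvolutionBias weights and bias.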
auto bn_eps = op::Constant::create(element::f32, Shape{}, {m_bn->get_eps_value()});
auto var_eps = std::make_shared<op::Add>(
pattern_map[var],
std::make_shared<op::Broadcast>(bn_eps, pattern_map[var]->get_shape(), AxisSet{0}));
auto sqrt_var_eps = std::make_shared<op::Sqrt>(var_eps);
auto mean_gamma = std::make_shared<op::Multiply>(
std::make_shared<op::Subtract>(pattern_map[bias], pattern_map[mean]),
pattern_map[gamma]);
auto new_biases = std::make_shared<op::Add>(
pattern_map[beta], std::make_shared<op::Divide>(mean_gamma, sqrt_var_eps));
auto weight_scaling = std::make_shared<op::Divide>(pattern_map[gamma], sqrt_var_eps);
auto new_weights = std::make_shared<op::Multiply>(
pattern_map[filters],
std::make_shared<op::Broadcast>(
weight_scaling, pattern_map[filters]->get_shape(), AxisSet{1, 2, 3}));
auto conv_bias =
std::make_shared<op::ConvolutionBias>(pattern_map[input],
new_weights,
new_biases,
m_conv->get_window_movement_strides(),
m_conv->get_window_dilation_strides(),
m_conv->get_padding_below(),
m_conv->get_padding_above(),
m_conv->get_data_dilation_strides());
ngraph::replace_node(m.get_match_root(), conv_bias);
return true;
};
auto m = std::make_shared<ngraph::pattern::Matcher>(bn, callback);
this->add_matcher(m);
}
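The algebra behind the folding above can be sanity-checked in isolation. The following is a minimal, standalone C++ sketch (illustration only, not part of this commit; all shapes and values are made up) that verifies the per-channel identity for a 1x1 convolution with two input and two output channels:

#include <cassert>
#include <cmath>

int main()
{
    const float eps = 0.001f;
    float x[2] = {1.5f, -2.0f};                    // one pixel, 2 input channels
    float w[2][2] = {{0.3f, 0.7f}, {-0.4f, 0.9f}}; // w[oc][ic], 1x1 kernel
    float b[2] = {0.5f, -0.1f};                    // convolution bias
    float gamma[2] = {1.2f, 0.8f}, beta[2] = {-0.3f, 0.2f};
    float mean[2] = {0.6f, -0.4f}, var[2] = {2.0f, 1.5f};

    for (int oc = 0; oc < 2; ++oc)
    {
        // Reference: ConvolutionBias followed by BatchNorm (inference form).
        float conv = w[oc][0] * x[0] + w[oc][1] * x[1] + b[oc];
        float bn = gamma[oc] * (conv - mean[oc]) / std::sqrt(var[oc] + eps) + beta[oc];

        // Folded: the same ConvolutionBias with rescaled weights and bias.
        float scale = gamma[oc] / std::sqrt(var[oc] + eps);
        float folded = (w[oc][0] * scale) * x[0] + (w[oc][1] * scale) * x[1] +
                       ((b[oc] - mean[oc]) * scale + beta[oc]);
        assert(std::fabs(bn - folded) < 1e-4f);
    }
    return 0;
}

The same per-output-channel scaling carries over to larger kernels and spatial extents, which is why the pass broadcasts weight_scaling along AxisSet{1, 2, 3} of the filters.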
void ngraph::runtime::cpu::pass::CPUFusion::construct_conv_bias_affine_folding()
{
// Folds A * ConvBias(input, filters, bias) -> ConvBias(input, filters * A_c, bias * A_c)
Shape shape{2, 2, 1, 1};
auto input = std::make_shared<pattern::op::Label>(element::f32, shape);
auto filters = std::make_shared<pattern::op::Label>(element::f32, shape);
auto bias = std::make_shared<pattern::op::Label>(element::f32, Shape{2});
auto conv = std::make_shared<op::ConvolutionBias>(input,
filters,
bias,
Strides{1, 1},
Strides{1, 1},
CoordinateDiff{0, 0},
CoordinateDiff{0, 0},
Strides{1, 1});
auto conv_label = std::make_shared<pattern::op::Label>(conv, nullptr, NodeVector{conv});
auto Ac = std::make_shared<pattern::op::Label>(element::f32, Shape{2});
auto A = std::make_shared<op::Broadcast>(Ac, Shape{2, 2, 1, 1}, AxisSet{0, 2, 3});
auto A_label = std::make_shared<pattern::op::Label>(A, nullptr, NodeVector{A});
auto multiply = std::make_shared<op::Multiply>(conv_label, A_label);
ngraph::pattern::graph_rewrite_callback callback = [input, filters, bias, conv_label, A_label](
pattern::Matcher& m) {
NGRAPH_DEBUG << "In callback for conv affine folding against node = "
<< m.get_match_root()->get_name();
auto pattern_map = m.get_pattern_map();
auto conv_m = std::static_pointer_cast<op::ConvolutionBias>(pattern_map[conv_label]);
if (conv_m->get_users().size() > 1)
{
return false;
}
if (conv_m->get_shape().size() != 4)
{
return false;
}
if (conv_m->with_relu())
{
return false;
}
auto A_m = std::static_pointer_cast<op::Broadcast>(pattern_map[A_label]);
// Check if values are being broadcast along channel (2nd) dimension
auto is_channel_bcast = [](const std::shared_ptr<op::Broadcast>& bcast) {
if (bcast->get_argument(0)->get_shape().size() == 0)
{
return true;
}
if (bcast->get_argument(0)->get_shape().size() == 1 &&
bcast->get_broadcast_axes() == AxisSet{0, 2, 3})
{
return true;
}
if (bcast->get_argument(0)->get_shape().size() == 2)
{
auto input_shape = bcast->get_argument(0)->get_shape();
if (input_shape[0] == 1 && bcast->get_broadcast_axes() == AxisSet{2, 3})
return true;
}
return false;
};
if (!is_channel_bcast(A_m))
{
return false;
}
auto get_bcast_input = [](const std::shared_ptr<op::Broadcast>& bcast) {
if (bcast->get_argument(0)->get_shape().size() == 0)
{
Shape bshape{bcast->get_shape()[1]};
return std::static_pointer_cast<ngraph::Node>(
std::make_shared<op::Broadcast>(bcast->get_argument(0), bshape, AxisSet{0}));
}
if (bcast->get_argument(0)->get_shape().size() == 1)
{
return bcast->get_argument(0);
}
if (bcast->get_argument(0)->get_shape().size() == 2)
{
Shape bshape{bcast->get_argument(0)->get_shape()[1]};
return std::static_pointer_cast<ngraph::Node>(std::make_shared<op::Reshape>(
bcast->get_argument(0), AxisVector{0, 1}, bshape));
}
throw ngraph_error("Unexpected shape for bcast input");
};
auto Ac_m = get_bcast_input(A_m);
// new weights = old weights * Ac_m
// new_bias = old_bias * Ac_m;
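// As with the batch norm folding above, this relies on convolution being linear
// in its filters: a per-output-channel multiplier A_c applied to the
// ConvolutionBias result can instead be applied to the filters (broadcast along
// the non-output-channel axes) and to the bias.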
auto filters_n = std::make_shared<op::Multiply>(
pattern_map[filters],
std::make_shared<op::Broadcast>(
Ac_m, pattern_map[filters]->get_shape(), AxisSet{1, 2, 3}));
auto bias_n = std::make_shared<op::Multiply>(pattern_map[bias], Ac_m);
auto convbias_n =
std::make_shared<op::ConvolutionBias>(pattern_map[input],
filters_n,
bias_n,
conv_m->get_window_movement_strides(),
conv_m->get_window_dilation_strides(),
conv_m->get_padding_below(),
conv_m->get_padding_above(),
conv_m->get_data_dilation_strides());
ngraph::replace_node(m.get_match_root(), convbias_n);
return true;
};
auto m = std::make_shared<ngraph::pattern::Matcher>(multiply, callback);
this->add_matcher(m);
}
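The identity used here is simpler than the batch norm case: A_c * (conv(x, W_c) + b_c) == conv(x, A_c * W_c) + A_c * b_c for each output channel c. A short standalone sketch (illustration only; the scalar values are made up) of that equivalence:

#include <cassert>
#include <cmath>

int main()
{
    float conv_out = 3.0f; // conv(x, W_c) for some input, before the bias
    float b = 0.5f;        // bias for channel c
    float a = 1.7f;        // per-channel multiplier A_c

    float affine = a * (conv_out + b);       // Multiply node applied to the ConvBias output
    float folded = (conv_out * a) + (b * a); // ConvBias with filters and bias pre-scaled
    assert(std::fabs(affine - folded) < 1e-5f);
    return 0;
}

Note that the matcher roots at the Multiply, so a trailing + B (as in the convbias_affine_folding test) remains in the graph as an ordinary Add.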
@@ -64,6 +64,8 @@ public:
construct_zero_padded_conv();
construct_zero_padded_conv_backprop_filters();
construct_conv_bias_bprop();
construct_conv_bias_folded_batch_norm();
construct_conv_bias_affine_folding();
construct_batch_norm_relu();
construct_batch_norm_relu_global_stats();
construct_conv_relu();
@@ -96,4 +98,6 @@ private:
void construct_conv_add();
void construct_conv_add_relu();
void construct_bounded_relu();
void construct_conv_bias_folded_batch_norm();
void construct_conv_bias_affine_folding();
};
@@ -1671,60 +1671,40 @@ namespace ngraph
{
if (mkldnn_utils::use_mkldnn_kernel(node.get()))
{
-// pass input format to output
+const ngraph::op::Slice* slice =
+    static_cast<const ngraph::op::Slice*>(node.get());
+auto lower_bounds = slice->get_lower_bounds();
+auto result_shape = slice->get_output_shape(0);
auto input_md = mkldnn_utils::get_input_mkldnn_md(node.get(), 0);
-NGRAPH_DEBUG << "input memory format: " << input_md.data.format << "\n";
-auto result_format =
-    static_cast<mkldnn::memory::format>(input_md.data.format);
+auto input_pd = mkldnn::memory::primitive_desc(
+    input_md, runtime::cpu::mkldnn_utils::global_cpu_engine);
+auto dims = mkldnn::memory::dims(result_shape.begin(), result_shape.end());
+auto offsets =
+    mkldnn::memory::dims(lower_bounds.begin(), lower_bounds.end());
-auto slice = static_cast<ngraph::op::Slice*>(node.get());
-auto lower_bounds = slice->get_lower_bounds();
-if (result_format == mkldnn::memory::nChw16c)
+try
{
-    // check lower bound of channels
-    if (lower_bounds[1] % 16 != 0)
-    {
-        NGRAPH_DEBUG
-            << "slice nChw16c: lower bound of channels not multiple of 16, "
-               "set native layout\n";
-        set_native_layouts(external_function, node);
-        return;
-    }
+    // MKLDNN currently doesn't support views for blocked layouts
+    // when the dims and offsets are not divisible by the block size
+    auto view_md = mkldnn::view::primitive_desc(input_pd, dims, offsets)
+                       .dst_primitive_desc()
+                       .desc();
+    vector<memory::desc> o_mds;
+    o_mds.push_back(view_md);
+    set_output_layouts(node, o_mds);
}
-else if (result_format == mkldnn::memory::nChw8c)
+catch (const mkldnn::error& e)
{
-    // check lower bound of channels
-    if (lower_bounds[1] % 8 != 0)
+    if (e.status == mkldnn_unimplemented)
    {
-        NGRAPH_DEBUG
-            << "slice nChw8C: lower bound of channels not multiple of 8,"
-               "set native layout\n";
        set_native_layouts(external_function, node);
-        return;
    }
+    else
+    {
+        throw ngraph_error(e.message);
+    }
}
-vector<memory::desc> o_mds;
-if (result_format == mkldnn::memory::blocked)
-{
-    auto cpu_tvl = dynamic_pointer_cast<runtime::cpu::LayoutDescriptor>(
-        node->get_inputs()[0]
-            .get_output()
-            .get_tensor_ptr()
-            ->get_tensor_layout());
-    auto result_desc =
-        mkldnn_utils::create_blocked_mkldnn_md(node->get_output_shape(0),
-                                               cpu_tvl->get_strides(),
-                                               node->get_element_type());
-    o_mds.push_back(result_desc);
-}
-else
-{
-    auto result_desc = mkldnn_utils::create_default_mkldnn_md(
-        node.get(), 0, true, result_format);
-    o_mds.push_back(result_desc);
-}
-set_output_layouts(node, o_mds);
}
else
{
......
@@ -1612,7 +1612,7 @@ TEST(cpu_fusion, maxpool_with_indices_in_mxnet)
ASSERT_TRUE(std::dynamic_pointer_cast<op::Parameter>(mpwi_bprop->get_argument(2)));
}
TEST(cpu_fusion, batch_norm_folding)
TEST(cpu_fusion, conv_batch_norm_folding)
{
Shape shape_input{1, 8, 3, 3};
Shape shape_weights{2, 8, 1, 1};
@@ -1671,7 +1671,48 @@ TEST(cpu_fusion, batch_norm_folding)
EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0)));
}
TEST(cpu_fusion, affine_folding)
TEST(cpu_fusion, convbias_batch_norm_folding)
{
Shape shape_input{2, 8, 5, 5};
Shape shape_weights{2, 8, 2, 2};
Shape shape_norm{2};
auto make_function = [shape_input, shape_weights, shape_norm]() {
auto input = std::make_shared<op::Parameter>(element::f32, shape_input);
auto weights = std::make_shared<op::Parameter>(element::f32, shape_weights);
auto bias = std::make_shared<op::Parameter>(element::f32, Shape{2});
double eps = 1.01;
auto gamma = std::make_shared<op::Parameter>(element::f32, shape_norm);
auto beta = std::make_shared<op::Parameter>(element::f32, shape_norm);
auto mean = std::make_shared<op::Parameter>(element::f32, shape_norm);
auto var = std::make_shared<op::Parameter>(element::f32, shape_norm);
auto conv = std::make_shared<op::Convolution>(input, weights, Strides{1, 1}, Strides{1, 1});
auto convbias =
conv + std::make_shared<op::Broadcast>(bias, conv->get_shape(), AxisSet{0, 2, 3});
auto bn = std::make_shared<op::BatchNorm>(eps, gamma, beta, convbias, mean, var);
auto f = make_shared<Function>(
NodeVector{bn}, op::ParameterVector{input, weights, bias, gamma, beta, mean, var});
return f;
};
auto int_f = make_function();
auto cpu_f = make_function();
test::Uniform<float> rng(1.0f, 100.0f);
vector<vector<float>> args;
for (shared_ptr<op::Parameter> param : cpu_f->get_parameters())
{
vector<float> tensor_val(shape_size(param->get_shape()));
rng.initialize(tensor_val);
args.push_back(tensor_val);
}
auto int_results = execute(int_f, args, "INTERPRETER");
auto cpu_results = execute(cpu_f, args, "CPU");
EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0)));
}
TEST(cpu_fusion, conv_affine_folding)
{
Shape shape_input{1, 8, 3, 3};
Shape shape_weights{2, 8, 1, 1};
@@ -1728,6 +1769,48 @@ TEST(cpu_fusion, affine_folding)
EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0)));
}
TEST(cpu_fusion, convbias_affine_folding)
{
Shape shape_input{1, 6, 3, 3};
Shape shape_weights{3, 6, 1, 1};
Shape shape_norm{3};
auto make_function = [shape_input, shape_weights, shape_norm]() {
auto input = std::make_shared<op::Parameter>(element::f32, shape_input);
auto weights = std::make_shared<op::Parameter>(element::f32, shape_weights);
auto bias = std::make_shared<op::Parameter>(element::f32, Shape{3});
auto a = std::make_shared<op::Parameter>(element::f32, shape_norm);
auto b = std::make_shared<op::Parameter>(element::f32, shape_norm);
auto conv = std::make_shared<op::Convolution>(input, weights, Strides{1, 1}, Strides{1, 1});
auto convbias =
conv + std::make_shared<op::Broadcast>(bias, conv->get_shape(), AxisSet{0, 2, 3});
auto out = std::make_shared<op::Add>(
std::make_shared<op::Multiply>(
convbias, std::make_shared<op::Broadcast>(a, conv->get_shape(), AxisSet{0, 2, 3})),
std::make_shared<op::Broadcast>(b, conv->get_shape(), AxisSet{0, 2, 3}));
auto f =
make_shared<Function>(NodeVector{out}, op::ParameterVector{input, weights, bias, a, b});
return f;
};
auto int_f = make_function();
auto cpu_f = make_function();
test::Uniform<float> rng(20.0f, 300.0f);
vector<vector<float>> args;
for (shared_ptr<op::Parameter> param : cpu_f->get_parameters())
{
vector<float> tensor_val(shape_size(param->get_shape()));
rng.initialize(tensor_val);
args.push_back(tensor_val);
}
auto int_results = execute(int_f, args, "INTERPRETER");
auto cpu_results = execute(cpu_f, args, "CPU");
EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0)));
}
TEST(cpu_fusion, group_convolution_fusion)
{
Shape shape_a{1, 32, 2, 2};
......