Commit 2d2b3b2f authored by Amy Zhuang's avatar Amy Zhuang Committed by Robert Kimball

Add CPU horizontal fusion pass for inception. (#1577)

* Add CPU horizontal fusion pass for inception.

* Name change.

* Move horizontal fusion to cpu_fusion.

* Change horizontal fusion pass for inception to a general horizontal fusion pass.
Add a unit test conv_horizontal_fusion to cpu_fusion.

* Rename files.

* Correct cpu_fusion.hpp.

* Add NGRAPH_DEBUG.

* Set native layout when input format of slice is nChw16c or nChw8c and lower bound of
channels is not a multiple of 16 or 8.
parent 7da3ec33
......@@ -102,6 +102,7 @@ set(SRC
pass/cpu_collapse_dims.cpp
pass/cpu_concat_inputs.cpp
pass/cpu_fusion.cpp
pass/cpu_horizontal_fusion.cpp
pass/cpu_layout.cpp
pass/cpu_loop_kernel_fusion.cpp
pass/cpu_mat_fusion.cpp
......
......@@ -160,6 +160,7 @@
#include "ngraph/runtime/cpu/pass/cpu_collapse_dims.hpp"
#include "ngraph/runtime/cpu/pass/cpu_concat_inputs.hpp"
#include "ngraph/runtime/cpu/pass/cpu_fusion.hpp"
#include "ngraph/runtime/cpu/pass/cpu_horizontal_fusion.hpp"
#include "ngraph/runtime/cpu/pass/cpu_layout.hpp"
#include "ngraph/runtime/cpu/pass/cpu_mat_fusion.hpp"
#include "ngraph/runtime/cpu/pass/cpu_post_layout_optimizations.hpp"
......@@ -387,6 +388,7 @@ void runtime::cpu::CPU_ExternalFunction::register_common_passes(ngraph::pass::Ma
pass_manager.register_pass<ngraph::pass::CommonSubexpressionElimination>();
pass_manager.register_pass<ngraph::pass::CoreFusion>();
pass_manager.register_pass<runtime::cpu::pass::CPUFusion>();
pass_manager.register_pass<runtime::cpu::pass::CPUHorizontalFusion>();
pass_manager.register_pass<runtime::cpu::pass::CPUCollapseDims>();
NodeVector nv_cwi; // We dont need CPUWorkspaceInsertion to return list of indices
pass_manager.register_pass<runtime::cpu::pass::CPUWorkspaceInsertion>(nv_cwi, false);
......@@ -1138,7 +1140,6 @@ void runtime::cpu::CPU_ExternalFunction::build()
m_mkldnn_emitter.reset(new MKLDNNEmitter());
ngraph::pass::Manager pass_manager;
register_common_passes(pass_manager);
pass_manager.register_pass<ngraph::pass::Liveness>();
pass_manager.register_pass<ngraph::pass::MemoryLayout>(size_t(s_memory_pool_alignment), true);
pass_manager.run_passes(m_function, false);
......
......@@ -26,12 +26,14 @@
#include "ngraph/op/batch_norm.hpp"
#include "ngraph/op/broadcast.hpp"
#include "ngraph/op/broadcast.hpp"
#include "ngraph/op/concat.hpp"
#include "ngraph/op/constant.hpp"
#include "ngraph/op/convolution.hpp"
#include "ngraph/op/divide.hpp"
#include "ngraph/op/dot.hpp"
#include "ngraph/op/exp.hpp"
#include "ngraph/op/get_output_element.hpp"
#include "ngraph/op/max_pool.hpp"
#include "ngraph/op/maximum.hpp"
#include "ngraph/op/minimum.hpp"
#include "ngraph/op/multiply.hpp"
......@@ -41,6 +43,7 @@
#include "ngraph/op/relu.hpp"
#include "ngraph/op/reshape.hpp"
#include "ngraph/op/sigmoid.hpp"
#include "ngraph/op/slice.hpp"
#include "ngraph/op/sqrt.hpp"
#include "ngraph/op/subtract.hpp"
#include "ngraph/op/sum.hpp"
......
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "ngraph/runtime/cpu/pass/cpu_horizontal_fusion.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/log.hpp"
#include "ngraph/op/avg_pool.hpp"
#include "ngraph/op/concat.hpp"
#include "ngraph/op/convolution.hpp"
#include "ngraph/op/slice.hpp"
#include "ngraph/pass/graph_rewrite.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pattern/matcher.hpp"
#include "ngraph/pattern/op/label.hpp"
#include "ngraph/runtime/cpu/op/conv_bias.hpp"
using namespace ngraph;
using namespace std;
void ngraph::runtime::cpu::pass::CPUHorizontalFusion::cpu_conv_horizontal_fusion()
{
    // Horizontal fusion: several ConvolutionBias nodes that consume the SAME
    // data input are merged into a single ConvolutionBias whose filters and
    // biases are the concatenation (along the output-channel axis) of the
    // originals. Each original conv is then replaced by a Slice of the fused
    // output. Only profitable when the data node has more than one consumer.
    auto has_multiple_users = [](std::shared_ptr<Node> n) {
        auto inputs = n->get_output_inputs(0);
        return inputs.size() > 1;
    };

    // Label shapes are placeholders (typical inception sizes); the matcher
    // only constrains the predicate, not the concrete shape.
    auto data_conv = std::make_shared<pattern::op::Label>(
        element::f32, Shape{1, 256, 35, 35}, has_multiple_users);
    auto filters = std::make_shared<pattern::op::Label>(element::f32, Shape{64, 256, 1, 1});
    auto bias = std::make_shared<pattern::op::Label>(element::f32, Shape{64});

    auto conv_bias = std::make_shared<ngraph::op::ConvolutionBias>(data_conv,
                                                                   filters,
                                                                   bias,
                                                                   Strides{1, 1},
                                                                   Strides{1, 1},
                                                                   CoordinateDiff{0, 0},
                                                                   CoordinateDiff{0, 0},
                                                                   Strides{1, 1},
                                                                   true);

    pattern::graph_rewrite_callback callback = [data_conv](pattern::Matcher& m) {
        NGRAPH_DEBUG << "conv_horizontal_fusion: In a callback for conv horizontal fusion for "
                     << m.get_match_root()->get_name();
        auto conv_bias_root = std::dynamic_pointer_cast<op::ConvolutionBias>(m.get_match_root());
        if (!conv_bias_root)
        {
            return false;
        }
        //check if the node has been replaced
        if (conv_bias_root->get_users().empty())
        {
            return false;
        }

        auto m_filters_shape = conv_bias_root->get_input_shape(1);
        auto f_h = m_filters_shape[2];
        auto f_w = m_filters_shape[3];

        // get weights and bias from each CBR and create Concat nodes
        std::vector<std::shared_ptr<Node>> weights_nodes;
        std::vector<std::shared_ptr<Node>> bias_nodes;
        std::vector<std::shared_ptr<Node>> conv_bias_nodes;

        for (auto u : m.get_pattern_map()[data_conv]->get_users())
        {
            if (!pattern::has_class<ngraph::op::ConvolutionBias>()(u))
            {
                continue;
            }
            if (u->get_argument(0) != m.get_pattern_map()[data_conv])
            {
                NGRAPH_DEBUG << "conv_horizontal_fusion: data_conv is not input 0 for "
                             << u->get_name() << "\n";
                continue;
            }
            auto u_filters_shape = u->get_input_shape(1);
            if (u_filters_shape[2] != f_h || u_filters_shape[3] != f_w)
            {
                NGRAPH_DEBUG
                    << "conv_horizontal_fusion: skip conv node with different filter shape\n";
                continue;
            }
            // The fused conv reuses the root's strides/padding/dilation/relu,
            // so every candidate must agree with the root on all of them;
            // otherwise the merged convolution would compute wrong results.
            auto u_cb = std::static_pointer_cast<op::ConvolutionBias>(u);
            if (u_cb->get_window_movement_strides() !=
                    conv_bias_root->get_window_movement_strides() ||
                u_cb->get_window_dilation_strides() !=
                    conv_bias_root->get_window_dilation_strides() ||
                u_cb->get_padding_below() != conv_bias_root->get_padding_below() ||
                u_cb->get_padding_above() != conv_bias_root->get_padding_above() ||
                u_cb->get_data_dilation_strides() !=
                    conv_bias_root->get_data_dilation_strides() ||
                u_cb->with_relu() != conv_bias_root->with_relu())
            {
                NGRAPH_DEBUG
                    << "conv_horizontal_fusion: skip conv node with different filter shape\n";
                continue;
            }
            weights_nodes.push_back(u->get_argument(1));
            bias_nodes.push_back(u->get_argument(2));
            conv_bias_nodes.push_back(u);
        }

        if (conv_bias_nodes.size() <= 1)
        {
            NGRAPH_DEBUG << "conv_horizontal_fusion: need more than one nodes to do fusion\n";
            return false;
        }

        // Concatenate along the output-channel axis (axis 0 of OIHW filters
        // and of the 1-D bias).
        auto concat_weights = std::make_shared<ngraph::op::Concat>(weights_nodes, 0);
        auto concat_bias = std::make_shared<ngraph::op::Concat>(bias_nodes, 0);
        auto conv_bias_new = std::make_shared<ngraph::op::ConvolutionBias>(
            conv_bias_root->get_argument(0),
            concat_weights,
            concat_bias,
            conv_bias_root->get_window_movement_strides(),
            conv_bias_root->get_window_dilation_strides(),
            conv_bias_root->get_padding_below(),
            conv_bias_root->get_padding_above(),
            conv_bias_root->get_data_dilation_strides(),
            conv_bias_root->with_relu());
        NGRAPH_DEBUG << "conv_horizontal_fusion: new cb shape "
                     << conv_bias_new->get_output_shape(0) << "\n";

        // Replace each original conv with a channel Slice of the fused output.
        size_t index = 0;
        for (auto cb : conv_bias_nodes)
        {
            auto slice_shape = cb->get_output_shape(0);
            NGRAPH_DEBUG << "conv_horizontal_fusion: slice shape " << slice_shape << "\n";
            auto lower_bounds = Coordinate{0, index, 0, 0};
            index += slice_shape[1];
            // BUGFIX: the last coordinate must be the output WIDTH
            // (slice_shape[3]); the original used slice_shape[2] twice, which
            // only happened to work for square feature maps.
            auto upper_bounds =
                Coordinate{slice_shape[0], index, slice_shape[2], slice_shape[3]};
            NGRAPH_DEBUG << "conv_horizontal_fusion: lower_bounds " << lower_bounds << "\n";
            NGRAPH_DEBUG << "conv_horizontal_fusion: upper_bounds " << upper_bounds << "\n";
            auto slice =
                std::make_shared<ngraph::op::Slice>(conv_bias_new, lower_bounds, upper_bounds);
            ngraph::replace_node(cb, slice);
        }
        return true;
    };

    auto m = make_shared<pattern::Matcher>(conv_bias, callback);
    this->add_matcher(m);
}
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include "ngraph/pass/graph_rewrite.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace pass
{
class CPUHorizontalFusion;
}
}
}
}
// Graph-rewrite pass that fuses sibling ConvolutionBias nodes sharing the
// same data input into one wider convolution plus per-consumer Slices
// (horizontal fusion; motivated by inception-style networks where many
// branches convolve the same tensor).
class ngraph::runtime::cpu::pass::CPUHorizontalFusion : public ngraph::pass::GraphRewrite
{
public:
    CPUHorizontalFusion()
        : GraphRewrite()
    {
        // Register the convolution horizontal-fusion matcher on construction.
        cpu_conv_horizontal_fusion();
    }

private:
    // Builds the ConvolutionBias pattern and its rewrite callback, then adds
    // the matcher to this GraphRewrite pass.
    void cpu_conv_horizontal_fusion();
};
......@@ -1585,6 +1585,33 @@ namespace ngraph
auto result_format =
static_cast<mkldnn::memory::format>(input_md.data.format);
auto slice = static_cast<ngraph::op::Slice*>(node.get());
auto lower_bounds = slice->get_lower_bounds();
if (result_format == mkldnn::memory::nChw16c)
{
// check lower bound of channels
if (lower_bounds[1] % 16 != 0)
{
NGRAPH_DEBUG
<< "slice nChw16c: lower bound of channels not multiple of 16, "
"set native layout\n";
set_native_layouts(external_function, node);
return;
}
}
else if (result_format == mkldnn::memory::nChw8c)
{
// check lower bound of channels
if (lower_bounds[1] % 8 != 0)
{
NGRAPH_DEBUG
<< "slice nChw8C: lower bound of channels not multiple of 8,"
"set native layout\n";
set_native_layouts(external_function, node);
return;
}
}
vector<memory::desc> o_mds;
if (result_format == mkldnn::memory::blocked)
{
......
......@@ -890,6 +890,57 @@ TEST(cpu_fusion, conv_bias_relu_n2c1h2w2_2)
EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0)));
}
// Builds two parallel conv+bias+relu branches that both consume parameter A,
// then concatenates them — the shape the CPU horizontal-fusion pass targets.
// Verifies (1) CPU results match the INTERPRETER reference, and (2) the two
// ConvolutionBias nodes were fused into exactly one on the CPU backend.
TEST(cpu_fusion, conv_horizontal_fusion)
{
    Shape shape_a{2, 1, 6, 6};
    Shape shape_weights{1, 1, 2, 2};
    Shape shape_bias{1};
    // Factory so the INTERPRETER and CPU backends each get an identical,
    // independent graph.
    auto make_function = [shape_a, shape_weights, shape_bias]() {
        auto A = std::make_shared<op::Parameter>(element::f32, shape_a);
        // Branch 1: conv -> broadcast-add bias -> relu. The bias add via
        // Broadcast is what CPUFusion folds into ConvolutionBias before the
        // horizontal-fusion pass runs.
        auto weights1 = std::make_shared<op::Parameter>(element::f32, shape_weights);
        auto conv1 = std::make_shared<op::Convolution>(A, weights1, Strides{2, 2}, Strides{1, 1});
        auto bias1 = std::make_shared<op::Parameter>(element::f32, shape_bias);
        auto conv_bias1 =
            conv1 + std::make_shared<op::Broadcast>(bias1, conv1->get_shape(), AxisSet{0, 2, 3});
        auto relu1 = std::make_shared<op::Relu>(conv_bias1);
        // Branch 2: same structure, different weights/bias, same data input A.
        auto weights2 = std::make_shared<op::Parameter>(element::f32, shape_weights);
        auto conv2 = std::make_shared<op::Convolution>(A, weights2, Strides{2, 2}, Strides{1, 1});
        auto bias2 = std::make_shared<op::Parameter>(element::f32, shape_bias);
        auto conv_bias2 =
            conv2 + std::make_shared<op::Broadcast>(bias2, conv2->get_shape(), AxisSet{0, 2, 3});
        auto relu2 = std::make_shared<op::Relu>(conv_bias2);
        // Concatenate the branches along the channel axis.
        auto concat = std::make_shared<op::Concat>(NodeVector{relu1, relu2}, 1);
        auto f = make_shared<Function>(NodeVector{concat},
                                       op::ParameterVector{A, weights1, bias1, weights2, bias2});
        return f;
    };
    auto int_f = make_function();
    auto cpu_f = make_function();
    // args order matches the ParameterVector: A, weights1, bias1, weights2, bias2.
    vector<vector<float>> args{
        {1.25f, 2.25f, 5.25f, 6.25f, -1.25f, -1.25f, 3.25f, -4.25f, 7.25f, 8.25f, -1.25f,
         -1.25f, 1.25f, 2.25f, -3.25f, 2.25f, 4.25f, 4.25f, 1.25f, 2.25f, -4.25f, 2.25f,
         4.25f,  4.25f, 0.f,   0.f,    -1.f,  0.f,   2.f,   2.f,   0.f,   0.f,    0.f,
         0.f,    2.f,   2.f,   1.25f,  2.25f, 5.25f, 6.25f, 1.25f, 1.25f, 3.25f,  4.25f,
         -7.25f, 8.25f, 1.25f, -1.25f, -1.25f, 2.25f, 3.25f, 2.25f, -4.25f, -4.25f, -1.25f,
         -2.25f, 4.25f, 2.25f, 4.25f,  4.25f, 0.f,   0.f,   1.f,   0.f,   -2.f,   2.f,
         0.f,    0.f,   0.f,   0.f,    -2.f,  -2.f},
        {2., 2., 2., 2.},
        {0.1f},
        {3., 3., 3., 3.},
        {0.2f}};
    // Numerical correctness: fused CPU graph must match the reference backend.
    auto int_results = execute(int_f, args, "INTERPRETER");
    auto cpu_results = execute(cpu_f, args, "CPU");
    EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0)));
    // Structural check: the two ConvolutionBias nodes must have been fused
    // into a single one by the horizontal-fusion pass.
    size_t cpu_cb = count_ops_of_type<op::ConvolutionBias>(cpu_f);
    ASSERT_EQ(cpu_cb, 1);
}
// ConvolutionBiasAdd relies on an in-place fused MKLDNN kernel.
// Need to ensure that it is fused only when in-place buffer allocation is feasible
shared_ptr<Function> gen_conv_bias_add(bool param_input, bool result_output)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment