Commit 35073346 authored by Jayaram Bobba's avatar Jayaram Bobba Committed by Scott Cyphers

Convert layout early to get more efficient MKLDNN kernels (#2071)

* hacky version of reshape+convertlayout reordering

* Cleaned up reshape+convertlayout and added unit test

* fix pass-through check

* style fix

* fix ParameterVector

* reduce random input range
parent 23c0c2fa
......@@ -170,3 +170,96 @@ void ngraph::runtime::cpu::pass::CPUPostLayoutOptimizations::construct_slice_con
auto m = make_shared<pattern::Matcher>(cvt_lt, callback);
this->add_matcher(m);
}
// Reshape(transpose) + ConvertLayout
// MKLDNN has more efficient ConvertLayout kernels for named/non-padded formats
// If a transpose is converting a padded format into a generic padded/blocked format, it is better
// to ConvertLayout first and then do the transpose
// E.g.,
// Shape{10, 20, 30, 40} --(Reshape)--> Shape{10, 40, 20, 30} --(ConvertLayout)--> Shape{10, 40, 20, 30}
// is changed to
// Shape{10, 20, 30, 40} --(ConvertLayout)--> Shape{10, 20, 30, 40} --(Reshape)--> Shape{10, 40, 20, 30}
// The new ConvertLayout op computes the desired output layout (out_md) directly from
// input layout using a rotated out_md
void ngraph::runtime::cpu::pass::CPUPostLayoutOptimizations::
    construct_reshape_convertLayout_fusion()
{
    // Pattern: any 4D f32 input --> Reshape --> ConvertLayout. The concrete
    // shapes/orders in the labels are placeholders; the matcher only keys off
    // the op types.
    auto input = std::make_shared<pattern::op::Label>(element::f32, Shape{1, 1, 1, 1});
    auto reshape =
        std::make_shared<ngraph::op::Reshape>(input, AxisVector{0, 1, 2, 3}, Shape{1, 1, 1, 1});
    auto lt_desc =
        std::make_shared<runtime::cpu::LayoutDescriptor>(*reshape->get_output_tensor_ptr());
    auto cvt_lt = std::make_shared<runtime::cpu::op::ConvertLayout>(reshape, lt_desc);
    pattern::graph_rewrite_callback callback = [](pattern::Matcher& m) {
        // NOTE: fixed typo in the debug message (was "converLayout")
        NGRAPH_DEBUG << "In a callback for construct_reshape_convertLayout against "
                     << m.get_match_root()->get_name();
        auto cvt_lt_m = static_pointer_cast<runtime::cpu::op::ConvertLayout>(m.get_match_root());
        auto reshape_m = static_pointer_cast<ngraph::op::Reshape>(cvt_lt_m->get_argument(0));
        // Only rewrite if the ConvertLayout is the reshape's sole consumer;
        // otherwise other users would lose the reshape's original layout.
        if (reshape_m->get_users().size() > 1)
        {
            NGRAPH_DEBUG << "ReshapeConvertLayout: Reshape has multiple users";
            return false;
        }
        if (!reshape_m->get_is_transpose())
        {
            NGRAPH_DEBUG << "ReshapeConvertLayout: Reshape is not a transpose";
            return false;
        }
        // The reshape must be in-place (pass-through) so that swapping it with
        // the layout conversion does not change what data movement happens.
        if (reshape_m->get_op_annotations()->get_in_place_oi_pairs().size() == 0)
        {
            NGRAPH_DEBUG << "ReshapeConvertLayout: Reshape is not pass-through";
            return false;
        }
        // Only profitable when the transpose produced a generic blocked AND
        // padded layout (the slow case for MKLDNN ConvertLayout kernels).
        auto reshape_m_md = runtime::cpu::mkldnn_utils::get_output_mkldnn_md(reshape_m.get(), 0);
        if (reshape_m_md.data.format != mkldnn_blocked ||
            !runtime::cpu::mkldnn_utils::is_mkldnn_padded_layout(
                reshape_m_md, ngraph::get_default_order(reshape_m->get_shape())))
        {
            NGRAPH_DEBUG << "ReshapeConvertLayout: Reshape is not creating a blocked/padded layout";
            return false;
        }
        // Rotate output layout to the pre-transposed order
        auto out_md = runtime::cpu::mkldnn_utils::get_output_mkldnn_md(cvt_lt_m.get(), 0);
        auto reshape_order = reshape_m->get_input_order();
        // Get the inverse of the original transpose order
        // E.g., [0, 3, 1, 2] -> [0, 2, 3, 1]
        AxisVector inverse_order;
        for (size_t i = 0; i < reshape_order.size(); i++)
        {
            inverse_order.push_back(std::find(reshape_order.begin(), reshape_order.end(), i) -
                                    reshape_order.begin());
        }
        // New ConvertLayout on the reshape's input, targeting the rotated
        // (pre-transpose) version of the final desired layout.
        auto rotated_md = runtime::cpu::mkldnn_utils::rotate_blocked_md(out_md, inverse_order);
        auto rotated_lt_desc = std::make_shared<runtime::cpu::LayoutDescriptor>(
            *reshape_m->get_argument(0)->get_output_tensor_ptr());
        rotated_lt_desc->set_mkldnn_md(rotated_md);
        auto cvt_lt_n = std::make_shared<runtime::cpu::op::ConvertLayout>(
            reshape_m->get_argument(0), 0, rotated_lt_desc);
        cvt_lt_n->set_op_annotations(cvt_lt_m->get_op_annotations());
        // New Reshape after the conversion; its output carries the original
        // ConvertLayout's layout (out_md) so downstream consumers are unchanged.
        auto reshape_n =
            std::make_shared<ngraph::op::Reshape>(cvt_lt_n, reshape_order, cvt_lt_m->get_shape());
        auto reshape_n_layout = std::make_shared<ngraph::runtime::cpu::LayoutDescriptor>(
            *reshape_n->get_output_tensor_ptr());
        reshape_n_layout->set_mkldnn_md(out_md);
        reshape_n->get_output_tensor_ptr()->set_tensor_layout(reshape_n_layout);
        reshape_n->set_op_annotations(reshape_m->get_op_annotations());
        ngraph::replace_node(cvt_lt_m, reshape_n);
        NGRAPH_DEBUG << "ReshapeConvertLayout: Reordering reshape and convertlayout for faster "
                        "MKLDNN kernels";
        return true;
    };
    auto m = make_shared<pattern::Matcher>(cvt_lt, callback);
    this->add_matcher(m);
}
......@@ -39,7 +39,9 @@ public:
{
construct_weight_fusion();
construct_slice_convertLayout_fusion();
construct_reshape_convertLayout_fusion();
}
void construct_weight_fusion();
void construct_slice_convertLayout_fusion();
void construct_reshape_convertLayout_fusion();
};
......@@ -54,6 +54,28 @@ public:
}
};
static void compare_backends(std::shared_ptr<Function>& f1,
std::shared_ptr<Function>& f2,
const string backend1,
const string backend2)
{
test::Uniform<float> rng(-1.0f, 1.0f);
vector<vector<float>> args;
for (shared_ptr<op::Parameter> param : f1->get_parameters())
{
vector<float> tensor_val(shape_size(param->get_shape()));
rng.initialize(tensor_val);
args.push_back(tensor_val);
}
auto f1_results = execute(f1, args, backend1);
auto f2_results = execute(f2, args, backend2);
for (size_t i = 0; i < f1_results.size(); i++)
{
EXPECT_TRUE(test::all_close(f1_results.at(i), f2_results.at(i)));
}
}
TEST(cpu_test, unhandled_op)
{
auto A = make_shared<op::Parameter>(element::f32, Shape{});
......@@ -579,6 +601,27 @@ TEST(cpu_test, convert_layout)
}
}
TEST(cpu_test, post_layout_reshape_convertlayout)
{
    // Build Convolution -> Reshape(transpose) so the CPU post-layout pass has
    // a reshape+convertlayout pair to reorder; compare against INTERPRETER.
    auto build_graph = []() -> std::shared_ptr<Function> {
        auto data = make_shared<op::Parameter>(element::f32, Shape{1, 2, 3, 4});
        auto filters = make_shared<op::Parameter>(element::f32, Shape{5, 2, 1, 1});
        auto conv = make_shared<op::Convolution>(data,
                                                 filters,
                                                 Strides{1, 1},
                                                 Strides{1, 1},
                                                 CoordinateDiff{0, 0},
                                                 CoordinateDiff{0, 0},
                                                 Strides{1, 1});
        // Transpose the conv output from NCHW to NHWC
        auto transposed =
            make_shared<op::Reshape>(conv, AxisVector{0, 2, 3, 1}, Shape{1, 3, 4, 5});
        return make_shared<Function>(NodeVector{transposed}, ParameterVector{data, filters});
    };
    auto int_f = build_graph();
    auto cpu_f = build_graph();
    compare_backends(int_f, cpu_f, "INTERPRETER", "CPU");
}
TEST(cpu_test, mkldnn_layouts_eltwise)
{
Shape input_shape{3, 11, 14, 14};
......@@ -595,15 +638,5 @@ TEST(cpu_test, mkldnn_layouts_eltwise)
auto int_f = make_function();
auto cpu_f = make_function();
std::vector<float> input_vec(shape_size(input_shape));
std::vector<float> filter_vec(shape_size(filter_shape));
test::Uniform<float> rand_gen(-1, 1);
rand_gen.initialize(input_vec);
rand_gen.initialize(filter_vec);
vector<vector<float>> args{input_vec, filter_vec};
auto int_results = execute(int_f, args, "INTERPRETER");
auto cpu_results = execute(cpu_f, args, "CPU");
EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0)));
compare_backends(int_f, cpu_f, "INTERPRETER", "CPU");
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment