Commit 39700785 authored by Amy Zhuang's avatar Amy Zhuang Committed by Scott Cyphers

Use MKLDNN for non-strided slice ops. (#1575)

* Use MKLDNN for non-strided slice ops.

* Remove unused variables.

* Add utility function is_strided to graph_util.
Add datatype check for slice in cpu_assignment.

* Create blocked mkldnn memory desc for output when input format is blocked.

* Add slice convertLayout fusion to cpu_post_layout_optimizations.

* Rename tensor_view.
Fix a bug.

* Rename a variable.
parent 202fc714
......@@ -553,3 +553,8 @@ bool ngraph::possibly_overwritten(Node* node)
}
return false;
}
// Returns true when the slice is strided, i.e. any stride differs from the
// unit stride of 1.
bool ngraph::is_strided(const Strides& strides)
{
    return !std::all_of(
        strides.begin(), strides.end(), [](size_t axis_stride) { return axis_stride == 1; });
}
......@@ -313,4 +313,6 @@ namespace ngraph
// Return true if a node's user could potentially overwrite
// the output of this node with in-place kernels
bool possibly_overwritten(Node* node);
bool is_strided(const Strides& strides);
}
......@@ -48,19 +48,30 @@ namespace ngraph
auto lower_bounds = slice->get_lower_bounds();
auto upper_bounds = slice->get_upper_bounds();
bool strided = false;
for (auto stride : strides)
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
if (stride != 1)
{
strided = true;
break;
}
}
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto input_desc = mkldnn_utils::get_input_mkldnn_md(node, 0);
auto result_desc = mkldnn_utils::get_output_mkldnn_md(node, 0);
auto slice_index = mkldnn_emitter->build_slice(
input_desc, result_desc, lower_bounds, out_shape);
auto& deps = mkldnn_emitter->get_primitive_deps(slice_index);
if (strided)
auto functor = [&, slice_index](CPURuntimeContext* ctx) {
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], out_tensor);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, slice_index);
};
functors.emplace_back(functor);
}
else
{
if (is_strided(strides))
{
std::function<decltype(runtime::cpu::kernel::strided_slice<float, 2>)> kernel;
std::function<decltype(runtime::cpu::kernel::strided_slice<float, 2>)>
kernel;
SELECT_KERNEL_BY_RANK(kernel,
args[0].get_element_type(),
......@@ -89,13 +100,14 @@ namespace ngraph
arg_shape.size(),
runtime::cpu::kernel::slice);
auto functor =
[&, kernel, arg_shape, out_shape, lower_bounds](CPURuntimeContext* ctx) {
auto functor = [&, kernel, arg_shape, out_shape, lower_bounds](
CPURuntimeContext* ctx) {
kernel(arg_tensor, out_tensor, arg_shape, out_shape, lower_bounds);
};
functors.emplace_back(functor);
}
}
}
REGISTER_OP_BUILDER(Slice);
}
......
......@@ -1942,6 +1942,32 @@ namespace ngraph
{
const ngraph::op::Slice* slice = static_cast<const ngraph::op::Slice*>(node);
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
auto out_shape = out[0].get_shape();
auto lower_bounds = slice->get_lower_bounds();
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto input_desc = mkldnn_utils::get_input_mkldnn_md(node, 0);
auto result_desc = mkldnn_utils::get_output_mkldnn_md(node, 0);
auto slice_index = mkldnn_emitter->build_slice(
input_desc, result_desc, lower_bounds, out_shape);
auto& deps = mkldnn_emitter->get_primitive_deps(slice_index);
writer.block_begin();
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[0])
<< ", " << args[0].get_name() << ");\n";
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[1])
<< ", " << out[0].get_name() << ");\n";
writer << "cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, "
<< to_string(slice_index) << ");\n";
writer.block_end();
return;
}
writer.block_begin();
#if USE_EIGEN_CORE_INLINE == 1
size_t arg_rank = args[0].get_shape().size();
......
......@@ -1113,6 +1113,37 @@ size_t MKLDNNEmitter::build_concat(const std::vector<mkldnn::memory::desc>& inpu
return concat_index;
}
// Builds an MKLDNN primitive chain that implements a non-strided Slice:
// a view into the input memory at `lower_bounds` with extent `result_shape`,
// copied into the result memory by a reorder primitive.
//
// input_desc    - memory descriptor of the full input tensor
// result_desc   - memory descriptor of the sliced output tensor
// lower_bounds  - per-axis start offsets of the slice within the input
// result_shape  - per-axis sizes of the slice (equals the output shape)
//
// Returns the index of the reorder primitive; its dependencies are recorded
// as {input memory index, result memory index} in m_primitive_deps so callers
// can bind the actual tensor pointers at run time.
size_t MKLDNNEmitter::build_slice(const mkldnn::memory::desc& input_desc,
                                  const mkldnn::memory::desc& result_desc,
                                  const ngraph::Coordinate& lower_bounds,
                                  const ngraph::Shape& result_shape)
{
    std::vector<size_t> in_out_index;
    mkldnn::memory::primitive_desc input_pd =
        mkldnn::memory::primitive_desc(input_desc, runtime::cpu::mkldnn_utils::global_cpu_engine);
    size_t input_index = build_memory_primitive(input_desc);

    // Describe the sub-region of the input as an MKLDNN "view": same
    // underlying memory, restricted to `result_shape` starting at `offsets`.
    auto dims = mkldnn::memory::dims(result_shape.begin(), result_shape.end());
    auto offsets = mkldnn::memory::dims(lower_bounds.begin(), lower_bounds.end());
    auto view_pd = mkldnn::view::primitive_desc(input_pd, dims, offsets).dst_primitive_desc();

    mkldnn::memory::primitive_desc result_pd =
        mkldnn::memory::primitive_desc(result_desc, runtime::cpu::mkldnn_utils::global_cpu_engine);
    size_t result_index = build_memory_primitive(result_desc);

    // reorder primitive descriptor: copies from the view of the input into
    // the (possibly differently laid out) result memory.
    mkldnn::reorder::primitive_desc reorder_pd =
        mkldnn::reorder::primitive_desc(view_pd, result_pd);
    // reorder primitive
    size_t reorder_index = insert_primitive(new mkldnn::reorder(
        reorder_pd, *m_mkldnn_primitives[input_index], *m_mkldnn_primitives[result_index]));

    in_out_index.push_back(input_index);
    in_out_index.push_back(result_index);
    m_primitive_deps[reorder_index] = in_out_index;
    return reorder_index;
}
size_t MKLDNNEmitter::build_softmax_forward(const mkldnn::memory::desc& input_desc,
const mkldnn::memory::desc& result_desc,
int softmax_axis)
......
......@@ -515,6 +515,11 @@ namespace ngraph
const mkldnn::memory::desc& result_desc,
const size_t concat_dim);
size_t build_slice(const mkldnn::memory::desc& input_desc,
const mkldnn::memory::desc& result_desc,
const ngraph::Coordinate& lower_bounds,
const ngraph::Shape& result_shape);
size_t build_softmax_forward(const mkldnn::memory::desc& input_desc,
const mkldnn::memory::desc& result_desc,
int softmax_axis);
......
......@@ -34,6 +34,7 @@
#include "ngraph/op/max_pool.hpp"
#include "ngraph/op/relu.hpp"
#include "ngraph/op/replace_slice.hpp"
#include "ngraph/op/slice.hpp"
#include "ngraph/op/softmax.hpp"
#include "ngraph/runtime/cpu/cpu_op_annotations.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
......@@ -669,6 +670,20 @@ namespace ngraph
}
}
// Decides whether a Slice op should run on the MKLDNN kernel path.
// The MKLDNN slice (view + reorder) only supports unit strides, and this
// assignment is limited to f32 inputs.
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::Slice)
{
    auto slice = static_cast<op::Slice*>(node);
    // Bail out for strided slices or non-f32 element types; those fall back
    // to the reference/Eigen kernels.
    if (is_strided(slice->get_strides()) ||
        node->get_input_element_type(0) != element::f32)
    {
        return;
    }
    auto annotations = std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
    annotations->set_mkldnn_op(true);
    slice->set_op_annotations(annotations);
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::QuantizedMaxPool)
{
......@@ -682,6 +697,7 @@ namespace ngraph
quantized_mp->set_op_annotations(op_annotations);
}
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::QuantizedAvgPool)
{
......@@ -814,6 +830,7 @@ static const runtime::cpu::pass::AssignOpMap s_dispatcher{
{TI(ngraph::op::QuantizedAvgPool),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::QuantizedAvgPool>},
{TI(ngraph::op::Softmax), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Softmax>},
{TI(ngraph::op::Slice), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Slice>},
{TI(ngraph::op::Quantize), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Quantize>},
{TI(ngraph::op::ReplaceSlice),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::ReplaceSlice>},
......
......@@ -40,6 +40,7 @@
#include "ngraph/op/reshape.hpp"
#include "ngraph/op/result.hpp"
#include "ngraph/op/sigmoid.hpp"
#include "ngraph/op/slice.hpp"
#include "ngraph/op/softmax.hpp"
#include "ngraph/runtime/cpu/cpu_layout_descriptor.hpp"
#include "ngraph/runtime/cpu/cpu_op_annotations.hpp"
......@@ -1583,6 +1584,45 @@ namespace ngraph
}
}
// Chooses the output memory layout for a Slice op. When the op runs on the
// MKLDNN path, the input's memory format is propagated to the output so the
// view+reorder slice primitive sees matching layouts; otherwise the op keeps
// native (row-major) layouts.
template <>
void CPULayout::LAYOUT_DECL(ngraph::op::Slice)
{
    if (mkldnn_utils::use_mkldnn_kernel(node.get()))
    {
        // pass input format to output
        auto input_md = mkldnn_utils::get_input_mkldnn_md(node.get(), 0);
        NGRAPH_DEBUG << "input memory format: " << input_md.data.format << "\n";
        auto result_format =
            static_cast<mkldnn::memory::format>(input_md.data.format);
        vector<memory::desc> o_mds;
        if (result_format == mkldnn::memory::blocked)
        {
            // "blocked" is not a named format, so a default md cannot be
            // created from it; build an explicit blocked md for the output
            // from the input tensor layout's strides instead.
            auto cpu_tvl = dynamic_pointer_cast<runtime::cpu::LayoutDescriptor>(
                node->get_inputs()[0]
                    .get_output()
                    .get_tensor_ptr()
                    ->get_tensor_layout());
            auto result_desc =
                mkldnn_utils::create_blocked_mkldnn_md(node->get_output_shape(0),
                                                       cpu_tvl->get_strides(),
                                                       node->get_element_type());
            o_mds.push_back(result_desc);
        }
        else
        {
            // Named format: reuse it directly for the output descriptor.
            auto result_desc = mkldnn_utils::create_default_mkldnn_md(
                node.get(), 0, true, result_format);
            o_mds.push_back(result_desc);
        }
        set_output_layouts(node, o_mds);
    }
    else
    {
        set_native_layouts(external_function, node);
    }
}
template <>
void CPULayout::LAYOUT_DECL(ngraph::op::Concat)
{
......@@ -1783,6 +1823,7 @@ static const runtime::cpu::pass::LayoutOpMap s_dispatcher{
{TI(ngraph::op::ConvolutionAdd),
&runtime::cpu::pass::CPULayout::layout<ngraph::op::ConvolutionAdd>},
{TI(ngraph::op::Dequantize), &runtime::cpu::pass::CPULayout::layout<ngraph::op::Dequantize>},
{TI(ngraph::op::Slice), &runtime::cpu::pass::CPULayout::layout<ngraph::op::Slice>},
{TI(ngraph::op::Quantize), &runtime::cpu::pass::CPULayout::layout<ngraph::op::Quantize>},
};
......
......@@ -24,12 +24,15 @@
#include "ngraph/op/convolution.hpp"
#include "ngraph/op/parameter.hpp"
#include "ngraph/op/reshape.hpp"
#include "ngraph/op/slice.hpp"
#include "ngraph/pass/graph_rewrite.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pattern/matcher.hpp"
#include "ngraph/pattern/op/label.hpp"
#include "ngraph/pattern/op/skip.hpp"
#include "ngraph/runtime/cpu/cpu_layout_descriptor.hpp"
#include "ngraph/runtime/cpu/cpu_op_annotations.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#include "ngraph/runtime/cpu/op/convert_layout.hpp"
#include "ngraph/runtime/cpu/pass/cpu_post_layout_optimizations.hpp"
......@@ -116,3 +119,54 @@ void ngraph::runtime::cpu::pass::CPUPostLayoutOptimizations::construct_weight_fu
auto m = make_shared<pattern::Matcher>(conv, callback);
this->add_matcher(m);
}
// Fuses a Slice followed by a ConvertLayout into a single MKLDNN slice whose
// output already carries the ConvertLayout's target layout. The MKLDNN slice
// is implemented as a reorder, so folding the layout conversion into it
// removes one reorder from the graph.
void ngraph::runtime::cpu::pass::CPUPostLayoutOptimizations::construct_slice_convertLayout_fusion()
{
    // Pattern: Slice(param) -> ConvertLayout. The concrete shapes/bounds here
    // are placeholders for the matcher; the callback reads the matched op's
    // actual bounds and strides.
    auto param = std::make_shared<pattern::op::Label>(element::f32, Shape{1, 576, 17, 17});
    auto slice = std::make_shared<ngraph::op::Slice>(
        param, Coordinate{0, 0, 0, 0}, Coordinate{1, 192, 17, 17});
    auto tvt = slice->get_outputs().at(0).get_tensor_ptr().get();
    auto lt_desc = std::make_shared<runtime::cpu::LayoutDescriptor>(*tvt);
    auto cvt_lt = std::make_shared<runtime::cpu::op::ConvertLayout>(slice, lt_desc);
    pattern::graph_rewrite_callback callback = [param](pattern::Matcher& m) {
        NGRAPH_DEBUG << "In a callback for construct_slice_converLayout against "
                     << m.get_match_root()->get_name();
        auto m_cvt_lt = m.get_match_root();
        auto m_slice = m_cvt_lt->get_argument(0);
        auto slice_ptr = static_cast<const ngraph::op::Slice*>(m_slice.get());
        // do the fusion if slice has 1 user and uses mkldnn kernel.
        if (!runtime::cpu::mkldnn_utils::use_mkldnn_kernel(slice_ptr) ||
            m_slice->get_users().size() != 1)
        {
            return false;
        }
        for (auto u : m.get_pattern_map()[param]->get_users())
        {
            // Only rewrite the matched slice; the param may have other users.
            if (u != m_slice)
            {
                continue;
            }
            // Rebuild the slice with identical bounds/strides...
            auto new_slice = std::make_shared<ngraph::op::Slice>(m_slice->get_argument(0),
                                                                 slice_ptr->get_lower_bounds(),
                                                                 slice_ptr->get_upper_bounds(),
                                                                 slice_ptr->get_strides());
            auto op_annotations = std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
            op_annotations->set_mkldnn_op(true);
            new_slice->set_op_annotations(op_annotations);
            // ...but give its output the ConvertLayout's output md, so the
            // slice's reorder produces the converted layout directly.
            auto tv = new_slice->get_output_tensor_ptr(0);
            auto layout = std::make_shared<ngraph::runtime::cpu::LayoutDescriptor>(*tv);
            layout->set_mkldnn_md(mkldnn_utils::get_output_mkldnn_md(m_cvt_lt.get(), 0));
            tv->set_tensor_layout(layout);
            // Replace the ConvertLayout (match root) with the fused slice.
            ngraph::replace_node(m_cvt_lt, new_slice);
        }
        return true;
    };
    auto m = make_shared<pattern::Matcher>(cvt_lt, callback);
    this->add_matcher(m);
}
......@@ -38,6 +38,8 @@ public:
: GraphRewrite()
{
construct_weight_fusion();
construct_slice_convertLayout_fusion();
}
void construct_weight_fusion();
void construct_slice_convertLayout_fusion();
};
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment