Commit 65355a17 authored by Amy Zhuang, committed by Robert Kimball

Add in-place-slice optimization for CPU backend. (#1967)

* Add in-place-slice optimization for CPU backend.

* Modify slice emitter function for in place slice.

* Allow arg node to have multiple outputs for in place slice.

* Remove unused variable.

* Add CPUExecutionContext argument to slice builder.

* Address PR feedback: move computation out of the functor.

* Move size computation out of the functor for in place concat.
parent dfc20454
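The heart of the change: when a Slice selects a contiguous, unstrided block of its input (and the input is neither a Constant nor a Parameter), the slice output can simply alias the input buffer at a byte offset derived from the lower bounds, so no copy is needed at run time. Below is a minimal standalone sketch of that offset computation, assuming row-major layout; the helper name compute_slice_byte_offset is invented for illustration and is not part of the diff.

#include <cstddef>
#include <vector>

// Byte offset of the first element selected by a slice, for a row-major tensor.
std::size_t compute_slice_byte_offset(const std::vector<std::size_t>& arg_shape,
                                      const std::vector<std::size_t>& lower_bounds,
                                      std::size_t element_size)
{
    std::size_t start = 0, accumulated = 1;
    for (int i = static_cast<int>(arg_shape.size()) - 1; i >= 0; i--)
    {
        start += lower_bounds[i] * accumulated;
        accumulated *= arg_shape[i];
    }
    return start * element_size;
}

// Example: slicing rows [1, 3) of a 4x4 f32 tensor gives start = 4,
// i.e. a byte offset of 4 * sizeof(float) = 16.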
@@ -20,6 +20,7 @@
#include "ngraph/log.hpp"
#include "ngraph/log.hpp"
#include "ngraph/op/concat.hpp"
#include "ngraph/op/slice.hpp"
#include "ngraph/pass/liveness.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/memory_layout.hpp"
@@ -44,8 +45,9 @@ bool pass::MemoryLayout::run_on_function(shared_ptr<ngraph::Function> function)
if (auto op = std::dynamic_pointer_cast<op::Op>(node))
{
// concat in_place_oi should be treated differently
if (!std::dynamic_pointer_cast<op::Concat>(node))
// concat and slice in_place_oi should be treated differently
if (!std::dynamic_pointer_cast<op::Concat>(node) &&
!std::dynamic_pointer_cast<op::Slice>(node))
{
if (auto op_annotations = op->get_op_annotations())
{
......
@@ -39,6 +39,8 @@ namespace ngraph
vector<reference_wrapper<void*>> arg_tensors;
vector<Shape> arg_shapes;
vector<size_t> arg_sizes;
auto element_size = concat->get_input_element_type(0).size();
for (auto& arg : args)
{
if (shape_size(arg.get_shape()))
@@ -46,6 +48,7 @@ namespace ngraph
arg_tensors.emplace_back(
external_function->get_tensor_data(arg.get_name()));
arg_shapes.emplace_back(arg.get_shape());
arg_sizes.emplace_back(shape_size(arg.get_shape()) * element_size);
}
}
auto nargs = args.size();
@@ -53,19 +56,18 @@ namespace ngraph
auto& out_tensor = external_function->get_tensor_data(out[0].get_name());
auto out_shape = out[0].get_shape();
auto element_size = concat->get_input_element_type(0).size();
if (auto op_annotations = concat->get_op_annotations())
{
auto in_place_oi_pairs = op_annotations->get_in_place_oi_pairs();
if (in_place_oi_pairs.size() > 0)
{
auto functor = [&, arg_tensors, nargs, out_shape, arg_shapes, element_size](
auto out_size = shape_size(out_shape) * element_size;
auto functor = [&, arg_tensors, nargs, out_size, arg_sizes](
CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
auto out_size = shape_size(out_shape) * element_size;
auto offset = 0;
for (size_t i = 0; i < nargs; i++)
{
auto arg_size = shape_size(arg_shapes[i]) * element_size;
// if the argument pointer does not fall within the concat output buffer
// (caused by propagate_in_place_output or propagate_in_place_input), we need to copy the data;
// otherwise, we can skip the copy.
@@ -75,9 +77,9 @@ namespace ngraph
{
memcpy(reinterpret_cast<char*>(out_tensor) + offset,
arg_tensors[i],
arg_size);
arg_sizes[i]);
}
offset += arg_size;
offset += arg_sizes[i];
}
};
......
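This hunk applies the PR feedback from the commit message: the shape/size arithmetic is hoisted out of the functor and the precomputed byte sizes are captured by value, so the lambda that runs per call only does pointer checks and memcpy. A rough standalone sketch of the pattern, with invented names rather than the actual CPU builder API:

#include <cstddef>
#include <cstring>
#include <functional>
#include <vector>

using ConcatFunctor = std::function<void(char* out, const std::vector<char*>& args)>;

// Build time: sizes are computed once. Run time: the functor only copies.
ConcatFunctor make_concat_functor(std::vector<std::size_t> arg_sizes, std::size_t out_size)
{
    return [arg_sizes, out_size](char* out, const std::vector<char*>& args) {
        std::size_t offset = 0;
        for (std::size_t i = 0; i < args.size(); i++)
        {
            // Skip the copy when the argument already lives inside the output
            // buffer (placed there by the in-place propagation passes).
            if (args[i] < out || args[i] >= out + out_size)
            {
                std::memcpy(out + offset, args[i], arg_sizes[i]);
            }
            offset += arg_sizes[i];
        }
    };
}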
@@ -48,6 +48,37 @@ namespace ngraph
auto lower_bounds = slice->get_lower_bounds();
auto upper_bounds = slice->get_upper_bounds();
if (auto op_annotations = slice->get_op_annotations())
{
auto in_place_oi_pairs = op_annotations->get_in_place_oi_pairs();
if (in_place_oi_pairs.size() > 0)
{
auto element_size = slice->get_input_element_type(0).size();
auto start = 0, accumulated = 1;
for (int i = arg_shape.size() - 1; i >= 0; i--)
{
start += lower_bounds[i] * accumulated;
accumulated *= arg_shape[i];
}
auto out_size = shape_size(out_shape) * element_size;
auto arg_size = shape_size(arg_shape) * element_size;
auto offset = start * element_size;
auto functor = [&, out_size, arg_size, offset](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
if (out_tensor < arg_tensor ||
out_tensor >= reinterpret_cast<char*>(arg_tensor) + arg_size)
{
memcpy(out_tensor,
reinterpret_cast<char*>(arg_tensor) + offset,
out_size);
}
};
functors.emplace_back(functor);
return;
}
}
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
......
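A worked example for the builder above (numbers invented for illustration): slicing the last two rows of a row-major 4x4 f32 tensor has lower_bounds {2, 0}, so start = 0*1 + 2*4 = 8, offset = 8 * 4 = 32 bytes, and out_size = 2 * 4 * 4 = 32 bytes. The functor's memcpy fires only when out_tensor does not already point inside [arg_tensor, arg_tensor + arg_size); normally process_in_place_slice has already rewritten the output's pool offset so that it does, and the copy is skipped.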
@@ -2010,6 +2010,32 @@ namespace ngraph
{
const ngraph::op::Slice* slice = static_cast<const ngraph::op::Slice*>(node);
if (auto op_annotations = slice->get_op_annotations())
{
auto in_place_oi_pairs = op_annotations->get_in_place_oi_pairs();
if (in_place_oi_pairs.size() > 0)
{
auto arg_shape = args[0].get_shape();
auto lower_bounds = slice->get_lower_bounds();
auto start = 0, accumulated = 1;
for (int i = arg_shape.size() - 1; i >= 0; i--)
{
start += lower_bounds[i] * accumulated;
accumulated *= arg_shape[i];
}
writer << "if (" << out[0].get_name() << " < " << args[0].get_name()
<< " || " << out[0].get_name() << " >= " << args[0].get_name()
<< " + " << args[0].get_size() * out[0].get_element_type().size()
<< ")\n";
writer.block_begin();
writer << "memcpy(" << out[0].get_name() << ", " << args[0].get_name()
<< " + " << start * out[0].get_element_type().size() << ", "
<< out[0].get_size() * out[0].get_element_type().size() << ");\n";
writer.block_end();
return;
}
}
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
auto out_shape = out[0].get_shape();
......
@@ -634,6 +634,9 @@ using namespace ngraph::runtime;
}
}
// In place slice optimization
process_in_place_slice(ordered_ops);
// In place concatenation optimization
process_in_place_concat(ordered_ops);
@@ -1085,7 +1088,7 @@ void runtime::cpu::CPU_ExternalFunction::propagate_in_place_input(
for (auto input : it->get_inputs())
{
auto c_op = std::dynamic_pointer_cast<ngraph::op::Op>(input->get_node());
if (!c_op || c_op->is_output())
if (!c_op || c_op->is_output() || dynamic_pointer_cast<ngraph::op::Slice>(c_op))
{
continue;
}
@@ -1179,7 +1182,7 @@ void runtime::cpu::CPU_ExternalFunction::propagate_in_place_output(
{
propagate_further = false;
auto arg = std::dynamic_pointer_cast<ngraph::op::Op>(it->get_node());
if (!arg)
if (!arg || std::dynamic_pointer_cast<ngraph::op::Slice>(it->get_node()))
{
break;
}
@@ -1316,6 +1319,46 @@ void runtime::cpu::CPU_ExternalFunction::propagate_in_place_concat(
}
}
//slice
void runtime::cpu::CPU_ExternalFunction::process_in_place_slice(
std::list<std::shared_ptr<Node>> nodes)
{
for (shared_ptr<Node>& node : nodes)
{
if (auto slice = std::dynamic_pointer_cast<ngraph::op::Slice>(node))
{
if (auto op_annotations = slice->get_op_annotations())
{
auto in_place_oi_pairs = op_annotations->get_in_place_oi_pairs();
if (in_place_oi_pairs.size() > 0)
{
auto arg = slice->get_argument(0);
auto input = slice->get_input_from(arg);
auto index = input->get_output().get_index();
auto input_node = std::dynamic_pointer_cast<ngraph::op::Op>(arg);
auto input_tensor = &input_node->get_output_tensor(index);
auto offset = input_tensor->get_pool_offset();
auto lower_bounds = slice->get_lower_bounds();
auto start = 0, accumulated = 1;
auto in_shape = slice->get_input_shape(0);
for (int i = in_shape.size() - 1; i >= 0; i--)
{
start += lower_bounds[i] * accumulated;
accumulated *= in_shape[i];
}
offset += node->get_element_type().size() * start;
auto output_tensor = &slice->get_output_tensor();
auto old_offset = output_tensor->get_pool_offset();
output_tensor->set_pool_offset(offset);
NGRAPH_DEBUG << "cpu_external_function: change offset, old offset is "
<< old_offset << ", new offset is " << offset << std::endl;
}
}
}
}
}
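To make the offset bookkeeping concrete (illustrative numbers, not from the diff): if the slice input tensor sits at pool offset 1024, the input is a 4x4 f32 tensor, and lower_bounds is {1, 0}, then start = 4 and the slice output tensor's pool offset is rewritten to 1024 + 4 * 4 = 1040, so consumers of the slice read directly out of the producer's buffer without a copy.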
void runtime::cpu::CPU_ExternalFunction::build()
{
if (m_is_built)
@@ -1378,6 +1421,9 @@ void runtime::cpu::CPU_ExternalFunction::build()
// Build executor
// In place slice optimization
process_in_place_slice(m_function->get_ordered_ops());
// In place concatenation optimization
process_in_place_concat(m_function->get_ordered_ops());
......
@@ -205,6 +205,10 @@ namespace ngraph
// For a chain of concat ops, propagate memory pool offsets
void propagate_in_place_concat(std::shared_ptr<ngraph::op::Concat> concat);
// Find in-place slice ops and set appropriate memory pool offset for its output
void process_in_place_slice(std::list<std::shared_ptr<Node>> nodes);
bool computes_result(Node* node);
void release_function() { m_function = nullptr; }
#if !defined(NGRAPH_DEX_ONLY)
......
@@ -52,6 +52,7 @@
#include "ngraph/graph_util.hpp"
#include "ngraph/op/concat.hpp"
#include "ngraph/op/constant.hpp"
#include "ngraph/op/slice.hpp"
#include "ngraph/runtime/cpu/cpu_op_annotations.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
@@ -219,5 +220,87 @@ bool runtime::cpu::pass::CPUMemoryOptimization::run_on_function(std::shared_ptr<
}
}
}
for (auto n : function->get_ordered_ops())
{
if (auto slice = std::dynamic_pointer_cast<op::Slice>(n))
{
auto in_shape = slice->get_input_shape(0);
auto out_shape = slice->get_output_shape(0);
auto strides = slice->get_strides();
auto lower_bounds = slice->get_lower_bounds();
auto upper_bounds = slice->get_upper_bounds();
auto arg = slice->get_argument(0);
if (std::dynamic_pointer_cast<op::Constant>(arg) ||
std::dynamic_pointer_cast<op::Parameter>(arg))
{
NGRAPH_DEBUG << "cpu_memory_optimization: " << arg->get_name()
<< ": constant or parameter, no in place slice";
continue;
}
if (is_strided(strides))
{
NGRAPH_DEBUG << "cpu_memory_optimization: strided slice, no in place slice";
continue;
}
auto product = 1;
int axis = in_shape.size() - 1;
for (int i = in_shape.size() - 1; i >= 0; i--)
{
if (in_shape[i] != out_shape[i])
{
axis = i;
break;
}
}
for (int i = 0; i < axis; i++)
{
product *= in_shape[i];
}
if (product != 1)
{
NGRAPH_DEBUG << "cpu_memory_optimization: The product of input shape "
"before slice axis is not 1, no in place slice";
continue;
}
// check if input and output formats are the same
auto output_md = mkldnn_utils::get_output_mkldnn_md(n.get(), 0);
auto output_format = static_cast<mkldnn::memory::format>(output_md.data.format);
auto input_md = mkldnn_utils::get_input_mkldnn_md(n.get(), 0);
auto input_format = static_cast<mkldnn::memory::format>(input_md.data.format);
if (output_format != input_format)
{
NGRAPH_DEBUG << "cpu_memory_optimization: input format is different from "
"output format, no in place slice";
continue;
}
// check if input layout is padded
AxisVector axis_list = ngraph::get_default_order(in_shape);
if (mkldnn_utils::is_mkldnn_padded_layout(input_md, axis_list))
{
NGRAPH_DEBUG << "cpu_memory_optimization: padded input layout, no in place slice";
continue;
}
auto op_annotations = slice->get_op_annotations();
if (op_annotations)
{
op_annotations->add_in_place_oi_pair({0, 0, false});
}
else
{
op_annotations = std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
op_annotations->add_in_place_oi_pair({0, 0, false});
slice->set_op_annotations(op_annotations);
}
}
}
return false;
}
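The decisive condition in this pass is that every dimension before the innermost sliced axis collapses to extent 1, which guarantees the selected elements form one contiguous row-major block. A small standalone restatement of that check (sketch only; the helper name slice_is_contiguous is invented):

#include <cstddef>
#include <vector>

// Can a slice from in_shape to out_shape alias its input buffer in place?
bool slice_is_contiguous(const std::vector<std::size_t>& in_shape,
                         const std::vector<std::size_t>& out_shape)
{
    // Innermost axis whose extent changes, scanning from the last dimension,
    // mirroring the loop in the pass.
    int axis = static_cast<int>(in_shape.size()) - 1;
    for (int i = static_cast<int>(in_shape.size()) - 1; i >= 0; i--)
    {
        if (in_shape[i] != out_shape[i])
        {
            axis = i;
            break;
        }
    }
    // Every dimension before that axis must have extent 1.
    std::size_t product = 1;
    for (int i = 0; i < axis; i++)
    {
        product *= in_shape[i];
    }
    return product == 1;
}

// e.g. {1, 4, 4} -> {1, 2, 4} is contiguous, so the pass adds the in-place pair;
//      {2, 4, 4} -> {2, 2, 4} is not, so the slice kernel keeps copying.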
@@ -1138,6 +1138,31 @@ NGRAPH_TEST(${BACKEND_NAME}, slice_vector)
EXPECT_EQ((vector<float>{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}), read_vector<float>(result));
}
NGRAPH_TEST(${BACKEND_NAME}, slice_matrix_axis_0_overlap)
{
Shape shape_a{4, 4};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
auto B = make_shared<op::Parameter>(element::f32, shape_a);
auto C = make_shared<op::Add>(A, B);
Shape shape_r{2, 4};
auto D = make_shared<op::Slice>(C, Coordinate{0, 0}, Coordinate{2, 4});
auto E = make_shared<op::Slice>(C, Coordinate{1, 0}, Coordinate{3, 4});
auto r = make_shared<op::Add>(D, E);
auto f = make_shared<Function>(r, op::ParameterVector{A, B});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
auto a = backend->create_tensor(element::f32, shape_a);
copy_data(a, vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
auto b = backend->create_tensor(element::f32, shape_a);
copy_data(b, vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
auto result = backend->create_tensor(element::f32, shape_r);
backend->call_with_validate(f, {result}, {a, b});
EXPECT_EQ((vector<float>{12, 16, 20, 24, 28, 32, 36, 40}), read_vector<float>(result));
}
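The expected values can be checked by hand (worked out here for clarity, not part of the test): C = A + B is {2, 4, ..., 32} in row-major order, D takes rows 0-1 of C ({2, 4, 6, 8, 10, 12, 14, 16}), E takes rows 1-2 ({10, 12, 14, 16, 18, 20, 22, 24}), and D + E = {12, 16, 20, 24, 28, 32, 36, 40}. The two slices share row 1 of C, so both outputs can alias overlapping regions of the same buffer, which is presumably the scenario this new test targets.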
NGRAPH_TEST(${BACKEND_NAME}, slice_matrix_strided)
{
Shape shape_a{4, 4};
......