Commit 02d4aa59 authored by Amy Zhuang's avatar Amy Zhuang Committed by Robert Kimball

Enable in place slice when slice's arg is function input. (#2214)

* Enable in place slice when slice's arg is function input.

* Add a corner case.

* Add codegen support.

* Set the correct offset for in-place-slice when there is a chain of in place ops starting from parameter.
parent 1234eb97
......@@ -733,6 +733,7 @@ using namespace ngraph::runtime;
for (size_t i = 0; i < param->get_output_size(); ++i)
{
shared_ptr<descriptor::Tensor> tv = param->get_output_tensor_ptr(i);
function_input_name_index[tv->get_name()] = arg_index;
const element::Type& et = tv->get_element_type();
string type = et.c_type_string();
stringstream ss;
......@@ -756,6 +757,16 @@ using namespace ngraph::runtime;
for (descriptor::Tensor* tensor : node->liveness_new_list)
{
stringstream ss;
auto ele = m_variable_input_index_offset_map.find(tensor->get_name());
if (ele != m_variable_input_index_offset_map.end())
{
ss << "(((" << tensor->get_element_type().c_type_string() << "*)(inputs["
<< ele->second.first << "])) + "
<< ele->second.second / tensor->get_element_type().size() << ")";
m_variable_name_map[tensor->get_name()] = ss.str();
}
else
{
ss << "((" << tensor->get_element_type().c_type_string()
<< "*)(pool_base_ptr + " << tensor->get_pool_offset() << "))";
if (m_tensor_roles.find(tensor->get_name()) == m_tensor_roles.end())
......@@ -766,6 +777,7 @@ using namespace ngraph::runtime;
}
}
}
}
// Add outputs to the variable name map
for (size_t i = 0; i < current_function->get_output_size(); ++i)
......@@ -1395,14 +1407,106 @@ void runtime::cpu::CPU_ExternalFunction::process_in_place_slice(
auto arg = input->get_output().get_node();
auto index = input->get_output().get_index();
auto input_tensor = &arg->get_output_tensor(index);
auto offset = input_tensor->get_pool_offset();
auto lower_bounds = slice->get_lower_bounds();
auto start = 0, accumulated = 1;
auto in_shape = slice->get_input_shape(0);
for (int i = in_shape.size() - 1; i >= 0; i--)
{
start += lower_bounds[i] * accumulated;
accumulated *= in_shape[i];
}
auto output_tensor = &slice->get_output_tensor();
if (m_tensor_roles.find(output_tensor->get_name()) != m_tensor_roles.end() &&
m_tensor_roles[output_tensor->get_name()] == CPUTensorRole::INPUT)
{
//already processed in propagate_in_place_slice
NGRAPH_DEBUG << "cpu_external_function: " << slice->get_name()
<< " already processed in propagate_in_place_slice.";
continue;
}
if (m_tensor_roles.find(input_tensor->get_name()) != m_tensor_roles.end() &&
m_tensor_roles[input_tensor->get_name()] == CPUTensorRole::INPUT)
{
NGRAPH_DEBUG << "cpu_external_function: function input pointer passed to "
"slice, do not change offset.";
"slice.";
auto name = input_tensor->get_name();
if (tensor_alias.count(name))
{
name = tensor_alias[name];
}
auto input_index = function_input_name_index[name];
auto input_offset = slice->get_element_type().size() * start;
intermediate_input_index_offset.emplace_back(
tensor_data[output_tensor->get_name()], input_index, input_offset);
// for codegen
m_variable_input_index_offset_map[output_tensor->get_name()] =
std::pair<size_t, size_t>(input_index, input_offset);
m_tensor_roles[output_tensor->get_name()] = CPUTensorRole::INPUT;
for (size_t i = 0; i < slice->get_output_size(); ++i)
{
NGRAPH_DEBUG << "cpu_external_function: call propagate_in_place_slice "
"for output "
<< i << " of " << slice->get_name() << std::endl;
propagate_in_place_slice(
&slice->get_outputs().at(i), input_index, input_offset);
}
}
else
{
offset += node->get_element_type().size() * start;
auto old_offset = output_tensor->get_pool_offset();
output_tensor->set_pool_offset(offset);
NGRAPH_DEBUG
<< "cpu_external_function: slice, change offset, old offset is "
<< old_offset << ", new offset is " << offset;
}
}
}
}
}
}
void runtime::cpu::CPU_ExternalFunction::propagate_in_place_slice(
ngraph::descriptor::Output* output, size_t input_index, size_t input_offset)
{
std::deque<std::pair<ngraph::descriptor::Output*, size_t>> stack;
stack.push_front(std::pair<ngraph::descriptor::Output*, size_t>(output, input_offset));
while (stack.size() > 0)
{
ngraph::descriptor::Output* it = stack.front().first;
auto offset = stack.front().second;
stack.pop_front();
for (auto input : it->get_inputs())
{
auto input_node = input->get_node();
if (!input_node->is_op() || input_node->is_output() ||
std::dynamic_pointer_cast<ngraph::op::Concat>(input_node))
{
continue;
}
auto offset = input_tensor->get_pool_offset();
auto c_op = std::static_pointer_cast<ngraph::op::Op>(input_node);
if (auto op_annotations = c_op->get_op_annotations())
{
for (auto oi_pair : op_annotations->get_in_place_oi_pairs())
{
if (oi_pair.input == input->get_index() && !oi_pair.destructive)
{
size_t output_index = oi_pair.output;
auto& output_tensor = c_op->get_outputs().at(output_index).get_tensor();
auto temp_offset = offset;
if (auto slice = std::dynamic_pointer_cast<ngraph::op::Slice>(input_node))
{
auto lower_bounds = slice->get_lower_bounds();
auto start = 0, accumulated = 1;
auto in_shape = slice->get_input_shape(0);
......@@ -1411,13 +1515,19 @@ void runtime::cpu::CPU_ExternalFunction::process_in_place_slice(
start += lower_bounds[i] * accumulated;
accumulated *= in_shape[i];
}
offset += node->get_element_type().size() * start;
auto output_tensor = &slice->get_output_tensor();
auto old_offset = output_tensor->get_pool_offset();
temp_offset += slice->get_element_type().size() * start;
}
intermediate_input_index_offset.emplace_back(
tensor_data[output_tensor.get_name()], input_index, temp_offset);
stack.push_back(std::pair<ngraph::descriptor::Output*, size_t>(
&c_op->get_outputs().at(output_index), temp_offset));
output_tensor->set_pool_offset(offset);
NGRAPH_DEBUG << "cpu_external_function: slice, change offset, old offset is "
<< old_offset << ", new offset is " << offset;
// for codegen
m_variable_input_index_offset_map[output_tensor.get_name()] =
std::pair<size_t, size_t>(input_index, temp_offset);
m_tensor_roles[output_tensor.get_name()] = CPUTensorRole::INPUT;
}
}
}
}
......@@ -1511,6 +1621,7 @@ void runtime::cpu::CPU_ExternalFunction::build()
for (size_t i = 0; i < param->get_output_size(); ++i)
{
shared_ptr<descriptor::Tensor> tv = param->get_output_tensor_ptr(i);
function_input_name_index[tv->get_name()] = arg_index;
function_input_index.emplace_back(
tensor_data[tv->get_name()], arg_index, tensor_stale[tv->get_name()]);
m_tensor_roles[tv->get_name()] = CPUTensorRole::INPUT;
......@@ -1733,6 +1844,11 @@ void runtime::cpu::CPU_ExternalFunction::build()
}
}
for (auto& p : intermediate_input_index_offset)
{
get<0>(p).get() = static_cast<uint8_t*>(inputs[get<1>(p)]) + get<2>(p);
}
for (const auto& p : function_input_index)
{
get<0>(p).get() = inputs[get<1>(p)];
......
......@@ -210,6 +210,11 @@ namespace ngraph
// Find in-place slice ops and set appropriate memory pool offset for its output
void process_in_place_slice(std::list<std::shared_ptr<Node>> nodes);
// propagate slice when its arg comes from function input
void propagate_in_place_slice(ngraph::descriptor::Output* output,
size_t input_index,
size_t input_offset);
bool computes_result(Node* node);
void release_function() { m_function = nullptr; }
#if !defined(NGRAPH_DEX_ONLY)
......@@ -255,6 +260,8 @@ namespace ngraph
bool m_direct_execution;
EntryPoint m_compiled_function;
std::unordered_map<std::string, std::string> m_variable_name_map;
std::unordered_map<std::string, std::pair<std::size_t, std::size_t>>
m_variable_input_index_offset_map;
std::unordered_map<std::string, CPUTensorRole> m_tensor_roles;
......@@ -277,7 +284,10 @@ namespace ngraph
std::unordered_map<std::string, void*> tensor_data;
std::unordered_map<std::string, bool> tensor_stale;
std::unordered_map<std::string, std::string> tensor_alias;
std::unordered_map<std::string, size_t> function_input_name_index;
std::list<std::pair<std::reference_wrapper<void*>, size_t>> intermediates_offsets;
std::list<std::tuple<std::reference_wrapper<void*>, size_t, size_t>>
intermediate_input_index_offset;
std::list<
std::tuple<std::reference_wrapper<void*>, size_t, std::reference_wrapper<bool>>>
function_input_index;
......
......@@ -235,11 +235,30 @@ bool runtime::cpu::pass::CPUMemoryOptimization::run_on_function(std::shared_ptr<
auto upper_bounds = slice->get_upper_bounds();
auto arg = slice->get_argument(0);
if (std::dynamic_pointer_cast<op::Constant>(arg) ||
std::dynamic_pointer_cast<op::Parameter>(arg))
if (arg->is_constant())
{
NGRAPH_DEBUG << "cpu_memory_optimization: " << arg->get_name()
<< ": constant or parameter, no in place slice";
<< ": constant, no in place slice";
continue;
}
bool no_in_place_slice = false;
if (arg->is_parameter())
{
for (auto user : slice->get_users())
{
if (user->is_output())
{
NGRAPH_DEBUG << "cpu_memory_optimization: slice between function input and "
"output, no in place slice";
no_in_place_slice = true;
break;
}
}
}
if (no_in_place_slice)
{
continue;
}
......
......@@ -1528,6 +1528,96 @@ NGRAPH_TEST(${BACKEND_NAME}, slice_matrix_axis_0_overlap)
EXPECT_EQ((vector<float>{12, 16, 20, 24, 28, 32, 36, 40}), read_vector<float>(result));
}
NGRAPH_TEST(${BACKEND_NAME}, slice_matrix_axis_0_in_place)
{
    // Two slices taken directly from a function input, then added:
    // exercises the in-place-slice path where the slice argument is a Parameter.
    Shape input_shape{4, 4};
    Shape output_shape{2, 4};
    auto param = make_shared<op::Parameter>(element::f32, input_shape);
    auto top_half = make_shared<op::Slice>(param, Coordinate{0, 0}, Coordinate{2, 4});
    auto bottom_half = make_shared<op::Slice>(param, Coordinate{2, 0}, Coordinate{4, 4});
    auto sum = make_shared<op::Add>(top_half, bottom_half);
    auto f = make_shared<Function>(sum, ParameterVector{param});

    auto backend = runtime::Backend::create("${BACKEND_NAME}");

    // Create some tensors for input/output
    auto a = backend->create_tensor(element::f32, input_shape);
    copy_data(a, vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
    auto result = backend->create_tensor(element::f32, output_shape);

    backend->call_with_validate(backend->compile(f), {result}, {a});
    // Row-wise: rows 0..1 plus rows 2..3 of the 4x4 input.
    EXPECT_EQ((vector<float>{10, 12, 14, 16, 18, 20, 22, 24}), read_vector<float>(result));
}
NGRAPH_TEST(${BACKEND_NAME}, slice_matrix_axis_0_in_place_twice)
{
    // A slice of a slice of a function input: exercises a chain of
    // in-place slices starting from a Parameter.
    Shape input_shape{4, 4};
    Shape output_shape{1, 4};
    auto param = make_shared<op::Parameter>(element::f32, input_shape);
    auto first_slice = make_shared<op::Slice>(param, Coordinate{0, 0}, Coordinate{2, 4});
    auto nested_slice = make_shared<op::Slice>(first_slice, Coordinate{1, 0}, Coordinate{2, 4});
    auto direct_slice = make_shared<op::Slice>(param, Coordinate{2, 0}, Coordinate{3, 4});
    auto sum = make_shared<op::Add>(nested_slice, direct_slice);
    auto f = make_shared<Function>(sum, ParameterVector{param});

    auto backend = runtime::Backend::create("${BACKEND_NAME}");

    // Create some tensors for input/output
    auto a = backend->create_tensor(element::f32, input_shape);
    copy_data(a, vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
    auto result = backend->create_tensor(element::f32, output_shape);

    backend->call_with_validate(backend->compile(f), {result}, {a});
    // Row 1 of the input plus row 2 of the input.
    EXPECT_EQ((vector<float>{14, 16, 18, 20}), read_vector<float>(result));
}
NGRAPH_TEST(${BACKEND_NAME}, slice_matrix_axis_0_in_place_twice_overlap)
{
    // Two overlapping slices taken from the same intermediate slice of a
    // function input: the overlapping reads must both see the original data.
    Shape input_shape{5, 4};
    Shape output_shape{2, 4};
    auto param = make_shared<op::Parameter>(element::f32, input_shape);
    auto base_slice = make_shared<op::Slice>(param, Coordinate{1, 0}, Coordinate{5, 4});
    auto lhs = make_shared<op::Slice>(base_slice, Coordinate{1, 0}, Coordinate{3, 4});
    auto rhs = make_shared<op::Slice>(base_slice, Coordinate{2, 0}, Coordinate{4, 4});
    auto sum = make_shared<op::Add>(lhs, rhs);
    auto f = make_shared<Function>(sum, ParameterVector{param});

    auto backend = runtime::Backend::create("${BACKEND_NAME}");

    // Create some tensors for input/output
    auto a = backend->create_tensor(element::f32, input_shape);
    copy_data(a,
              vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20});
    auto result = backend->create_tensor(element::f32, output_shape);

    backend->call_with_validate(backend->compile(f), {result}, {a});
    // Rows 2..3 of the input plus rows 3..4 of the input.
    EXPECT_EQ((vector<float>{22, 24, 26, 28, 30, 32, 34, 36}), read_vector<float>(result));
}
NGRAPH_TEST(${BACKEND_NAME}, slice_matrix_axis_0_in_place_with_reshape)
{
    // Alternating Slice and transposing Reshape ops on a function input:
    // the Reshape breaks the in-place chain, so only the slices around it
    // may share the parameter's storage.
    Shape input_shape{4, 5};
    Shape output_shape{2, 4};
    auto param = make_shared<op::Parameter>(element::f32, input_shape);
    auto sliced = make_shared<op::Slice>(param, Coordinate{1, 0}, Coordinate{4, 5});
    auto transposed = make_shared<op::Reshape>(sliced, AxisVector{1, 0}, Shape{5, 3});
    auto sliced_again = make_shared<op::Slice>(transposed, Coordinate{1, 0}, Coordinate{5, 3});
    auto transposed_back = make_shared<op::Reshape>(sliced_again, AxisVector{1, 0}, Shape{3, 4});
    auto final_slice =
        make_shared<op::Slice>(transposed_back, Coordinate{1, 0}, Coordinate{3, 4});
    auto f = make_shared<Function>(final_slice, ParameterVector{param});

    auto backend = runtime::Backend::create("${BACKEND_NAME}");

    // Create some tensors for input/output
    auto a = backend->create_tensor(element::f32, input_shape);
    copy_data(a,
              vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20});
    auto result = backend->create_tensor(element::f32, output_shape);

    backend->call_with_validate(backend->compile(f), {result}, {a});
    EXPECT_EQ((vector<float>{12, 13, 14, 15, 17, 18, 19, 20}), read_vector<float>(result));
}
NGRAPH_TEST(${BACKEND_NAME}, slice_matrix_strided)
{
Shape shape_a{4, 4};
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment