Commit 2d0871f5 authored by amy.zhuang

Refactor code and add comments.

parent b6197967
@@ -66,6 +66,9 @@ namespace ngraph
                 for (size_t i = 0; i < nargs; i++)
                 {
                     auto arg_size = shape_size(arg_shapes[i]) * element_size;
+                    // If the argument pointer does not fall within the concat output buffer
+                    // (caused by propagate_in_place_output or propagate_in_place_input),
+                    // we need to copy the data; otherwise, we can skip the copy.
                     if (arg_tensors[i] < out_tensor ||
                         arg_tensors[i] >=
                             reinterpret_cast<char*>(out_tensor) + out_size)
...
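The new comment in this hunk documents a plain address-range test. The standalone sketch below restates that logic outside the codegen context; the helper name and signature are illustrative, not part of the commit.

```cpp
#include <cstddef>
#include <cstring>

// Copy an argument into the concat output only when its buffer lies outside
// the output range [out_tensor, out_tensor + out_size). In-place arguments
// already alias their slice of the output, so copying would be redundant.
static void copy_arg_if_needed(
    char* out_tensor, size_t out_size, char* arg_tensor, size_t arg_size, size_t offset)
{
    if (arg_tensor < out_tensor || arg_tensor >= out_tensor + out_size)
    {
        std::memcpy(out_tensor + offset, arg_tensor, arg_size);
    }
    // Otherwise arg_tensor already points at out_tensor + offset: skip the copy.
}
```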
@@ -620,41 +620,8 @@ using namespace ngraph::runtime;
             }
         }
-        // concat
-        for (shared_ptr<Node> node : ordered_ops)
-        {
-            if (auto concat = std::dynamic_pointer_cast<ngraph::op::Concat>(node))
-            {
-                if (auto op_annotations = concat->get_op_annotations())
-                {
-                    auto in_place_oi_pairs = op_annotations->get_in_place_oi_pairs();
-                    if (in_place_oi_pairs.size() > 0)
-                    {
-                        bool found_last_concat = true;
-                        for (auto user : concat->get_users())
-                        {
-                            if (dynamic_pointer_cast<ngraph::op::Concat>(user))
-                            {
-                                found_last_concat = false;
-                                break;
-                            }
-                        }
-                        if (found_last_concat)
-                        {
-                            for (auto arg : concat->get_arguments())
-                            {
-                                if (auto arg_concat = dynamic_pointer_cast<ngraph::op::Concat>(arg))
-                                {
-                                    NGRAPH_DEBUG << "call propagate_in_place_concat for "
-                                                 << arg->get_name() << std::endl;
-                                    propagate_in_place_concat(arg_concat);
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
+        // In place concatenation optimization
+        process_in_place_concat(ordered_ops);
         writer << "bool " << current_function->get_name() << "_t_en[" << tensor_index << "];\n";
@@ -1177,6 +1144,56 @@ void runtime::cpu::CPU_ExternalFunction::propagate_in_place_output(
     } while (propagate_further);
 }
+void runtime::cpu::CPU_ExternalFunction::process_in_place_concat(
+    std::list<std::shared_ptr<Node>> nodes)
+{
+    for (shared_ptr<Node> node : nodes)
+    {
+        if (auto concat = std::dynamic_pointer_cast<ngraph::op::Concat>(node))
+        {
+            if (auto op_annotations = concat->get_op_annotations())
+            {
+                auto in_place_oi_pairs = op_annotations->get_in_place_oi_pairs();
+                if (in_place_oi_pairs.size() > 0)
+                {
+                    bool found_last_concat = true;
+                    for (auto user : concat->get_users())
+                    {
+                        if (dynamic_pointer_cast<ngraph::op::Concat>(user))
+                        {
+                            found_last_concat = false;
+                            break;
+                        }
+                    }
+                    if (found_last_concat)
+                    {
+                        auto output_tensor = &concat->get_output_tensor();
+                        auto offset = output_tensor->get_pool_offset();
+                        for (auto arg : concat->get_arguments())
+                        {
+                            auto input_node = std::dynamic_pointer_cast<ngraph::op::Op>(arg);
+                            auto input_tensor = &input_node->get_output_tensor();
+                            auto old_offset = input_tensor->get_pool_offset();
+                            input_tensor->set_pool_offset(offset);
+                            NGRAPH_DEBUG
+                                << "cpu_external_function: change offset, old offset is "
+                                << old_offset << ", new offset is " << offset << std::endl;
+                            offset += input_tensor->size();
+                            if (auto arg_concat = dynamic_pointer_cast<ngraph::op::Concat>(arg))
+                            {
+                                NGRAPH_DEBUG
+                                    << "cpu_external_function: call propagate_in_place_concat for "
+                                    << arg->get_name() << std::endl;
+                                propagate_in_place_concat(arg_concat);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
 void runtime::cpu::CPU_ExternalFunction::propagate_in_place_concat(
     shared_ptr<ngraph::op::Concat> concat)
 {
@@ -1268,54 +1285,8 @@ void runtime::cpu::CPU_ExternalFunction::build()
     // Build executor
-    // concat
-    for (shared_ptr<Node> node : m_function->get_ordered_ops())
-    {
-        if (auto concat = std::dynamic_pointer_cast<ngraph::op::Concat>(node))
-        {
-            if (auto op_annotations = concat->get_op_annotations())
-            {
-                auto in_place_oi_pairs = op_annotations->get_in_place_oi_pairs();
-                if (in_place_oi_pairs.size() > 0)
-                {
-                    bool found_last_concat = true;
-                    auto output_tensor = &concat->get_output_tensor();
-                    auto offset = output_tensor->get_pool_offset();
-                    for (auto arg : concat->get_arguments())
-                    {
-                        auto input_node = std::dynamic_pointer_cast<ngraph::op::Op>(arg);
-                        auto input_tensor = &input_node->get_output_tensor();
-                        auto old_offset = input_tensor->get_pool_offset();
-                        input_tensor->set_pool_offset(offset);
-                        NGRAPH_DEBUG << "cpu_external_function: change offset, old offset is "
-                                     << old_offset << ", new offset is " << offset << std::endl;
-                        offset += input_tensor->size();
-                    }
-                    for (auto user : concat->get_users())
-                    {
-                        if (dynamic_pointer_cast<ngraph::op::Concat>(user))
-                        {
-                            found_last_concat = false;
-                            break;
-                        }
-                    }
-                    if (found_last_concat)
-                    {
-                        for (auto arg : concat->get_arguments())
-                        {
-                            if (auto arg_concat = dynamic_pointer_cast<ngraph::op::Concat>(arg))
-                            {
-                                NGRAPH_DEBUG
-                                    << "cpu_external_function: call propagate_in_place_concat for "
-                                    << arg->get_name() << std::endl;
-                                propagate_in_place_concat(arg_concat);
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
+    // In place concatenation optimization
+    process_in_place_concat(m_function->get_ordered_ops());
     // Intermediates
     if (m_function->get_temporary_pool_size())
...
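To make the offset bookkeeping in process_in_place_concat concrete: the first argument inherits the concat output's pool offset, and each subsequent argument starts where the previous one ends. Below is a minimal sketch of that arithmetic on plain structs; TensorInfo and assign_in_place_offsets are illustrative stand-ins, not the nGraph API.

```cpp
#include <cstddef>
#include <iostream>
#include <vector>

struct TensorInfo
{
    size_t pool_offset; // byte offset into the memory pool
    size_t size;        // tensor size in bytes
};

// Assign each argument a contiguous slice of the concat output's pool region,
// mirroring the loop in process_in_place_concat.
void assign_in_place_offsets(const TensorInfo& output, std::vector<TensorInfo>& args)
{
    size_t offset = output.pool_offset;
    for (TensorInfo& arg : args)
    {
        arg.pool_offset = offset; // the arg now writes directly into the concat buffer
        offset += arg.size;       // the next arg starts where this one ends
    }
}

int main()
{
    TensorInfo out{1024, 48};
    std::vector<TensorInfo> args{{0, 16}, {0, 16}, {0, 16}};
    assign_in_place_offsets(out, args);
    for (const TensorInfo& a : args)
    {
        std::cout << a.pool_offset << "\n"; // prints 1024, 1040, 1056
    }
}
```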
@@ -158,7 +158,11 @@
                 void propagate_in_place_output(ngraph::descriptor::Output* res_src_output,
                                                std::string output_name,
                                                bool dex);
-                // For a chain of concat ops, propagate pool offsets
+                // Find in-place concat ops and set the appropriate memory pool offsets for their arguments
+                void process_in_place_concat(std::list<std::shared_ptr<Node>> nodes);
+                // For a chain of concat ops, propagate memory pool offsets
                 void propagate_in_place_concat(std::shared_ptr<ngraph::op::Concat> concat);
                 bool computes_result(Node* node);
...
@@ -14,6 +14,40 @@
 // limitations under the License.
 //*****************************************************************************
+/// The in-place-concat optimization makes the argument nodes of a concatenation node use the
+/// concatenation node's memory buffer for their outputs. As a result, we eliminate the memory
+/// copies from the argument nodes' buffers to the concatenation node's buffer. When there is a
+/// chain of in-place concatenation nodes, we propagate the memory buffer starting from the last
+/// concatenation node. Not all concatenation nodes can be optimized; this pass marks those that
+/// can be.
+///
+/// Example 1:
+///     parameter1 parameter2   parameter3 parameter4   parameter5 parameter6
+///          \         /             \         /             \         /
+///             add1                    add2                    add3
+///                 \                    |                     /
+///                              concat
+///
+/// Before optimization: the result of add1 is stored to the memory buffer assigned to add1, and
+/// likewise for add2 and add3; those results are then copied to the memory buffer assigned to
+/// concat.
+/// After optimization: the result of add1 is stored directly to the memory buffer assigned to
+/// concat, and likewise for add2 and add3. There is no need to copy those results.
+///
+/// Example 2:
+///     parameter1 parameter2   parameter3 parameter4
+///          \         /             \         /
+///             add1                    add2
+///                 \                  /
+///                  concat1      parameter5
+///                     |    \    /
+///                     |    add3
+///                      \    /
+///                      concat
+///
+/// After optimization: the result of add1 is stored directly to the memory buffer assigned to
+/// concat, and likewise for add2 and add3.
+#include <cassert>
 #include "ngraph/runtime/cpu/pass/cpu_memory_optimization.hpp"
 #include "ngraph/descriptor/output.hpp"
@@ -54,6 +88,9 @@ bool runtime::cpu::pass::CPUMemoryOptimization::run_on_function(std::shared_ptr<
     auto index = 0;
     for (descriptor::Input& input : concat->get_inputs())
     {
+        // no tensors with zero-sized dimensions after zero_dim_tensor_elimination
+        assert(shape_size(input.get_shape()) != 0);
         // check if input layout is padded
         auto input_md = mkldnn_utils::get_input_mkldnn_md(n.get(), index);
         index++;
@@ -65,14 +102,6 @@ bool runtime::cpu::pass::CPUMemoryOptimization::run_on_function(std::shared_ptr<
             break;
         }
-        if (shape_size(input.get_shape()) == 0)
-        {
-            NGRAPH_DEBUG << "cpu_post_layout_assignment: 0 length tensor, no in "
-                            "place concat";
-            in_place_concat = false;
-            break;
-        }
-
         const auto& output = input.get_output();
         auto arg = output.get_node();
         if (std::dynamic_pointer_cast<op::Constant>(arg) ||
...
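The chain rule in the pass's doc comment (propagate starting from the last concatenation node) boils down to a scan of a concat's users, as in found_last_concat above. Here is a minimal sketch of that check under a simplified Node type, not the nGraph classes:

```cpp
#include <memory>
#include <string>
#include <vector>

// Simplified stand-in for an nGraph node; illustrative only.
struct Node
{
    std::string kind; // e.g. "Concat", "Add"
    std::vector<std::shared_ptr<Node>> users;
};

// A concat is the last of its chain when none of its users is itself a concat;
// offset propagation starts only from such nodes, since an enclosing concat
// would otherwise drive the propagation and overwrite the offsets.
bool is_last_concat(const Node& concat)
{
    for (const auto& user : concat.users)
    {
        if (user->kind == "Concat")
        {
            return false;
        }
    }
    return true;
}
```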