Commit e765956a authored by Jayaram Bobba's avatar Jayaram Bobba Committed by Scott Cyphers

IAT: Collapse dimensions around arithmetic reduction operations (#1763)

* Collapse dimensions for arithmetic reduction ops to support faster kernels

* Propagate in-place constants and allow in-place reshapes for more cases

* style fix

* Additional checks for parameter and constant to help backends that don't propagate in-place parameter and constant inputs

* Allow non-destructive pass-through only if memory sharing is disabled

* Address PR feedback

* Bug fix for collapse dimensions in case of null reduction
parent 1beec46b
......@@ -51,8 +51,10 @@ bool pass::MemoryLayout::run_on_function(shared_ptr<ngraph::Function> function)
auto input = &node->get_inputs().at(oi_pair.input).get_tensor();
auto input_node = node->get_inputs().at(oi_pair.input).get_output().get_node();
// an input tensor can be reused if this is the last use
if (node->liveness_free_list.count(input) != 0 &&
// For destructive kernel, this should be the last use
// Non-destructive kernels can pass through if memory sharing is disabled
if ((node->liveness_free_list.count(input) != 0 ||
(m_disable_memory_sharing && !oi_pair.destructive)) &&
node->liveness_new_list.count(output) != 0)
{
in_place_outputs.insert({output, input});
......
......@@ -551,7 +551,7 @@ using namespace ngraph::runtime;
{
for (shared_ptr<Node> node : function_ordered_ops.at(current_function))
{
const ngraph::op::Constant* c = dynamic_cast<ngraph::op::Constant*>(node.get());
ngraph::op::Constant* c = dynamic_cast<ngraph::op::Constant*>(node.get());
if (c)
{
m_active_constants.push_back(node);
......@@ -677,6 +677,15 @@ using namespace ngraph::runtime;
"(*(ctx->G), [&](const tbb::flow::continue_msg &msg)\n{});\n";
}
for (shared_ptr<Node> node : ordered_ops)
{
if (dynamic_cast<ngraph::op::Constant*>(node.get()))
{
shared_ptr<descriptor::Tensor> tv = node->get_outputs()[0].get_tensor_ptr();
propagate_in_place_constant(&node->get_outputs().at(0), tv->get_name(), false);
}
}
// Add inputs to the variable name map
size_t arg_index = 0;
for (shared_ptr<ngraph::op::Parameter> param : current_function->get_parameters())
......@@ -1102,6 +1111,53 @@ void runtime::cpu::CPU_ExternalFunction::propagate_in_place_input(
}
}
// Forward a constant's buffer through chains of non-destructive in-place ops.
// Starting from `output` (the constant's output), walk every downstream op that
// annotates a non-destructive in-place input->output pair and alias that op's
// output tensor to the constant's buffer, so backends can read the constant
// directly instead of copying it.
//
// output     - the descriptor::Output of the constant node to propagate from
// input_name - name of the constant's tensor/buffer being forwarded
// dex        - true: record the alias in tensor_alias (direct execution);
//              false: record it in m_variable_name_map (codegen)
void runtime::cpu::CPU_ExternalFunction::propagate_in_place_constant(
    ngraph::descriptor::Output* output, std::string input_name, bool dex)
{
    // Worklist of outputs whose users still need to be examined.
    std::deque<ngraph::descriptor::Output*> stack;
    stack.push_front(output);

    while (!stack.empty())
    {
        ngraph::descriptor::Output* it = stack.front();
        stack.pop_front();
        for (auto input : it->get_inputs())
        {
            auto c_op = std::dynamic_pointer_cast<ngraph::op::Op>(input->get_node());
            // Stop at non-Op nodes and at function outputs; results need their
            // own buffers.
            if (!c_op || c_op->is_output())
            {
                continue;
            }
            if (auto op_annotations = c_op->get_op_annotations())
            {
                // const& avoids copying each oi_pair per iteration.
                for (const auto& oi_pair : op_annotations->get_in_place_oi_pairs())
                {
                    // Only follow non-destructive pairs: a destructive kernel
                    // would overwrite the shared constant buffer.
                    if (oi_pair.input == input->get_index() && !oi_pair.destructive)
                    {
                        size_t output_index = oi_pair.output;
                        auto& output_tensor = c_op->get_outputs().at(output_index).get_tensor();
                        if (dex)
                        {
                            tensor_alias[output_tensor.get_name()] = input_name;
                        }
                        else
                        {
                            m_variable_name_map[output_tensor.get_name()] = input_name;
                        }
                        m_tensor_roles[output_tensor.get_name()] = CPUTensorRole::CONSTANT;

                        NGRAPH_DEBUG << " CPU: Forwarding " << input_name << " through "
                                     << output_tensor.get_name();

                        // Continue propagation through this op's aliased output.
                        stack.push_back(&c_op->get_outputs().at(output_index));
                    }
                }
            }
        }
    }
}
void runtime::cpu::CPU_ExternalFunction::propagate_in_place_output(
ngraph::descriptor::Output* res_src_output, std::string output_name, bool dex)
{
......@@ -1239,6 +1295,7 @@ void runtime::cpu::CPU_ExternalFunction::build()
tensor_data[tv->get_name()] =
const_cast<void*>(static_pointer_cast<ngraph::op::Constant>(node)->get_data_ptr());
m_tensor_roles[tv->get_name()] = CPUTensorRole::CONSTANT;
propagate_in_place_constant(&node->get_outputs().at(0), tv->get_name(), true);
}
}
......
......@@ -174,6 +174,11 @@ namespace ngraph
// Register passes that are common to codegen and DEX
void register_common_passes(ngraph::pass::Manager& pass_manager);
// For non-destructive passthrough kernels, propagate function
// constant buffers to internal ops
void propagate_in_place_constant(ngraph::descriptor::Output* output,
std::string input_name,
bool dex);
// For non-destructive passthrough kernels, propagate function
// input buffers to internal ops
void propagate_in_place_input(ngraph::descriptor::Output* output,
......
......@@ -478,3 +478,37 @@ TEST(cpu_test, reshape_layout_optimizations7)
}
EXPECT_EQ(count_ops_of_type<runtime::cpu::op::ConvertLayout>(cpu_f), 0);
}
TEST(cpu_test, collapse_dims1)
{
    // Reduce away multiple dimensions and check both numerical agreement with
    // the INTERPRETER backend and that no extra conversions appear downstream.
    auto make_function = []() -> std::shared_ptr<Function> {
        auto A = make_shared<op::Parameter>(element::f32, Shape{1, 4, 10, 6, 10});
        auto sum1 = make_shared<op::Sum>(A, AxisVector{1});    // Shape{1, 10, 6, 10}
        auto sum2 = make_shared<op::Sum>(sum1, AxisVector{0}); // Shape{10, 6, 10}
        return make_shared<Function>(NodeVector{sum2}, op::ParameterVector{A});
    };

    auto backend = runtime::Backend::create("CPU");
    auto cpu_f = make_function();
    auto int_f = make_function();

    // One randomly-filled input vector per function parameter.
    test::Uniform<float> rng(-100.0f, 100.0f);
    vector<vector<float>> args;
    for (const auto& param : cpu_f->get_parameters())
    {
        vector<float> tensor_val(shape_size(param->get_shape()));
        rng.initialize(tensor_val);
        args.push_back(tensor_val);
    }

    auto int_results = execute(int_f, args, "INTERPRETER");
    auto cpu_results = execute(cpu_f, args, "CPU");
    size_t idx = 0;
    for (const auto& cpu_result : cpu_results)
    {
        EXPECT_TRUE(test::all_close(cpu_result, int_results.at(idx)));
        ++idx;
    }

    // sum1 will have two reshapes added around it. sum2 will be replaced
    // with a reshape
    EXPECT_EQ(count_ops_of_type<op::Reshape>(cpu_f), 3);
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment