collapse dims (#1423)

* Collapse dims when possible to facilitate faster kernels
* Minor debug change
* Define and track tensor roles through graph optimizations

collapse dims (#1423)
* Collapse dims when possible to facilitate faster kernels
* Minor debug change
* Define and track tensor roles through graph optimizations
e63ffa29 · Jayaram Bobba · Robert Kimball · 04e56c64 · e63ffa29 · e63ffa29
Commit e63ffa29 authored Aug 20, 2018 by Jayaram Bobba Committed by Robert Kimball Aug 20, 2018
4 changed files
--- a/src/ngraph/runtime/cpu/CMakeLists.txt
+++ b/src/ngraph/runtime/cpu/CMakeLists.txt
@@ -80,13 +80,14 @@ set(SRC
    op/rnn.cpp
    op/sigmoid_mul.cpp
    pass/cpu_assignment.cpp
+    pass/cpu_collapse_dims.cpp
    pass/cpu_concat_inputs.cpp
    pass/cpu_fusion.cpp
    pass/cpu_layout.cpp
+    pass/cpu_loop_kernel_fusion.cpp
+    pass/cpu_mat_fusion.cpp
    pass/cpu_post_layout_optimizations.cpp
    pass/cpu_rnn_fusion.cpp
-    pass/cpu_mat_fusion.cpp
-    pass/cpu_loop_kernel_fusion.cpp
    pass/cpu_workspace_insertion.cpp
 )


--- a/src/ngraph/runtime/cpu/cpu_external_function.cpp
+++ b/src/ngraph/runtime/cpu/cpu_external_function.cpp
@@ -149,6 +149,7 @@
 #include "ngraph/runtime/cpu/op/sigmoid.hpp"
 #include "ngraph/runtime/cpu/op/sigmoid_mul.hpp"
 #include "ngraph/runtime/cpu/pass/cpu_assignment.hpp"
+#include "ngraph/runtime/cpu/pass/cpu_collapse_dims.hpp"
 #include "ngraph/runtime/cpu/pass/cpu_concat_inputs.hpp"
 #include "ngraph/runtime/cpu/pass/cpu_fusion.hpp"
 #include "ngraph/runtime/cpu/pass/cpu_layout.hpp"
@@ -378,6 +379,7 @@ void runtime::cpu::CPU_ExternalFunction::compile()
    pass_manager.register_pass<ngraph::pass::CommonSubexpressionElimination>();
    pass_manager.register_pass<ngraph::pass::CoreFusion>();
    pass_manager.register_pass<runtime::cpu::pass::CPUFusion>();
+    pass_manager.register_pass<runtime::cpu::pass::CPUCollapseDims>();
    pass_manager.register_pass<runtime::cpu::pass::CPUWorkspaceInsertion>(nv_cwi);
    pass_manager.register_pass<runtime::cpu::pass::CPUAssignment>(this);
    pass_manager.register_pass<runtime::cpu::pass::CPULayout>(this);
@@ -1128,6 +1130,7 @@ void runtime::cpu::CPU_ExternalFunction::build()
    pass_manager.register_pass<ngraph::pass::CommonSubexpressionElimination>();
    pass_manager.register_pass<ngraph::pass::CoreFusion>();
    pass_manager.register_pass<runtime::cpu::pass::CPUFusion>();
+    pass_manager.register_pass<runtime::cpu::pass::CPUCollapseDims>();
    pass_manager.register_pass<runtime::cpu::pass::CPUWorkspaceInsertion>(nv_cwi);
    pass_manager.register_pass<runtime::cpu::pass::CPUAssignment>(this);
    pass_manager.register_pass<runtime::cpu::pass::CPULayout>(this);

--- a/src/ngraph/runtime/cpu/pass/cpu_collapse_dims.cpp
+++ b/src/ngraph/runtime/cpu/pass/cpu_collapse_dims.cpp
+/*******************************************************************************
+* Copyright 2017-2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "cpu_collapse_dims.hpp"
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <unordered_set>
+#include "ngraph/graph_util.hpp"
+#include "ngraph/log.hpp"
+#include "ngraph/op/broadcast.hpp"
+#include "ngraph/op/reshape.hpp"
+
+using namespace ngraph;
+
+// Appends the identity axis order 0, 1, ..., rank - 1 to `order`.
+// Anything already stored in `order` is kept; the new indices follow it.
+static void get_default_order(std::vector<size_t>& order, size_t rank)
+{
+    const size_t first_new = order.size();
+    order.resize(first_new + rank);
+    std::iota(order.begin() + first_new, order.end(), size_t{0});
+}
+
+// Result of collapse_dims(): the collapsed view of an op's output shape.
+struct CollapsedDims
+{
+    // Sizes of the collapsed output dimensions, outermost first.
+    std::vector<size_t> output_shape;
+    // Parallel to output_shape: true when that collapsed dimension came from operated axes.
+    std::vector<bool> is_operated_axis;
+    // Indices into output_shape of the collapsed dimensions that are operated on.
+    std::vector<size_t> axis_set;
+    // Sizes of the collapsed dimensions that are not operated on, in output order.
+    std::vector<size_t> input_shape;
+};
+
+// Fold and collapse axes of output_shape.
+// Contiguous axes that are not being operated on can be collapsed.
+// Contiguous axes that are being operated on are collapsed only when
+// collapse_operated_axes is true; otherwise each operated axis stays on its own.
+// Runs whose total size is 1 are skipped (they contribute no collapsed dimension).
+//
+// output_shape           - full output shape of the op (read only).
+// operated_axes          - indices into output_shape that the op operates on (read only).
+// cdims                  - receives the result; results are appended and then the
+//                          whole of output_shape/is_operated_axis is reversed, so
+//                          callers should pass a freshly constructed object.
+// collapse_operated_axes - whether adjacent operated axes may be merged.
+//
+// Example: output_shape {2, 3, 4} with operated_axes {0} yields
+// cdims.output_shape {2, 12}, cdims.axis_set {0}, cdims.input_shape {12}.
+static void collapse_dims(const std::vector<size_t>& output_shape,
+                          const std::set<size_t>& operated_axes,
+                          CollapsedDims& cdims,
+                          bool collapse_operated_axes)
+{
+    size_t run_size = 1;
+    bool run_is_operated = false;
+    bool run_started = false;
+
+    // Emit the run accumulated so far, unless it only covers size-1 axes.
+    auto flush_run = [&]() {
+        if (run_size != 1)
+        {
+            cdims.output_shape.push_back(run_size);
+            cdims.is_operated_axis.push_back(run_is_operated);
+            run_size = 1;
+        }
+    };
+
+    // Walk the axes from innermost to outermost; results are reversed afterwards.
+    for (size_t axis = output_shape.size(); axis-- > 0;)
+    {
+        const bool is_operated = operated_axes.count(axis) != 0;
+        const bool run_ends =
+            (run_is_operated != is_operated) || (is_operated && !collapse_operated_axes);
+        if (run_started && run_ends)
+        {
+            flush_run();
+        }
+
+        run_size *= output_shape[axis];
+        run_is_operated = is_operated;
+        run_started = true;
+    }
+    // Last run
+    flush_run();
+
+    std::reverse(cdims.output_shape.begin(), cdims.output_shape.end());
+    std::reverse(cdims.is_operated_axis.begin(), cdims.is_operated_axis.end());
+
+    // Split the collapsed dimensions into operated axis indices and the
+    // remaining (non-operated) dimension sizes.
+    for (size_t i = 0; i < cdims.is_operated_axis.size(); i++)
+    {
+        if (cdims.is_operated_axis[i])
+        {
+            cdims.axis_set.push_back(i);
+        }
+        else
+        {
+            cdims.input_shape.push_back(cdims.output_shape[i]);
+        }
+    }
+}
+
+// Rewrites op::Broadcast nodes of `f` so that they work on collapsed shapes.
+//  - If collapsing leaves no broadcast axis (every broadcast axis has size 1, or
+//    there is none), the broadcast is replaced by a single Reshape of its argument
+//    to the broadcast's original output shape.
+//  - Otherwise, if collapsing reduced the rank, the broadcast is replaced by
+//    Reshape(arg -> collapsed input) -> Broadcast(collapsed) -> Reshape(-> original output).
+// Returns true when at least one node was replaced.
+bool runtime::cpu::pass::CPUCollapseDims::run_on_function(std::shared_ptr<ngraph::Function> f)
+{
+    bool replaced = false;
+    for (auto n : f->get_ordered_ops())
+    {
+        auto node = std::dynamic_pointer_cast<op::Broadcast>(n);
+        if (!node)
+        {
+            continue;
+        }
+
+        auto input_shape = node->get_argument(0)->get_shape();
+        auto output_shape = node->get_shape();
+        auto operated_axes = node->get_broadcast_axes();
+
+        CollapsedDims cdims;
+        collapse_dims(output_shape, operated_axes, cdims, true);
+
+        // Tracks this node only, so the debug message below is not emitted for
+        // broadcasts that were left untouched after an earlier replacement.
+        bool node_replaced = false;
+
+        if (cdims.axis_set.size() == 0)
+        {
+            // Null broadcast operation, replace with reshape.
+            // The reshape must produce the original output shape (not the collapsed
+            // one) so that users of the replaced node keep seeing the same shape.
+            AxisVector axis_order;
+            get_default_order(axis_order, input_shape.size());
+            auto reshape =
+                std::make_shared<op::Reshape>(node->get_argument(0), axis_order, output_shape);
+            ngraph::replace_node(n, reshape);
+            node_replaced = true;
+        }
+        else if (output_shape.size() != cdims.output_shape.size())
+        {
+            // Reshape arg to collapsed input_shape
+            AxisVector input_axis_order;
+            get_default_order(input_axis_order, input_shape.size());
+            auto reshape_input = std::make_shared<op::Reshape>(
+                node->get_argument(0), input_axis_order, Shape(cdims.input_shape));
+
+            auto broadcast = std::make_shared<op::Broadcast>(
+                reshape_input, Shape(cdims.output_shape), AxisSet(cdims.axis_set));
+
+            // Reshape collapsed output to original output_shape
+            AxisVector output_axis_order;
+            get_default_order(output_axis_order, cdims.output_shape.size());
+            auto reshape_output =
+                std::make_shared<op::Reshape>(broadcast, output_axis_order, output_shape);
+            ngraph::replace_node(n, reshape_output);
+            node_replaced = true;
+        }
+
+        if (node_replaced)
+        {
+            replaced = true;
+            NGRAPH_DEBUG << "CollapseDims: Replaced broadcast " << input_shape << " "
+                         << operated_axes << " " << output_shape << " with "
+                         << Shape(cdims.input_shape) << " " << AxisSet(cdims.axis_set) << " "
+                         << Shape(cdims.output_shape);
+        }
+    }
+
+    return replaced;
+}
--- a/src/ngraph/runtime/cpu/pass/cpu_shuffle_folding.hpp
+++ b/src/ngraph/runtime/cpu/pass/cpu_collapse_dims.hpp
@@ -26,7 +26,7 @@ namespace ngraph
        {
            namespace pass
            {
-                class CPUShuffleFolding : public ngraph::pass::FunctionPass
+                class CPUCollapseDims : public ngraph::pass::FunctionPass
                {
                public:
                    bool run_on_function(std::shared_ptr<ngraph::Function> function) override;