Commit f4b487a4 authored by Nagy Mostafa, committed by Scott Cyphers

[MLIR] Add sub-graph extraction support (#3101)

* Initial sub-graph extraction

* Works without detaching input edges from sub-graph

* Added removing input edges to graph

* Works with whole-function sub-graphs. Input edges to the sub-graph are still there

* Works on 2 exclusive sub-graphs. Still not on merged sub-graphs

* Revert removing inputs to sub-graph. nGraph validation crashes

* Added a 3-sub-graph test. Removed compiled_kernel fusion pass. Comments

* Revert some changes

* Added cycle detection. Moved unit tests to backend_mlir.in.cpp. Still not fully functional

* Construct CK nodes after finding outputs to preserve the graph.

* Fix topological sort. UTs pass.

* Minor fixes

* PR fixes

* Enable MLIR tests only when building with MLIR on
parent 4b009f09
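As context for the diff below: MLIRSubgraphExtractionPass is a FunctionPass, so it is typically scheduled through the nGraph pass manager. A minimal sketch, assuming the standard ngraph::pass::Manager API (the include path for the new header is an assumption for illustration):

#include <memory>
#include "ngraph/function.hpp"
#include "ngraph/pass/manager.hpp"
#include "contrib/mlir/pass/mlir_subgraph_extraction.hpp" // assumed path

void extract_mlir_subgraphs(std::shared_ptr<ngraph::Function> func)
{
    ngraph::pass::Manager pass_manager;
    // Wraps each maximal MLIR-supported sub-graph of func in a CompiledKernel op.
    pass_manager.register_pass<ngraph::pass::MLIRSubgraphExtractionPass>();
    pass_manager.run_passes(func);
}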
@@ -16,26 +16,114 @@
#pragma once
#include <mutex>
#include "ngraph/pass/pass.hpp"
namespace ngraph
{
namespace pass
{
/// This pass creates CompiledKernel ops enclosing sub-graphs that will be compiled and
/// executed by MLIR.
// TODO: WIP. Currently we only create a single CompiledKernel op for the whole function
// body.
/// This pass creates CompiledKernel ops enclosing maximal sub-graphs of ops that are supported by MLIR
class MLIRSubgraphExtractionPass : public ngraph::pass::FunctionPass
{
using NodeSet = std::unordered_set<std::shared_ptr<Node>>;
class MLIRSubgraph
{
private:
static int get_new_graph_id() { return m_curr_graph_id++; }
/// Create a sub-graph with a new ID.
MLIRSubgraph(MLIRSubgraphExtractionPass* pass)
: m_graph_id(MLIRSubgraph::get_new_graph_id())
, m_pass(*pass)
{
}
public:
/// Factory method to create a new sub-graph with a unique ID
static MLIRSubgraph create(MLIRSubgraphExtractionPass* pass)
{
// mutex on global graph ID
std::lock_guard<std::mutex> lock(pass->m_subgraph_mutex);
return MLIRSubgraph(pass);
}
/// Get sub-graph id
int get_id() const { return m_graph_id; }
/// Get all nodes in the sub-graph.
NodeSet& get_nodes() { return m_nodes; }
/// Get input nodes. Predecessors to head nodes.
NodeSet& get_inputs() { return m_input_nodes; }
/// Get output nodes. Nodes in the sub-graph with edges to external nodes.
NodeSet& get_outputs() { return m_output_nodes; }
/// Add a list of input nodes to the sub-graph.
template <typename T>
void add_inputs(T& inputs);
/// Add a list of output nodes to the sub-graph.
template <typename T>
void add_outputs(T& outputs);
/// Merges sub-graph (other) into this sub-graph. other will be destroyed.
void merge(MLIRSubgraph& other);
/// Add one node to the sub-graph.
void add_node(std::shared_ptr<Node> node);
private:
// Unique ID for this sub-graph.
int m_graph_id;
// Actual nodes of the sub-graph
NodeSet m_nodes;
// Predecessors of the head nodes in the sub-graph.
NodeSet m_input_nodes;
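// Nodes in the sub-graph with edges to external nodes.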
NodeSet m_output_nodes;
MLIRSubgraphExtractionPass& m_pass;
static int m_curr_graph_id;
};
friend class MLIRSubgraph;
public:
MLIRSubgraphExtractionPass() {}
bool run_on_function(std::shared_ptr<Function> func) override;
/// Checks if an ngraph node is supported by the MLIR backend
bool is_supported_mlir_op(std::shared_ptr<Node> node);
/// Get the sub-graph ID that a node belongs to
int get_subgraph_id(std::shared_ptr<Node> node)
{
auto it = m_node_to_graph.find(node);
return (it == m_node_to_graph.end()) ? -1 : it->second;
}
/// Get sub-graph by ID
MLIRSubgraph& get_subgraph(int id)
{
auto it = m_id_to_graph.find(id);
NGRAPH_CHECK(it != m_id_to_graph.end(), "Cannot find subgraph with ID: ", id);
return it->second;
}
/// Stores a sub-graph in the map
void add_subgraph(MLIRSubgraph& sg) { m_id_to_graph.emplace(sg.get_id(), sg); }
/// Checks if adding a node to an extracted sub-graph will cause a DAG cycle.
/// inputs: the inputs of the node we want to add that lie outside any sub-graph.
/// subgraph_ids: the sub-graphs that the node's predecessors belong to.
/// It traverses backwards from all input nodes and checks if we reach any node that already
/// belongs to one of the sub-graph IDs. If so, we have a cycle.
///
/// Example:
/// A(1)
/// | \
/// B(1) C
/// | /
/// D
/// We want to add D to sub-graph 1. C is an input to D, and subgraph_ids is {1}.
/// Traversing backwards from C we reach A, which belongs to sub-graph 1; adding D would
/// therefore form a cycle, so it is rejected.
bool check_cycles(NodeVector& inputs, std::unordered_set<int>& subgraph_ids);
private:
static const std::set<std::type_index> m_supported_ops;
private:
using IDGraphMap = std::unordered_map<int, MLIRSubgraph>;
using NodeGraphMap = std::unordered_map<std::shared_ptr<Node>, int>;
IDGraphMap m_id_to_graph;
NodeGraphMap m_node_to_graph;
// Mutex over sub-graph IDs
std::mutex m_subgraph_mutex;
};
}
}
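A minimal sketch of the backward traversal that check_cycles describes, assuming the get_subgraph_id helper declared above and the usual Node::get_arguments accessor; this is a reading aid under those assumptions, not the committed implementation:

#include <deque>
#include <memory>
#include <unordered_set>

bool MLIRSubgraphExtractionPass::check_cycles(NodeVector& inputs,
                                              std::unordered_set<int>& subgraph_ids)
{
    std::deque<std::shared_ptr<Node>> worklist(inputs.begin(), inputs.end());
    std::unordered_set<std::shared_ptr<Node>> visited;
    while (!worklist.empty())
    {
        auto node = worklist.front();
        worklist.pop_front();
        if (!visited.insert(node).second)
        {
            continue; // already explored via another path
        }
        // Reaching a node owned by one of the predecessor sub-graphs means the
        // candidate node would both feed and be fed by that sub-graph: a cycle.
        if (subgraph_ids.count(get_subgraph_id(node)) != 0)
        {
            return true;
        }
        for (auto& arg : node->get_arguments())
        {
            worklist.push_back(arg);
        }
    }
    return false;
}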
@@ -463,7 +463,8 @@ bool ngraph::is_one(std::shared_ptr<Node> reduce_constant)
NodeVector ngraph::get_subgraph_outputs(const NodeVector& nodes,
const NodeVector& exclusions,
bool ignore_unused)
bool ignore_unused,
bool ignore_output_duplicates)
{
std::set<shared_ptr<Node>> exclusions_set(exclusions.begin(), exclusions.end());
std::set<shared_ptr<Node>> nodes_set(nodes.begin(), nodes.end());
@@ -479,7 +480,11 @@ NodeVector ngraph::get_subgraph_outputs(const NodeVector& nodes,
for (const auto& u : n->get_users())
{
if (nodes_set.count(u) == 0 && (!ignore_unused || is_used(u.get())))
bool add_output = nodes_set.count(u) == 0 && (!ignore_unused || is_used(u.get()));
// check if output is already captured
add_output &= (ignore_output_duplicates ||
std::find(outputs.begin(), outputs.end(), n) == outputs.end());
if (add_output)
{
outputs.push_back(n);
}
......
@@ -275,7 +275,8 @@ namespace ngraph
NodeVector get_subgraph_outputs(const NodeVector& nodes,
const NodeVector& exclusions,
bool ignore_unused = false);
bool ignore_unused = false,
bool ignore_output_duplicates = true);
// Extract sub-graph computing the `results`. Stops backward traversal at either a Parameter node
// or a node that belongs to args
......
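Net effect of the new flag: with the default ignore_output_duplicates = true the previous behavior is preserved and a node feeding several external users can appear in the result more than once; passing false deduplicates. A hypothetical caller that wants each output node exactly once (subgraph_nodes is an assumed variable):

NodeVector outputs = ngraph::get_subgraph_outputs(subgraph_nodes,
                                                  NodeVector{}, // no exclusions
                                                  false,        // ignore_unused
                                                  false);       // ignore_output_duplicates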
@@ -114,7 +114,6 @@ set(SRC
op/update_slice.cpp
pass/cpu_assignment.cpp
pass/cpu_collapse_dims.cpp
pass/cpu_compiled_kernel_fusion.cpp
pass/cpu_fusion.cpp
pass/cpu_horizontal_fusion.cpp
pass/cpu_layout.cpp
......
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <algorithm>
#include <iostream>
#include <map>
#include <memory>
#include "ngraph/graph_util.hpp"
#include "ngraph/log.hpp"
#include "ngraph/op/abs.hpp"
#include "ngraph/op/add.hpp"
#include "ngraph/op/experimental/compiled_kernel.hpp"
#include "ngraph/op/get_output_element.hpp"
#include "ngraph/op/maximum.hpp"
#include "ngraph/op/minimum.hpp"
#include "ngraph/op/negative.hpp"
#include "ngraph/op/relu.hpp"
#include "ngraph/op/subtract.hpp"
#include "ngraph/op/util/binary_elementwise_arithmetic.hpp"
#include "ngraph/op/util/unary_elementwise_arithmetic.hpp"
#include "ngraph/runtime/cpu/pass/cpu_compiled_kernel_fusion.hpp"
#define TI(x) std::type_index(typeid(x))
using namespace ngraph;
struct LKGraph
{
LKGraph(const NodeVector& ns, const NodeVector& ins)
: m_inputs(ins)
, m_nodes(ns)
{
}
NodeVector m_inputs;
NodeVector m_nodes;
};
class CompiledKernelCollector
{
public:
CompiledKernelCollector(std::shared_ptr<Function> f, size_t min_nodes_to_fuse)
{
for (auto n : f->get_ordered_ops())
{
if (is_fusible(n))
{
auto arg_from_fusible_group = collect_fusible_args(n);
// create a new group
if (!arg_from_fusible_group)
{
m_heads.insert(std::make_pair(n, n));
m_graphs.insert(std::make_pair(n, LKGraph{{n}, n->get_arguments()}));
NGRAPH_DEBUG << "Created a new group for " << n->get_name();
log_group(n);
}
else
{
auto smallest_head = m_heads.at(arg_from_fusible_group);
auto& ckgraph = m_graphs.at(smallest_head);
ckgraph.m_nodes.push_back(n);
for (auto arg : n->get_arguments())
{
if (is_leaf(arg))
{
ckgraph.m_inputs.push_back(arg);
}
}
m_heads.insert(std::make_pair(n, smallest_head));
log_group(smallest_head);
}
}
}
prune_graphs(min_nodes_to_fuse);
}
const std::vector<std::shared_ptr<op::CompiledKernel>> get_compiled_kernels() const
{
std::vector<std::shared_ptr<op::CompiledKernel>> cks;
for (auto e : m_graphs)
{
auto& ckg = e.second;
NodeVector member_outputs = ngraph::get_subgraph_outputs(ckg.m_nodes, NodeVector{});
auto ck =
std::make_shared<op::CompiledKernel>(ckg.m_nodes, member_outputs, ckg.m_inputs);
cks.push_back(ck);
}
return cks;
}
private:
static bool is_fusible(std::shared_ptr<Node> n)
{
static const std::set<std::type_index> fusible_ops_set{TI(ngraph::op::Abs),
TI(ngraph::op::Add),
TI(ngraph::op::Negative),
TI(ngraph::op::Subtract),
TI(ngraph::op::Relu),
TI(ngraph::op::Minimum),
TI(ngraph::op::Maximum)};
const Node& node = *n;
return fusible_ops_set.count(TI(node)) != 0;
// return (std::dynamic_pointer_cast<op::util::BinaryElementwiseArithmetic>(n) ||
// std::dynamic_pointer_cast<op::util::UnaryElementwiseArithmetic>(n));
}
bool is_leaf(std::shared_ptr<Node> src) { return src->is_parameter() || src->is_constant(); }
void prune_graphs(size_t min_nodes_to_fuse)
{
for (auto it = m_graphs.begin(); it != m_graphs.end();)
{
if (it->second.m_nodes.size() < min_nodes_to_fuse)
{
it = m_graphs.erase(it);
}
else
{
it++;
}
}
}
void log_group(std::shared_ptr<Node> head) const
{
NGRAPH_DEBUG << "Group leader : " << head->get_name() << std::endl;
NGRAPH_DEBUG << "Group members : " << m_graphs.at(head).m_nodes << std::endl;
NGRAPH_DEBUG << "Inputs: " << m_graphs.at(head).m_inputs << std::endl;
}
std::shared_ptr<Node> collect_fusible_args(std::shared_ptr<Node> n)
{
std::shared_ptr<Node> arg_from_fusible_group;
for (auto arg : n->get_arguments())
{
// an argument is fusible and a part of some group
NGRAPH_DEBUG << "Considering " << arg->get_name();
if (m_heads.count(arg) != 0)
{
if (!arg_from_fusible_group)
{
arg_from_fusible_group = arg;
}
else
{
if (!is_leaf(arg) && m_heads.at(arg) != m_heads.at(arg_from_fusible_group))
{
return {nullptr};
}
}
}
}
return arg_from_fusible_group;
}
std::unordered_map<std::shared_ptr<Node>, LKGraph> m_graphs;
std::unordered_map<std::shared_ptr<Node>, std::shared_ptr<Node>> m_heads;
};
bool ngraph::runtime::cpu::pass::CPUCompiledKernelFusion::run_on_function(
std::shared_ptr<ngraph::Function> function)
{
CompiledKernelCollector ckc(function, m_min_kernel_size);
auto compiled_kernels = ckc.get_compiled_kernels();
for (auto ck : compiled_kernels)
{
auto outputs = ck->get_kernel_outputs();
std::set<std::shared_ptr<Node>> ck_nodes_set(ck->get_node_list().begin(),
ck->get_node_list().end());
for (size_t i = 0; i < outputs.size(); i++)
{
auto ith_goe = std::make_shared<ngraph::op::GetOutputElement>(ck, i);
auto& ith_output = ith_goe->get_outputs().at(0);
if (outputs.at(i)->get_outputs().size() > 1)
{
throw ngraph_error(
"support for fusing multi-output nodes in loop kernels isn't yet implemented");
}
// TODO: revisit when we need support for multi-output nodes
auto& orig_output = outputs.at(i)->get_outputs().at(0);
// this is needed since replace_output modifies orig_output.get_inputs()
std::set<ngraph::descriptor::Input*> inputs_copy{begin(orig_output.get_inputs()),
end(orig_output.get_inputs())};
for (auto input : inputs_copy)
{
// this user is NOT internal to this loop kernel
// so it needs to be replaced with corresponding ck's GOE
if (ck_nodes_set.count(input->get_node()) == 0)
{
input->replace_output(ith_output);
}
}
}
}
return !compiled_kernels.empty();
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include "ngraph/pass/pass.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace pass
{
class CPUCompiledKernelFusion : public ngraph::pass::FunctionPass
{
public:
CPUCompiledKernelFusion(size_t min_kernel_size = 2)
: FunctionPass()
, m_min_kernel_size(min_kernel_size)
{
}
bool run_on_function(std::shared_ptr<ngraph::Function> function) override;
protected:
size_t m_min_kernel_size;
};
}
}
}
}
@@ -172,6 +172,10 @@ set(MULTI_TEST_SRC
dynamic.in.cpp
)
if (NGRAPH_MLIR_ENABLE)
list(APPEND MULTI_TEST_SRC backend_mlir.in.cpp)
endif()
if(NGRAPH_DISTRIBUTED_ENABLE)
list(APPEND MULTI_TEST_SRC distributed.in.cpp)
endif()
......
@@ -67,36 +67,6 @@ NGRAPH_TEST(${BACKEND_NAME}, add)
(test::NDArray<float, 2>({{6, 8}, {10, 12}})).get_vector()));
}
NGRAPH_TEST(${BACKEND_NAME}, dot_add)
{
Shape shape_in1{2, 3};
Shape shape_in2{3, 3};
Shape shape_out{2, 3};
auto A = make_shared<op::Parameter>(element::f32, shape_in1);
auto B = make_shared<op::Parameter>(element::f32, shape_in2);
auto dot = make_shared<op::Dot>(A, B);
auto C = make_shared<op::Parameter>(element::f32, shape_out);
auto add = make_shared<op::Add>(dot, C);
auto f = make_shared<Function>(add, ParameterVector{A, B, C});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
shared_ptr<runtime::Tensor> a = backend->create_tensor(element::f32, shape_in1);
shared_ptr<runtime::Tensor> b = backend->create_tensor(element::f32, shape_in2);
shared_ptr<runtime::Tensor> c = backend->create_tensor(element::f32, shape_out);
shared_ptr<runtime::Tensor> result = backend->create_tensor(element::f32, shape_out);
copy_data(a, vector<float>{1.f, 2.f, 3.f, 4.f, 5.f, 6.f});
copy_data(b, vector<float>{1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f});
copy_data(c, vector<float>{5.f, 4.f, 3.f, 2.f, 1.f, 0.f});
auto handle = backend->compile(f);
handle->call_with_validate({result}, {a, b, c});
EXPECT_TRUE(test::all_close_f(read_vector<float>(result),
vector<float>{35.f, 40.f, 45.f, 68.f, 82.f, 96.f}));
}
NGRAPH_TEST(${BACKEND_NAME}, add_overload)
{
Shape shape{2, 2};
......