Commit f4b487a4 authored by Nagy Mostafa, committed by Scott Cyphers

[MLIR] Add sub-graph extraction support (#3101)

* Initial sub-graph extraction

* Works without detaching input edges from sub-graph

* Added removing input edges to graph

* Works with whole func sub-graphs. Inputs edges to sub-graph are still there

* Works on 2 exclusive sub-graphs. Still not on merged sub-graphs

* Revert removing inputs to sub-graph. nGraph validation crashes

* Added 3 sub-graph test. Remove compiled_kernel fusion pass. Comments

* Revert some changes

* Added cycle detection. Moved unit-tests to backend_mlir.in.cpp. Still not fully functional

* Construct CK nodes after finding outputs to preserve the graph.

* Fix topological sort. UTs pass.

* Minor fixes

* PR fixes

* Enable mlir tests only when building with MLIR on
parent 4b009f09
......@@ -17,6 +17,7 @@
#include "mlir_subgraph_extraction.hpp"
#include "ngraph/assertion.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/node.hpp"
#include "ngraph/op/add.hpp"
#include "ngraph/op/argmax.hpp"
#include "ngraph/op/argmin.hpp"
......@@ -31,43 +32,209 @@ using namespace ngraph::pass;
#define TI(x) std::type_index(typeid(x))
int MLIRSubgraphExtractionPass::MLIRSubgraph::m_curr_graph_id = 0;
template <typename T>
void MLIRSubgraphExtractionPass::MLIRSubgraph::add_inputs(T& inputs)
{
// input lists are not exclusive; avoid duplication
for (auto node : inputs)
{
if (m_input_nodes.find(node) == m_input_nodes.end())
{
m_input_nodes.insert(node);
}
}
}
template <typename T>
void MLIRSubgraphExtractionPass::MLIRSubgraph::add_outputs(T& outputs)
{
m_output_nodes.insert(outputs.begin(), outputs.end());
}
void MLIRSubgraphExtractionPass::MLIRSubgraph::add_node(std::shared_ptr<Node> node)
{
NGRAPH_CHECK(m_nodes.find(node) == m_nodes.end(), "node added to graph before");
m_nodes.insert(node);
m_pass.m_node_to_graph[node] = get_id();
}
void MLIRSubgraphExtractionPass::MLIRSubgraph::merge(MLIRSubgraph& sg2)
{
NGRAPH_CHECK(&sg2 != this, "Cannot merge a sub-graph into itself");
// Associate nodes of second sub-graph to first one
auto sg_nodes = sg2.get_nodes();
auto& node_map = m_pass.m_node_to_graph;
for (auto node : sg_nodes)
{
NGRAPH_DEBUG << *node;
NGRAPH_CHECK(m_pass.get_subgraph_id(node) == sg2.get_id(),
"Node does not belong to sub-graph");
m_pass.m_node_to_graph[node] = get_id();
}
// nodes of sub-graphs are exclusive
m_nodes.insert(sg2.get_nodes().begin(), sg2.get_nodes().end());
// merge inputs
add_inputs(sg2.get_inputs());
// Remove sub-graph from map
m_pass.m_id_to_graph.erase(sg2.get_id());
}
// The sub-graph construction algorithm is as follows
// For each node, check its predecessors:
// - If all predecessors that are already in sub-graphs belong to the same sub-graph (graph ID), extend that sub-graph to include the current node.
//   Predecessors outside any sub-graph are marked as inputs to the sub-graph.
// - If the predecessors belong to different sub-graphs, merge all of those sub-graphs into one and add the current node to it.
//   Predecessors outside any sub-graph are marked as inputs to the sub-graph.
//
// If the node has any external inputs, such an input may itself transitively depend on one of the predecessor sub-graphs; adding the node would then create a cycle.
// If a cycle is found, always start a new sub-graph.
//
// For each sub-graph found, build a CompiledKernel (CK) node around it as follows:
// - all input edges to the sub-graph are cloned as inputs to the CK node as well.
// - all output edges from the sub-graph are removed and added as outputs of the CK node instead.
// - the CK node internally keeps lists of the sub-graph nodes and of the sub-graph output nodes.
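// A small worked example of the rules above (illustration only, not from the original source):
//
//      P1  P2     P3  P4
//       \  /       \  /
//       Add1       Add2      Both adds are supported and have no predecessors inside any
//         \         /        sub-graph, so each starts its own sub-graph (IDs 1 and 2);
//          \       /         P1..P4 are recorded as sub-graph inputs.
//            Add3            Its predecessors belong to sub-graphs {1, 2}; no cycle is
//                            found, so the two sub-graphs are merged and Add3 is added
//                            to the merged sub-graph.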
bool MLIRSubgraphExtractionPass::run_on_function(std::shared_ptr<Function> func)
{
// Build maximal sub-graphs of MLIR-supported ops and wrap each one in a CompiledKernel.
NGRAPH_DEBUG << "[CK Extract] Construct sub-graphs" << std::endl;
for (auto op : func->get_ordered_ops())
{
// All ops must be supported by MLIR compiler
NodeVector inputs;
int first_graph_id = -1;
std::unordered_set<int> subgraph_ids;
// unsupported ops, skip
if (!is_supported_mlir_op(op))
{
continue;
}
if (TI(Parameter) == TI(*op) || TI(Result) == TI(*op))
{
continue;
}
NGRAPH_DEBUG << "[CK Extract] Processing " << *op << std::endl;
// supported op
for (auto pred : op->get_arguments())
{
int pred_subgraph_id = get_subgraph_id(pred);
if (pred_subgraph_id == -1)
{
// predecessor doesn't belong to any sub-graph, it is an input
inputs.push_back(pred);
}
else
{
// record sub-graph id of the predecessor
subgraph_ids.insert(pred_subgraph_id);
}
}
if (subgraph_ids.size() == 0)
{
NGRAPH_DEBUG << "[CK Extract] Start new sub-graph " << std::endl;
// we couldn't find any predecessor sub-graphs to extend with this node
// create a new sub-graph
MLIRSubgraph sg = MLIRSubgraph::create(this);
sg.add_inputs(inputs);
sg.add_node(op);
add_subgraph(sg);
}
else
{
// we have sub-graphs.
// check if adding this node to the sub-graph will create a cycle in the DAG
NGRAPH_DEBUG << "[CK Extract] Extending sub-graph. Check for cycles " << std::endl;
if (!check_cycles(inputs, subgraph_ids))
{
NGRAPH_DEBUG << "[CK Extract] Merging subgraphs";
// merge sub-graphs if needed
std::unordered_set<int>::iterator it = subgraph_ids.begin();
int sg_id = *it;
MLIRSubgraph& first_subgraph = get_subgraph(sg_id);
NGRAPH_CHECK(first_subgraph.get_id() == sg_id);
while (++it != subgraph_ids.end())
{
sg_id = *it;
MLIRSubgraph& subgraph = get_subgraph(sg_id);
NGRAPH_CHECK(subgraph.get_id() == sg_id);
first_subgraph.merge(subgraph);
}
first_subgraph.add_node(op);
first_subgraph.add_inputs(inputs);
}
else
{
// we have a cycle, start a new sub-graph
MLIRSubgraph sg = MLIRSubgraph::create(this);
NGRAPH_DEBUG << "[CK Extract] Cycle found. Start a new subgraph";
// use all predecessors as graph inputs
NodeVector inputs = op->get_arguments();
sg.add_inputs(inputs);
sg.add_node(op);
add_subgraph(sg);
}
}
NGRAPH_DEBUG << "[CK Extract] Node Processed " << *op << std::endl;
}
NGRAPH_DEBUG << "[CK Extract] Get subgraphs output nodes" << std::endl;
// get output nodes for each sub-graph. Do this before attaching CK nodes since we will
// remove output edges from the sub-graphs.
for (IDGraphMap::iterator it = m_id_to_graph.begin(); it != m_id_to_graph.end(); it++)
{
MLIRSubgraph& sg = it->second;
auto& nodes = sg.get_nodes();
NodeVector outputs = std::move(get_subgraph_outputs(NodeVector(nodes.begin(), nodes.end()),
{} /*exclusions*/,
false /* ignore unused */,
false /* ignore output duplicates */));
sg.add_outputs(outputs);
}
NGRAPH_DEBUG << "[CK Extract] Construct CK nodes" << std::endl;
// attach CK node to each sub-graph.
for (auto it : m_id_to_graph)
{
MLIRSubgraph sg = it.second;
auto& inputs = sg.get_inputs();
auto& outputs = sg.get_outputs();
auto& nodes = sg.get_nodes();
NodeVector inputs_vector(inputs.begin(), inputs.end());
NodeVector outputs_vector(outputs.begin(), outputs.end());
// must store nodes in topological order
auto nodes_list = subgraph_topological_sort(nodes);
NodeVector nodes_vector(nodes_list.begin(), nodes_list.end());
auto ck = std::make_shared<CompiledKernel>(nodes_vector, outputs_vector, inputs_vector);
NGRAPH_DEBUG << "[CK Extract] Graph ID = " << sg.get_id() << std::endl;
NGRAPH_DEBUG << "[CK Extract] Graph Nodes: " << std::endl;
for (auto node : nodes)
{
NGRAPH_DEBUG << "[CK Extract] " << *node << std::endl;
}
NGRAPH_DEBUG << "[CK Extract] Input Nodes: " << std::endl;
for (auto node : inputs)
{
NGRAPH_DEBUG << "[CK Extract] " << *node << std::endl;
}
NGRAPH_DEBUG << "[CK Extract] Output Nodes: " << std::endl;
for (auto node : outputs)
{
NGRAPH_DEBUG << "[CK Extract] " << *node << std::endl;
}
// Connect CompiledKernel to output nodes by replacing the output descriptors of the output
// nodes.
for (size_t i = 0, end = outputs_vector.size(); i < end; ++i)
{
auto& output_descs = outputs_vector[i]->get_outputs();
NGRAPH_CHECK(output_descs.size() == 1, "Unexpected multiple output descriptors");
auto& out_desc = output_descs[0];
......@@ -79,6 +246,7 @@ bool MLIRSubgraphExtractionPass::run_on_function(std::shared_ptr<Function> func)
in_desc->replace_output(ck, i);
}
}
}
return true;
}
......@@ -127,6 +295,30 @@ bool MLIRSubgraphExtractionPass::is_supported_mlir_op(std::shared_ptr<Node> node
return true;
}
bool MLIRSubgraphExtractionPass::check_cycles(NodeVector& inputs,
std::unordered_set<int>& subgraph_ids)
{
NodeVector work_list;
NGRAPH_DEBUG << "[CK Extract] Inputs size: " << inputs.size() << std::endl;
work_list.insert(work_list.end(), inputs.begin(), inputs.end());
while (!work_list.empty())
{
auto node = work_list.back();
work_list.pop_back();
if (subgraph_ids.find(get_subgraph_id(node)) != subgraph_ids.end())
{
// we hit one of the sub-graphs we want to extend. we have a cycle.
NGRAPH_DEBUG << "[CK Extract] Cycle found when trying to add node" << std::endl;
return true;
}
for (auto pred : node->get_arguments())
{
work_list.push_back(pred);
}
}
return false;
}
const std::set<std::type_index> MLIRSubgraphExtractionPass::m_supported_ops{
#define MLIR_OP(OP) TI(ngraph::op::OP),
#include "contrib/mlir/ops_supported.inc"
......
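For context, m_supported_ops is populated with an X-macro over ops_supported.inc; a sketch of how the expansion works (the actual entries live in that file and are not shown in this diff):
// Hypothetical excerpt of ops_supported.inc:
//   MLIR_OP(Add)
//   MLIR_OP(Dot)
// With "#define MLIR_OP(OP) TI(ngraph::op::OP)," each entry expands to a std::type_index,
// so the set effectively becomes:
//   const std::set<std::type_index> MLIRSubgraphExtractionPass::m_supported_ops{
//       TI(ngraph::op::Add), TI(ngraph::op::Dot),
//   };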
......@@ -16,26 +16,114 @@
#pragma once
#include <mutex>
#include "ngraph/pass/pass.hpp"
namespace ngraph
{
namespace pass
{
/// This pass creates CompiledKernel ops enclosing maximal sub-graphs of ops that are
/// supported by MLIR; these sub-graphs are then compiled and executed by MLIR.
class MLIRSubgraphExtractionPass : public ngraph::pass::FunctionPass
{
using NodeSet = std::unordered_set<std::shared_ptr<Node>>;
class MLIRSubgraph
{
private:
static int get_new_graph_id() { return m_curr_graph_id++; }
/// Create a sub-graph with a new ID.
MLIRSubgraph(MLIRSubgraphExtractionPass* pass)
: m_graph_id(MLIRSubgraph::get_new_graph_id())
, m_pass(*pass)
{
}
public:
/// Factory method to create a new sub-graph with a unique ID
static MLIRSubgraph create(MLIRSubgraphExtractionPass* pass)
{
// mutex on global graph ID
std::lock_guard<std::mutex> lock(pass->m_subgraph_mutex);
return MLIRSubgraph(pass);
}
/// Get sub-graph id
int get_id() const { return m_graph_id; }
/// Get all nodes in the sub-graph.
NodeSet& get_nodes() { return m_nodes; }
/// Get input nodes. Predecessors to head nodes.
NodeSet& get_inputs() { return m_input_nodes; }
/// Get output nodes. Nodes in the sub-graph with edges to external nodes.
NodeSet& get_outputs() { return m_output_nodes; }
/// Add a list of input nodes to the sub-graph.
template <typename T>
void add_inputs(T& inputs);
/// Add a list of output nodes to the sub-graph.
template <typename T>
void add_outputs(T& outputs);
/// Merges sub-graph (other) into this sub-graph. other will be destroyed.
void merge(MLIRSubgraph& other);
/// Add one node to the sub-graph.
void add_node(std::shared_ptr<Node> node);
private:
// Unique ID for this sub-graph.
int m_graph_id;
// Actual nodes of the sub-graph
NodeSet m_nodes;
// Predecessor to head nodes in the sub-graph.
NodeSet m_input_nodes;
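// Nodes in the sub-graph with edges to external nodes.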
NodeSet m_output_nodes;
MLIRSubgraphExtractionPass& m_pass;
static int m_curr_graph_id;
};
friend class MLIRSubgraph;
public:
MLIRSubgraphExtractionPass() {}
bool run_on_function(std::shared_ptr<Function> func) override;
/// Checks if an ngraph node is supported by MLIR backend
bool is_supported_mlir_op(std::shared_ptr<Node> node);
/// Get the sub-graph ID that a node belongs to
int get_subgraph_id(std::shared_ptr<Node> node)
{
auto it = m_node_to_graph.find(node);
return (it == m_node_to_graph.end()) ? -1 : it->second;
}
/// Get sub-graph by ID
MLIRSubgraph& get_subgraph(int id)
{
auto it = m_id_to_graph.find(id);
NGRAPH_CHECK(it != m_id_to_graph.end(), "Cannot find subgraph with ID: ", id);
return it->second;
}
/// Stores a sub-graph in the map
void add_subgraph(MLIRSubgraph& sg) { m_id_to_graph.emplace(sg.get_id(), sg); }
/// Checks if adding a node to an extracted sub-graph will cause a DAG cycle
/// inputs: the node's predecessors that are outside any sub-graph.
/// subgraph_ids: the sub-graphs that the node's other predecessors belong to.
/// It traverses backwards from all input nodes and checks if we reach any node that already
/// belongs to one of the sub-graph ids. If so, we have a cycle.
///
/// Example:
/// A(1)
/// | \
/// B(1) C
/// | /
/// D
/// We want to add D to sub-graph 1. C is an input to D, and subgraph_ids is {1}.
/// Traversing backwards from C we reach A, which belongs to sub-graph 1, so adding D would form a cycle.
bool check_cycles(NodeVector& inputs, std::unordered_set<int>& subgraph_ids);
private:
static const std::set<std::type_index> m_supported_ops;
private:
using IDGraphMap = std::unordered_map<int, MLIRSubgraph>;
using NodeGraphMap = std::unordered_map<std::shared_ptr<Node>, int>;
IDGraphMap m_id_to_graph;
NodeGraphMap m_node_to_graph;
// Mutex over sub-graph IDs
std::mutex m_subgraph_mutex;
};
}
}
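A rough usage sketch (not part of this change; the include path, function name, and call site are assumptions): the pass would typically be scheduled through nGraph's pass manager before the function is handed to the MLIR compiler.

#include <memory>

#include "mlir_subgraph_extraction.hpp" // assumed relative include path
#include "ngraph/pass/manager.hpp"

// Minimal sketch: wrap MLIR-supported regions of `func` in CompiledKernel nodes.
static void extract_mlir_subgraphs(std::shared_ptr<ngraph::Function> func)
{
    ngraph::pass::Manager pass_manager;
    pass_manager.register_pass<ngraph::pass::MLIRSubgraphExtractionPass>();
    pass_manager.run_passes(func);
}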
......@@ -463,7 +463,8 @@ bool ngraph::is_one(std::shared_ptr<Node> reduce_constant)
NodeVector ngraph::get_subgraph_outputs(const NodeVector& nodes,
const NodeVector& exclusions,
bool ignore_unused,
bool ignore_output_duplicates)
{
std::set<shared_ptr<Node>> exclusions_set(exclusions.begin(), exclusions.end());
std::set<shared_ptr<Node>> nodes_set(nodes.begin(), nodes.end());
......@@ -479,7 +480,11 @@ NodeVector ngraph::get_subgraph_outputs(const NodeVector& nodes,
for (const auto& u : n->get_users())
{
bool add_output = nodes_set.count(u) == 0 && (!ignore_unused || is_used(u.get()));
// check if output is already captured
add_output &= (ignore_output_duplicates ||
std::find(outputs.begin(), outputs.end(), n) == outputs.end());
if (add_output)
{
outputs.push_back(n);
}
......
......@@ -275,7 +275,8 @@ namespace ngraph
NodeVector get_subgraph_outputs(const NodeVector& nodes,
const NodeVector& exclusions,
bool ignore_unused = false,
bool ignore_output_duplicates = true);
// Extract sub-graph computing the `results`. Stops backward traversal at either a Parameter node
// or a node that belongs to args
......
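A note on the new flag (a sketch under an assumed variable name, not part of this diff): with the default ignore_output_duplicates = true, a node with several users outside the sub-graph may appear more than once in the returned vector; passing false records each output node at most once, which is how the extraction pass calls it.

// `sg_nodes` is a hypothetical NodeVector holding one sub-graph's nodes.
NodeVector outputs = ngraph::get_subgraph_outputs(sg_nodes,
                                                  NodeVector{} /* exclusions */,
                                                  false /* ignore_unused */,
                                                  false /* ignore_output_duplicates */);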
......@@ -114,7 +114,6 @@ set(SRC
op/update_slice.cpp
pass/cpu_assignment.cpp
pass/cpu_collapse_dims.cpp
pass/cpu_compiled_kernel_fusion.cpp
pass/cpu_fusion.cpp
pass/cpu_horizontal_fusion.cpp
pass/cpu_layout.cpp
......
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <algorithm>
#include <iostream>
#include <map>
#include <memory>
#include "ngraph/graph_util.hpp"
#include "ngraph/log.hpp"
#include "ngraph/op/abs.hpp"
#include "ngraph/op/add.hpp"
#include "ngraph/op/experimental/compiled_kernel.hpp"
#include "ngraph/op/get_output_element.hpp"
#include "ngraph/op/maximum.hpp"
#include "ngraph/op/minimum.hpp"
#include "ngraph/op/negative.hpp"
#include "ngraph/op/relu.hpp"
#include "ngraph/op/subtract.hpp"
#include "ngraph/op/util/binary_elementwise_arithmetic.hpp"
#include "ngraph/op/util/unary_elementwise_arithmetic.hpp"
#include "ngraph/runtime/cpu/pass/cpu_compiled_kernel_fusion.hpp"
#define TI(x) std::type_index(typeid(x))
using namespace ngraph;
struct LKGraph
{
LKGraph(const NodeVector& ns, const NodeVector& ins)
: m_inputs(ins)
, m_nodes(ns)
{
}
NodeVector m_inputs;
NodeVector m_nodes;
};
class CompiledKernelCollector
{
public:
CompiledKernelCollector(std::shared_ptr<Function> f, size_t min_nodes_to_fuse)
{
for (auto n : f->get_ordered_ops())
{
if (is_fusible(n))
{
auto arg_from_fusible_group = collect_fusible_args(n);
// create a new group
if (!arg_from_fusible_group)
{
m_heads.insert(std::make_pair(n, n));
m_graphs.insert(std::make_pair(n, LKGraph{{n}, n->get_arguments()}));
NGRAPH_DEBUG << "Created a new group for " << n->get_name();
log_group(n);
}
else
{
auto smallest_head = m_heads.at(arg_from_fusible_group);
auto& ckgraph = m_graphs.at(smallest_head);
ckgraph.m_nodes.push_back(n);
for (auto arg : n->get_arguments())
{
if (is_leaf(arg))
{
ckgraph.m_inputs.push_back(arg);
}
}
m_heads.insert(std::make_pair(n, smallest_head));
log_group(smallest_head);
}
}
}
prune_graphs(min_nodes_to_fuse);
}
const std::vector<std::shared_ptr<op::CompiledKernel>> get_compiled_kernels() const
{
std::vector<std::shared_ptr<op::CompiledKernel>> cks;
for (auto e : m_graphs)
{
auto& ckg = e.second;
NodeVector member_outputs = ngraph::get_subgraph_outputs(ckg.m_nodes, NodeVector{});
auto ck =
std::make_shared<op::CompiledKernel>(ckg.m_nodes, member_outputs, ckg.m_inputs);
cks.push_back(ck);
}
return cks;
}
private:
static bool is_fusible(std::shared_ptr<Node> n)
{
static const std::set<std::type_index> fusible_ops_set{TI(ngraph::op::Abs),
TI(ngraph::op::Add),
TI(ngraph::op::Negative),
TI(ngraph::op::Subtract),
TI(ngraph::op::Relu),
TI(ngraph::op::Minimum),
TI(ngraph::op::Maximum)};
const Node& node = *n;
return fusible_ops_set.count(TI(node)) != 0;
// return (std::dynamic_pointer_cast<op::util::BinaryElementwiseArithmetic>(n) ||
// std::dynamic_pointer_cast<op::util::UnaryElementwiseArithmetic>(n));
}
bool is_leaf(std::shared_ptr<Node> src) { return src->is_parameter() || src->is_constant(); }
void prune_graphs(size_t min_nodes_to_fuse)
{
for (auto it = m_graphs.begin(); it != m_graphs.end();)
{
if (it->second.m_nodes.size() < min_nodes_to_fuse)
{
it = m_graphs.erase(it);
}
else
{
it++;
}
}
}
void log_group(std::shared_ptr<Node> head) const
{
NGRAPH_DEBUG << "Group leader : " << head->get_name() << std::endl;
NGRAPH_DEBUG << "Group members : " << m_graphs.at(head).m_nodes << std::endl;
NGRAPH_DEBUG << "Inputs: " << m_graphs.at(head).m_inputs << std::endl;
}
std::shared_ptr<Node> collect_fusible_args(std::shared_ptr<Node> n)
{
std::shared_ptr<Node> arg_from_fusible_group;
for (auto arg : n->get_arguments())
{
// an argument is fusible and a part of some group
NGRAPH_DEBUG << "Considering " << arg->get_name();
if (m_heads.count(arg) != 0)
{
if (!arg_from_fusible_group)
{
arg_from_fusible_group = arg;
}
else
{
if (!is_leaf(arg) && m_heads.at(arg) != m_heads.at(arg_from_fusible_group))
{
return {nullptr};
}
}
}
}
return arg_from_fusible_group;
}
std::unordered_map<std::shared_ptr<Node>, LKGraph> m_graphs;
std::unordered_map<std::shared_ptr<Node>, std::shared_ptr<Node>> m_heads;
};
bool ngraph::runtime::cpu::pass::CPUCompiledKernelFusion::run_on_function(
std::shared_ptr<ngraph::Function> function)
{
CompiledKernelCollector ckc(function, m_min_kernel_size);
auto compiled_kernels = ckc.get_compiled_kernels();
for (auto ck : compiled_kernels)
{
auto outputs = ck->get_kernel_outputs();
std::set<std::shared_ptr<Node>> ck_nodes_set(ck->get_node_list().begin(),
ck->get_node_list().end());
for (size_t i = 0; i < outputs.size(); i++)
{
auto ith_goe = std::make_shared<ngraph::op::GetOutputElement>(ck, i);
auto& ith_output = ith_goe->get_outputs().at(0);
if (outputs.at(i)->get_outputs().size() > 1)
{
throw ngraph_error(
"support for fusing multi-output nodes in loop kernels isn't yet implemented");
}
// TODO: revisit when we need support for multi-output nodes
auto& orig_output = outputs.at(i)->get_outputs().at(0);
// this is needed since replace_output modifies orig_output.get_inputs()
std::set<ngraph::descriptor::Input*> inputs_copy{begin(orig_output.get_inputs()),
end(orig_output.get_inputs())};
for (auto input : inputs_copy)
{
// this user is NOT internal to this loop kernel
// so it needs to be replaced with corresponding ck's GOE
if (ck_nodes_set.count(input->get_node()) == 0)
{
input->replace_output(ith_output);
}
}
}
}
return !compiled_kernels.empty();
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include "ngraph/pass/pass.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace pass
{
class CPUCompiledKernelFusion : public ngraph::pass::FunctionPass
{
public:
CPUCompiledKernelFusion(size_t min_kernel_size = 2)
: FunctionPass()
, m_min_kernel_size(min_kernel_size)
{
}
bool run_on_function(std::shared_ptr<ngraph::Function> function) override;
protected:
size_t m_min_kernel_size;
};
}
}
}
}
......@@ -172,6 +172,10 @@ set(MULTI_TEST_SRC
dynamic.in.cpp
)
if (NGRAPH_MLIR_ENABLE)
list(APPEND MULTI_TEST_SRC backend_mlir.in.cpp)
endif()
if(NGRAPH_DISTRIBUTED_ENABLE)
list(APPEND MULTI_TEST_SRC distributed.in.cpp)
endif()
......
......@@ -67,36 +67,6 @@ NGRAPH_TEST(${BACKEND_NAME}, add)
(test::NDArray<float, 2>({{6, 8}, {10, 12}})).get_vector()));
}
NGRAPH_TEST(${BACKEND_NAME}, dot_add)
{
Shape shape_in1{2, 3};
Shape shape_in2{3, 3};
Shape shape_out{2, 3};
auto A = make_shared<op::Parameter>(element::f32, shape_in1);
auto B = make_shared<op::Parameter>(element::f32, shape_in2);
auto dot = make_shared<op::Dot>(A, B);
auto C = make_shared<op::Parameter>(element::f32, shape_out);
auto add = make_shared<op::Add>(dot, C);
auto f = make_shared<Function>(add, ParameterVector{A, B, C});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
shared_ptr<runtime::Tensor> a = backend->create_tensor(element::f32, shape_in1);
shared_ptr<runtime::Tensor> b = backend->create_tensor(element::f32, shape_in2);
shared_ptr<runtime::Tensor> c = backend->create_tensor(element::f32, shape_out);
shared_ptr<runtime::Tensor> result = backend->create_tensor(element::f32, shape_out);
copy_data(a, vector<float>{1.f, 2.f, 3.f, 4.f, 5.f, 6.f});
copy_data(b, vector<float>{1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f});
copy_data(c, vector<float>{5.f, 4.f, 3.f, 2.f, 1.f, 0.f});
auto handle = backend->compile(f);
handle->call_with_validate({result}, {a, b, c});
EXPECT_TRUE(test::all_close_f(read_vector<float>(result),
vector<float>{35.f, 40.f, 45.f, 68.f, 82.f, 96.f}));
}
NGRAPH_TEST(${BACKEND_NAME}, add_overload)
{
Shape shape{2, 2};
......
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
// End to end test for MLIR. Add tests here that are specific to test MLIR functionality
// MLIR is implicitly tested during other unit-tests as well.
#include "gtest/gtest.h"
#include "ngraph/ngraph.hpp"
#include "util/all_close.hpp"
#include "util/all_close_f.hpp"
#include "util/ndarray.hpp"
#include "util/test_control.hpp"
#include "util/test_tools.hpp"
using namespace std;
using namespace ngraph;
static string s_manifest = "${MANIFEST}";
// Combined ops test
NGRAPH_TEST(${BACKEND_NAME}, mlir_dot_add)
{
Shape shape_in1{2, 3};
Shape shape_in2{3, 3};
Shape shape_out{2, 3};
auto A = make_shared<op::Parameter>(element::f32, shape_in1);
auto B = make_shared<op::Parameter>(element::f32, shape_in2);
auto dot = make_shared<op::Dot>(A, B);
auto C = make_shared<op::Parameter>(element::f32, shape_in1);
auto add = make_shared<op::Add>(dot, C);
auto f = make_shared<Function>(add, ParameterVector{A, B, C});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
shared_ptr<runtime::Tensor> a = backend->create_tensor(element::f32, shape_in1);
shared_ptr<runtime::Tensor> b = backend->create_tensor(element::f32, shape_in2);
shared_ptr<runtime::Tensor> c = backend->create_tensor(element::f32, shape_in1);
shared_ptr<runtime::Tensor> result = backend->create_tensor(element::f32, shape_out);
copy_data(a, vector<float>{1.f, 2.f, 3.f, 4.f, 5.f, 6.f});
copy_data(b, vector<float>{1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f});
copy_data(c, vector<float>{5.f, 4.f, 3.f, 2.f, 1.f, 0.f});
auto handle = backend->compile(f);
handle->call_with_validate({result}, {a, b, c});
EXPECT_TRUE(test::all_close_f(read_vector<float>(result),
vector<float>{35.f, 40.f, 45.f, 68.f, 82.f, 96.f}));
}
// Sub-graph extraction tests
NGRAPH_TEST(${BACKEND_NAME}, mlir_subgraphs_dot_add)
{
Shape shape_in1{2, 3};
Shape shape_in2{3, 3};
Shape shape_out{2, 3};
// sub-graph 1
auto P1 = make_shared<op::Parameter>(element::f32, shape_in1);
auto P2 = make_shared<op::Parameter>(element::f32, shape_in2);
auto P3 = make_shared<op::Parameter>(element::f32, shape_in1);
auto dot = make_shared<op::Dot>(P1, P2);
auto sg1_output = make_shared<op::Add>(dot, P3);
// sub-graph 2
auto P4 = make_shared<op::Parameter>(element::f32, shape_in1);
auto P5 = make_shared<op::Parameter>(element::f32, shape_in1);
auto P6 = make_shared<op::Parameter>(element::f32, shape_in1);
auto add = make_shared<op::Add>(P4, P5);
auto sg2_output = make_shared<op::Add>(add, P6);
auto out = make_shared<op::Maximum>(sg1_output, sg2_output);
auto f = make_shared<Function>(out, ParameterVector{P1, P2, P3, P4, P5, P6});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
shared_ptr<runtime::Tensor> p1 = backend->create_tensor(element::f32, shape_in1);
shared_ptr<runtime::Tensor> p2 = backend->create_tensor(element::f32, shape_in2);
shared_ptr<runtime::Tensor> p3 = backend->create_tensor(element::f32, shape_in1);
shared_ptr<runtime::Tensor> p4 = backend->create_tensor(element::f32, shape_in1);
shared_ptr<runtime::Tensor> p5 = backend->create_tensor(element::f32, shape_in1);
shared_ptr<runtime::Tensor> p6 = backend->create_tensor(element::f32, shape_in1);
shared_ptr<runtime::Tensor> result = backend->create_tensor(element::f32, shape_out);
copy_data(p1, vector<float>{1.f, 2.f, 3.f, 4.f, 5.f, 6.f});
copy_data(p2, vector<float>{1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f});
copy_data(p3, vector<float>{5.f, 4.f, 3.f, 2.f, 1.f, 0.f});
copy_data(p4, vector<float>{1.f, 2.f, 3.f, 4.f, 5.f, 6.f});
copy_data(p5, vector<float>{1.f, 2.f, 3.f, 4.f, 5.f, 6.f});
copy_data(p6, vector<float>{1.f, 2.f, 3.f, 4.f, 5.f, 6.f});
auto handle = backend->compile(f);
handle->call_with_validate({result}, {p1, p2, p3, p4, p5, p6});
EXPECT_TRUE(
test::all_close_f(read_vector<float>(result), vector<float>{35, 40, 45, 68, 82, 96}));
}
NGRAPH_TEST(${BACKEND_NAME}, mlir_subgraphs_dot_add_2)
{
// Tests 2 sub-graphs merged at a join point into one.
Shape shape_in1{2, 3};
Shape shape_in2{3, 3};
Shape shape_out{2, 3};
// sub-graph 1
auto P1 = make_shared<op::Parameter>(element::f32, shape_in1);
auto P2 = make_shared<op::Parameter>(element::f32, shape_in2);
auto P3 = make_shared<op::Parameter>(element::f32, shape_in1);
auto dot = make_shared<op::Dot>(P1, P2);
auto sg1_output = make_shared<op::Add>(dot, P3);
// sub-graph 2
auto P4 = make_shared<op::Parameter>(element::f32, shape_in1);
auto P5 = make_shared<op::Parameter>(element::f32, shape_in1);
auto P6 = make_shared<op::Parameter>(element::f32, shape_in1);
auto add = make_shared<op::Add>(P4, P5);
auto sg2_output = make_shared<op::Add>(add, P6);
auto add2 = make_shared<op::Add>(sg1_output, sg2_output);
auto abs = make_shared<op::Abs>(add2);
auto f = make_shared<Function>(abs, ParameterVector{P1, P2, P3, P4, P5, P6});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
shared_ptr<runtime::Tensor> p1 = backend->create_tensor(element::f32, shape_in1);
shared_ptr<runtime::Tensor> p2 = backend->create_tensor(element::f32, shape_in2);
shared_ptr<runtime::Tensor> p3 = backend->create_tensor(element::f32, shape_in1);
shared_ptr<runtime::Tensor> p4 = backend->create_tensor(element::f32, shape_in1);
shared_ptr<runtime::Tensor> p5 = backend->create_tensor(element::f32, shape_in1);
shared_ptr<runtime::Tensor> p6 = backend->create_tensor(element::f32, shape_in1);
shared_ptr<runtime::Tensor> result = backend->create_tensor(element::f32, shape_out);
copy_data(p1, vector<float>{1.f, 2.f, 3.f, 4.f, 5.f, 6.f});
copy_data(p2, vector<float>{1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f});
copy_data(p3, vector<float>{5.f, 4.f, 3.f, 2.f, 1.f, 0.f});
copy_data(p4, vector<float>{1.f, 2.f, 3.f, 4.f, 5.f, 6.f});
copy_data(p5, vector<float>{1.f, 2.f, 3.f, 4.f, 5.f, 6.f});
copy_data(p6, vector<float>{1.f, 2.f, 3.f, 4.f, 5.f, 6.f});
auto handle = backend->compile(f);
handle->call_with_validate({result}, {p1, p2, p3, p4, p5, p6});
EXPECT_TRUE(
test::all_close_f(read_vector<float>(result), vector<float>{38, 46, 54, 80, 97, 114}));
}
NGRAPH_TEST(${BACKEND_NAME}, mlir_subgraphs_dot_add_3)
{
// Tests 3 distinct sub-graphs
Shape shape_in1{2, 3};
Shape shape_in2{3, 3};
Shape shape_out{2, 3};
// sub-graph 1
auto P1 = make_shared<op::Parameter>(element::f32, shape_in1);
auto P2 = make_shared<op::Parameter>(element::f32, shape_in2);
auto P3 = make_shared<op::Parameter>(element::f32, shape_in1);
auto dot = make_shared<op::Dot>(P1, P2);
auto sg1_output = make_shared<op::Add>(dot, P3);
// sub-graph 2
auto P4 = make_shared<op::Parameter>(element::f32, shape_in1);
auto P5 = make_shared<op::Parameter>(element::f32, shape_in1);
auto P6 = make_shared<op::Parameter>(element::f32, shape_in1);
auto add = make_shared<op::Add>(P4, P5);
auto sg2_output = make_shared<op::Add>(add, P6);
auto max = make_shared<op::Maximum>(sg1_output, sg2_output);
auto add2 = make_shared<op::Add>(max, max);
auto f = make_shared<Function>(add2, ParameterVector{P1, P2, P3, P4, P5, P6});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
shared_ptr<runtime::Tensor> p1 = backend->create_tensor(element::f32, shape_in1);
shared_ptr<runtime::Tensor> p2 = backend->create_tensor(element::f32, shape_in2);
shared_ptr<runtime::Tensor> p3 = backend->create_tensor(element::f32, shape_in1);
shared_ptr<runtime::Tensor> p4 = backend->create_tensor(element::f32, shape_in1);
shared_ptr<runtime::Tensor> p5 = backend->create_tensor(element::f32, shape_in1);
shared_ptr<runtime::Tensor> p6 = backend->create_tensor(element::f32, shape_in1);
shared_ptr<runtime::Tensor> result = backend->create_tensor(element::f32, shape_out);
copy_data(p1, vector<float>{1.f, 2.f, 3.f, 4.f, 5.f, 6.f});
copy_data(p2, vector<float>{1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f});
copy_data(p3, vector<float>{5.f, 4.f, 3.f, 2.f, 1.f, 0.f});
copy_data(p4, vector<float>{1.f, 2.f, 3.f, 4.f, 5.f, 6.f});
copy_data(p5, vector<float>{1.f, 2.f, 3.f, 4.f, 5.f, 6.f});
copy_data(p6, vector<float>{1.f, 2.f, 3.f, 4.f, 5.f, 6.f});
auto handle = backend->compile(f);
handle->call_with_validate({result}, {p1, p2, p3, p4, p5, p6});
EXPECT_TRUE(
test::all_close_f(read_vector<float>(result), vector<float>{70, 80, 90, 136, 164, 192}));
}
NGRAPH_TEST(${BACKEND_NAME}, mlir_subgraphs_cycle)
{
// Tests cycle detection: extending the sub-graph with add2 would create a cycle, so a new sub-graph is started.
Shape shape_in1{2, 3};
Shape shape_in2{3, 3};
Shape shape_out{2, 3};
// sub-graph 1
auto P1 = make_shared<op::Parameter>(element::f32, shape_in1);
auto P2 = make_shared<op::Parameter>(element::f32, shape_in2);
auto P3 = make_shared<op::Parameter>(element::f32, shape_in1);
auto dot = make_shared<op::Dot>(P1, P2);
auto add = make_shared<op::Add>(dot, P3);
auto abs = make_shared<op::Abs>(add);
auto add2 = make_shared<op::Add>(add, abs);
auto f = make_shared<Function>(add2, ParameterVector{P1, P2, P3});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
shared_ptr<runtime::Tensor> p1 = backend->create_tensor(element::f32, shape_in1);
shared_ptr<runtime::Tensor> p2 = backend->create_tensor(element::f32, shape_in2);
shared_ptr<runtime::Tensor> p3 = backend->create_tensor(element::f32, shape_in1);
shared_ptr<runtime::Tensor> result = backend->create_tensor(element::f32, shape_out);
copy_data(p1, vector<float>{1.f, 2.f, 3.f, 4.f, 5.f, 6.f});
copy_data(p2, vector<float>{1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f});
copy_data(p3, vector<float>{5.f, 4.f, 3.f, 2.f, 1.f, 0.f});
auto handle = backend->compile(f);
handle->call_with_validate({result}, {p1, p2, p3});
EXPECT_TRUE(
test::all_close_f(read_vector<float>(result), vector<float>{70, 80, 90, 136, 164, 192}));
}