Commit f4b487a4 authored by Nagy Mostafa, committed by Scott Cyphers

[MLIR] Add sub-graph extraction support (#3101)

* Initial sub-graph extraction

* Works without detaching input edges from sub-graph

* Added removing input edges to graph

* Works with whole-function sub-graphs. Input edges to the sub-graph are still there

* Works on 2 exclusive sub-graphs. Still not on merged sub-graphs

* Revert removing inputs to sub-graph. nGraph validation crashes

* Added a 3-sub-graph test. Removed compiled_kernel fusion pass. Comments

* Revert some changes

* Added cycle detection. Moved unit tests to backend_mlir.in.cpp. Still not fully functional

* Construct CK nodes after finding outputs to preserve the graph.

* Fix topological sort. UTs pass.

* Minor fixes

* PR fixes

* Enable MLIR tests only when building with MLIR on
parent 4b009f09
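As context for the diff below: MLIRSubgraphExtractionPass is a FunctionPass, so it is typically scheduled through the nGraph pass manager. A minimal sketch, assuming the standard ngraph::pass::Manager API (the include path for the new header is an assumption for illustration):

#include <memory>
#include "ngraph/function.hpp"
#include "ngraph/pass/manager.hpp"
#include "contrib/mlir/pass/mlir_subgraph_extraction.hpp" // assumed path

void extract_mlir_subgraphs(std::shared_ptr<ngraph::Function> func)
{
    ngraph::pass::Manager pass_manager;
    // Wraps each maximal MLIR-supported sub-graph of func in a CompiledKernel op.
    pass_manager.register_pass<ngraph::pass::MLIRSubgraphExtractionPass>();
    pass_manager.run_passes(func);
}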
@@ -16,26 +16,114 @@
#pragma once
#include <mutex>
#include "ngraph/pass/pass.hpp"
namespace ngraph
{
namespace pass
{
/// This pass creates CompiledKernel ops enclosing sub-graphs that will be compiled and
/// executed by MLIR.
// TODO: WIP. Currently we only create a single CompiledKernel op for the whole function
// body.
/// This pass creates CompiledKernel ops enclosing maximal sub-graphs of ops that are supported by MLIR
class MLIRSubgraphExtractionPass : public ngraph::pass::FunctionPass
{
using NodeSet = std::unordered_set<std::shared_ptr<Node>>;
class MLIRSubgraph
{
private:
static int get_new_graph_id() { return m_curr_graph_id++; }
/// Create a sub-graph with a new ID.
MLIRSubgraph(MLIRSubgraphExtractionPass* pass)
: m_graph_id(MLIRSubgraph::get_new_graph_id())
, m_pass(*pass)
{
}
public:
/// Factory method to create a new sub-graph with a unique ID
static MLIRSubgraph create(MLIRSubgraphExtractionPass* pass)
{
// mutex on global graph ID
std::lock_guard<std::mutex> lock(pass->m_subgraph_mutex);
return MLIRSubgraph(pass);
}
/// Get sub-graph id
int get_id() const { return m_graph_id; }
/// Get all nodes in the sub-graph.
NodeSet& get_nodes() { return m_nodes; }
/// Get input nodes. Predecessors to head nodes.
NodeSet& get_inputs() { return m_input_nodes; }
/// Get output nodes. Nodes in the sub-graph with edges to external nodes.
NodeSet& get_outputs() { return m_output_nodes; }
/// Add a list of input nodes to the sub-graph.
template <typename T>
void add_inputs(T& inputs);
/// Add a list of output nodes to the sub-graph.
template <typename T>
void add_outputs(T& outputs);
/// Merges sub-graph (other) into this sub-graph. other will be destroyed.
void merge(MLIRSubgraph& other);
/// Add one node to the sub-graph.
void add_node(std::shared_ptr<Node> node);
private:
// Unique ID for this sub-graph.
int m_graph_id;
// Actual nodes of the sub-graph
NodeSet m_nodes;
// Predecessors of the head nodes in the sub-graph.
NodeSet m_input_nodes;
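// Nodes in the sub-graph with edges to external nodes.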
NodeSet m_output_nodes;
MLIRSubgraphExtractionPass& m_pass;
static int m_curr_graph_id;
};
friend class MLIRSubgraph;
public:
MLIRSubgraphExtractionPass() {}
bool run_on_function(std::shared_ptr<Function> func) override;
/// Checks if an ngraph node is supported by the MLIR backend
bool is_supported_mlir_op(std::shared_ptr<Node> node);
/// Get the sub-graph ID that a node belongs to
int get_subgraph_id(std::shared_ptr<Node> node)
{
auto it = m_node_to_graph.find(node);
return (it == m_node_to_graph.end()) ? -1 : it->second;
}
/// Get sub-graph by ID
MLIRSubgraph& get_subgraph(int id)
{
auto it = m_id_to_graph.find(id);
NGRAPH_CHECK(it != m_id_to_graph.end(), "Cannot find subgraph with ID: ", id);
return it->second;
}
/// Stores a sub-graph in the map
void add_subgraph(MLIRSubgraph& sg) { m_id_to_graph.emplace(sg.get_id(), sg); }
/// Checks if adding a node to an extracted sub-graph will cause a DAG cycle.
/// inputs: the inputs of the node we want to add that lie outside any sub-graph.
/// subgraph_ids: the sub-graphs that the node's predecessors belong to.
/// It traverses backwards from all input nodes and checks if we reach any node that already
/// belongs to one of the sub-graph IDs. If so, we have a cycle.
///
/// Example:
/// A(1)
/// | \
/// B(1) C
/// | /
/// D
/// We want to add D to sub-graph 1. C is an input to D, and subgraph_ids is {1}.
/// Traversing backwards from C we reach A, which belongs to sub-graph 1; adding D would
/// therefore form a cycle, so it is rejected.
bool check_cycles(NodeVector& inputs, std::unordered_set<int>& subgraph_ids);
private:
static const std::set<std::type_index> m_supported_ops;
private:
using IDGraphMap = std::unordered_map<int, MLIRSubgraph>;
using NodeGraphMap = std::unordered_map<std::shared_ptr<Node>, int>;
IDGraphMap m_id_to_graph;
NodeGraphMap m_node_to_graph;
// Mutex over sub-graph IDs
std::mutex m_subgraph_mutex;
};
}
}
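A minimal sketch of the backward traversal that check_cycles describes, assuming the get_subgraph_id helper declared above and the usual Node::get_arguments accessor; this is a reading aid under those assumptions, not the committed implementation:

#include <deque>
#include <memory>
#include <unordered_set>

bool MLIRSubgraphExtractionPass::check_cycles(NodeVector& inputs,
                                              std::unordered_set<int>& subgraph_ids)
{
    std::deque<std::shared_ptr<Node>> worklist(inputs.begin(), inputs.end());
    std::unordered_set<std::shared_ptr<Node>> visited;
    while (!worklist.empty())
    {
        auto node = worklist.front();
        worklist.pop_front();
        if (!visited.insert(node).second)
        {
            continue; // already explored via another path
        }
        // Reaching a node owned by one of the predecessor sub-graphs means the
        // candidate node would both feed and be fed by that sub-graph: a cycle.
        if (subgraph_ids.count(get_subgraph_id(node)) != 0)
        {
            return true;
        }
        for (auto& arg : node->get_arguments())
        {
            worklist.push_back(arg);
        }
    }
    return false;
}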
@@ -463,7 +463,8 @@ bool ngraph::is_one(std::shared_ptr<Node> reduce_constant)
NodeVector ngraph::get_subgraph_outputs(const NodeVector& nodes,
const NodeVector& exclusions,
bool ignore_unused)
bool ignore_unused,
bool ignore_output_duplicates)
{
std::set<shared_ptr<Node>> exclusions_set(exclusions.begin(), exclusions.end());
std::set<shared_ptr<Node>> nodes_set(nodes.begin(), nodes.end());
@@ -479,7 +480,11 @@ NodeVector ngraph::get_subgraph_outputs(const NodeVector& nodes,
for (const auto& u : n->get_users())
{
if (nodes_set.count(u) == 0 && (!ignore_unused || is_used(u.get())))
bool add_output = nodes_set.count(u) == 0 && (!ignore_unused || is_used(u.get()));
// check if output is already captured
add_output &= (ignore_output_duplicates ||
std::find(outputs.begin(), outputs.end(), n) == outputs.end());
if (add_output)
{
outputs.push_back(n);
}
......
@@ -275,7 +275,8 @@ namespace ngraph
NodeVector get_subgraph_outputs(const NodeVector& nodes,
const NodeVector& exclusions,
bool ignore_unused = false);
bool ignore_unused = false,
bool ignore_output_duplicates = true);
// Extract sub-graph computing the `results`. Stops backward traversal at either a Parameter node
// or a node that belongs to args
......
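Net effect of the new flag: with the default ignore_output_duplicates = true the previous behavior is preserved and a node feeding several external users can appear in the result more than once; passing false deduplicates. A hypothetical caller that wants each output node exactly once (subgraph_nodes is an assumed variable):

NodeVector outputs = ngraph::get_subgraph_outputs(subgraph_nodes,
                                                  NodeVector{}, // no exclusions
                                                  false,        // ignore_unused
                                                  false);       // ignore_output_duplicates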
@@ -114,7 +114,6 @@ set(SRC
op/update_slice.cpp
pass/cpu_assignment.cpp
pass/cpu_collapse_dims.cpp
pass/cpu_compiled_kernel_fusion.cpp
pass/cpu_fusion.cpp
pass/cpu_horizontal_fusion.cpp
pass/cpu_layout.cpp
......
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <algorithm>
#include <iostream>
#include <map>
#include <memory>
#include "ngraph/graph_util.hpp"
#include "ngraph/log.hpp"
#include "ngraph/op/abs.hpp"
#include "ngraph/op/add.hpp"
#include "ngraph/op/experimental/compiled_kernel.hpp"
#include "ngraph/op/get_output_element.hpp"
#include "ngraph/op/maximum.hpp"
#include "ngraph/op/minimum.hpp"
#include "ngraph/op/negative.hpp"
#include "ngraph/op/relu.hpp"
#include "ngraph/op/subtract.hpp"
#include "ngraph/op/util/binary_elementwise_arithmetic.hpp"
#include "ngraph/op/util/unary_elementwise_arithmetic.hpp"
#include "ngraph/runtime/cpu/pass/cpu_compiled_kernel_fusion.hpp"
#define TI(x) std::type_index(typeid(x))
using namespace ngraph;
struct LKGraph
{
LKGraph(const NodeVector& ns, const NodeVector& ins)
: m_inputs(ins)
, m_nodes(ns)
{
}
NodeVector m_inputs;
NodeVector m_nodes;
};
class CompiledKernelCollector
{
public:
CompiledKernelCollector(std::shared_ptr<Function> f, size_t min_nodes_to_fuse)
{
for (auto n : f->get_ordered_ops())
{
if (is_fusible(n))
{
auto arg_from_fusible_group = collect_fusible_args(n);
// create a new group
if (!arg_from_fusible_group)
{
m_heads.insert(std::make_pair(n, n));
m_graphs.insert(std::make_pair(n, LKGraph{{n}, n->get_arguments()}));
NGRAPH_DEBUG << "Created a new group for " << n->get_name();
log_group(n);
}
else
{
auto smallest_head = m_heads.at(arg_from_fusible_group);
auto& ckgraph = m_graphs.at(smallest_head);
ckgraph.m_nodes.push_back(n);
for (auto arg : n->get_arguments())
{
if (is_leaf(arg))
{
ckgraph.m_inputs.push_back(arg);
}
}
m_heads.insert(std::make_pair(n, smallest_head));
log_group(smallest_head);
}
}
}
prune_graphs(min_nodes_to_fuse);
}
const std::vector<std::shared_ptr<op::CompiledKernel>> get_compiled_kernels() const
{
std::vector<std::shared_ptr<op::CompiledKernel>> cks;
for (auto e : m_graphs)
{
auto& ckg = e.second;
NodeVector member_outputs = ngraph::get_subgraph_outputs(ckg.m_nodes, NodeVector{});
auto ck =
std::make_shared<op::CompiledKernel>(ckg.m_nodes, member_outputs, ckg.m_inputs);
cks.push_back(ck);
}
return cks;
}
private:
static bool is_fusible(std::shared_ptr<Node> n)
{
static const std::set<std::type_index> fusible_ops_set{TI(ngraph::op::Abs),
TI(ngraph::op::Add),
TI(ngraph::op::Negative),
TI(ngraph::op::Subtract),
TI(ngraph::op::Relu),
TI(ngraph::op::Minimum),
TI(ngraph::op::Maximum)};
const Node& node = *n;
return fusible_ops_set.count(TI(node)) != 0;
// return (std::dynamic_pointer_cast<op::util::BinaryElementwiseArithmetic>(n) ||
// std::dynamic_pointer_cast<op::util::UnaryElementwiseArithmetic>(n));
}
bool is_leaf(std::shared_ptr<Node> src) { return src->is_parameter() || src->is_constant(); }
void prune_graphs(size_t min_nodes_to_fuse)
{
for (auto it = m_graphs.begin(); it != m_graphs.end();)
{
if (it->second.m_nodes.size() < min_nodes_to_fuse)
{
it = m_graphs.erase(it);
}
else
{
it++;
}
}
}
void log_group(std::shared_ptr<Node> head) const
{
NGRAPH_DEBUG << "Group leader : " << head->get_name() << std::endl;
NGRAPH_DEBUG << "Group members : " << m_graphs.at(head).m_nodes << std::endl;
NGRAPH_DEBUG << "Inputs: " << m_graphs.at(head).m_inputs << std::endl;
}
std::shared_ptr<Node> collect_fusible_args(std::shared_ptr<Node> n)
{
std::shared_ptr<Node> arg_from_fusible_group;
for (auto arg : n->get_arguments())
{
// an argument is fusible and a part of some group
NGRAPH_DEBUG << "Considering " << arg->get_name();
if (m_heads.count(arg) != 0)
{
if (!arg_from_fusible_group)
{
arg_from_fusible_group = arg;
}
else
{
if (!is_leaf(arg) && m_heads.at(arg) != m_heads.at(arg_from_fusible_group))
{
return {nullptr};
}
}
}
}
return arg_from_fusible_group;
}
std::unordered_map<std::shared_ptr<Node>, LKGraph> m_graphs;
std::unordered_map<std::shared_ptr<Node>, std::shared_ptr<Node>> m_heads;
};
bool ngraph::runtime::cpu::pass::CPUCompiledKernelFusion::run_on_function(
std::shared_ptr<ngraph::Function> function)
{
CompiledKernelCollector ckc(function, m_min_kernel_size);
auto compiled_kernels = ckc.get_compiled_kernels();
for (auto ck : compiled_kernels)
{
auto outputs = ck->get_kernel_outputs();
std::set<std::shared_ptr<Node>> ck_nodes_set(ck->get_node_list().begin(),
ck->get_node_list().end());
for (size_t i = 0; i < outputs.size(); i++)
{
auto ith_goe = std::make_shared<ngraph::op::GetOutputElement>(ck, i);
auto& ith_output = ith_goe->get_outputs().at(0);
if (outputs.at(i)->get_outputs().size() > 1)
{
throw ngraph_error(
"support for fusing multi-output nodes in loop kernels isn't yet implemented");
}
// TODO: revisit when we need support for multi-output nodes
auto& orig_output = outputs.at(i)->get_outputs().at(0);
// this is needed since replace_output modifies orig_output.get_inputs()
std::set<ngraph::descriptor::Input*> inputs_copy{begin(orig_output.get_inputs()),
end(orig_output.get_inputs())};
for (auto input : inputs_copy)
{
// this user is NOT internal to this loop kernel
// so it needs to be replaced with corresponding ck's GOE
if (ck_nodes_set.count(input->get_node()) == 0)
{
input->replace_output(ith_output);
}
}
}
}
return !compiled_kernels.empty();
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include "ngraph/pass/pass.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace pass
{
class CPUCompiledKernelFusion : public ngraph::pass::FunctionPass
{
public:
CPUCompiledKernelFusion(size_t min_kernel_size = 2)
: FunctionPass()
, m_min_kernel_size(min_kernel_size)
{
}
bool run_on_function(std::shared_ptr<ngraph::Function> function) override;
protected:
size_t m_min_kernel_size;
};
}
}
}
}
@@ -172,6 +172,10 @@ set(MULTI_TEST_SRC
dynamic.in.cpp
)
if (NGRAPH_MLIR_ENABLE)
list(APPEND MULTI_TEST_SRC backend_mlir.in.cpp)
endif()
if(NGRAPH_DISTRIBUTED_ENABLE)
list(APPEND MULTI_TEST_SRC distributed.in.cpp)
endif()
......
@@ -67,36 +67,6 @@ NGRAPH_TEST(${BACKEND_NAME}, add)
(test::NDArray<float, 2>({{6, 8}, {10, 12}})).get_vector()));
}
NGRAPH_TEST(${BACKEND_NAME}, dot_add)
{
Shape shape_in1{2, 3};
Shape shape_in2{3, 3};
Shape shape_out{2, 3};
auto A = make_shared<op::Parameter>(element::f32, shape_in1);
auto B = make_shared<op::Parameter>(element::f32, shape_in2);
auto dot = make_shared<op::Dot>(A, B);
auto C = make_shared<op::Parameter>(element::f32, shape_out);
auto add = make_shared<op::Add>(dot, C);
auto f = make_shared<Function>(add, ParameterVector{A, B, C});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
shared_ptr<runtime::Tensor> a = backend->create_tensor(element::f32, shape_in1);
shared_ptr<runtime::Tensor> b = backend->create_tensor(element::f32, shape_in2);
shared_ptr<runtime::Tensor> c = backend->create_tensor(element::f32, shape_out);
shared_ptr<runtime::Tensor> result = backend->create_tensor(element::f32, shape_out);
copy_data(a, vector<float>{1.f, 2.f, 3.f, 4.f, 5.f, 6.f});
copy_data(b, vector<float>{1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f});
copy_data(c, vector<float>{5.f, 4.f, 3.f, 2.f, 1.f, 0.f});
auto handle = backend->compile(f);
handle->call_with_validate({result}, {a, b, c});
EXPECT_TRUE(test::all_close_f(read_vector<float>(result),
vector<float>{35.f, 40.f, 45.f, 68.f, 82.f, 96.f}));
}
NGRAPH_TEST(${BACKEND_NAME}, add_overload)
{
Shape shape{2, 2};
......