Commit 978691b4 authored by Diego Caballero's avatar Diego Caballero Committed by nmostafa

[MLIR] Rename LoopKernel->CompiledKernel. Move it to experimental core ops (#12)

We want to use CompiledKernel for any target to delimit sub-graphs to be
compiled and executed with MLIR.
parent d9dd03ce
...@@ -168,6 +168,8 @@ set (SRC ...@@ -168,6 +168,8 @@ set (SRC
op/experimental/quantized_dot.hpp op/experimental/quantized_dot.hpp
op/experimental/quantized_dot_bias.cpp op/experimental/quantized_dot_bias.cpp
op/experimental/quantized_dot_bias.hpp op/experimental/quantized_dot_bias.hpp
op/experimental/compiled_kernel.cpp
op/experimental/compiled_kernel.hpp
op/experimental/transpose.cpp op/experimental/transpose.cpp
op/experimental/transpose.hpp op/experimental/transpose.hpp
op/experimental/layers/ctc_greedy_decoder.cpp op/experimental/layers/ctc_greedy_decoder.cpp
......
...@@ -14,15 +14,16 @@ ...@@ -14,15 +14,16 @@
// limitations under the License. // limitations under the License.
//***************************************************************************** //*****************************************************************************
#include "ngraph/runtime/cpu/op/loop_kernel.hpp" #include "ngraph/op/experimental/compiled_kernel.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/log.hpp" #include "ngraph/log.hpp"
#include "ngraph/util.hpp" #include "ngraph/util.hpp"
using namespace std; using namespace std;
using namespace ngraph; using namespace ngraph;
shared_ptr<Node> shared_ptr<Node> ngraph::op::CompiledKernel::copy_with_new_args(const NodeVector& new_args) const
ngraph::runtime::cpu::op::LoopKernel::copy_with_new_args(const NodeVector& new_args) const
{ {
auto args = get_arguments(); auto args = get_arguments();
if (new_args.size() != args.size()) if (new_args.size() != args.size())
...@@ -56,13 +57,13 @@ shared_ptr<Node> ...@@ -56,13 +57,13 @@ shared_ptr<Node>
new_outputs.push_back(nm.at(o.get())); new_outputs.push_back(nm.at(o.get()));
} }
return std::make_shared<LoopKernel>(new_node_list, new_outputs, new_args); return std::make_shared<CompiledKernel>(new_node_list, new_outputs, new_args);
} }
ngraph::runtime::cpu::op::LoopKernel::LoopKernel(const NodeVector& node_list, ngraph::op::CompiledKernel::CompiledKernel(const NodeVector& node_list,
const NodeVector& outputs, const NodeVector& outputs,
const NodeVector& args) const NodeVector& args)
: Op("LoopKernel", check_single_output_args({args})) : Op("CompiledKernel", check_single_output_args({args}))
, m_node_list(node_list) , m_node_list(node_list)
, m_output_nodes(outputs) , m_output_nodes(outputs)
{ {
......
...@@ -21,30 +21,28 @@ ...@@ -21,30 +21,28 @@
namespace ngraph namespace ngraph
{ {
namespace runtime namespace op
{ {
namespace cpu /// \brief CompiledKernel represents a sub-graph that can be compiled and executed
/// independently.
///
/// This op can be used to delimit sub-graphs with special compilation requirements
/// within a function. For example, we currently use it to delimit sub-graphs that will be
/// independently compiled and executed by MLIR backend.
class CompiledKernel : public ngraph::op::Op
{ {
namespace op public:
{ CompiledKernel(const NodeVector& node_list,
/// \brief LoopKernel represents graphs consisting const NodeVector& outputs,
/// of arithmetic operations that can be executed in the same loop const NodeVector& args);
class LoopKernel : public ngraph::op::Op virtual std::shared_ptr<Node>
{ copy_with_new_args(const NodeVector& new_args) const override;
public:
LoopKernel(const NodeVector& node_list,
const NodeVector& outputs,
const NodeVector& args);
virtual std::shared_ptr<Node>
copy_with_new_args(const NodeVector& new_args) const override;
const NodeVector& get_node_list() const { return m_node_list; } const NodeVector& get_node_list() const { return m_node_list; }
const NodeVector& get_kernel_outputs() const { return m_output_nodes; } const NodeVector& get_kernel_outputs() const { return m_output_nodes; }
private: private:
NodeVector m_node_list; NodeVector m_node_list;
NodeVector m_output_nodes; NodeVector m_output_nodes;
}; };
}
}
} }
} }
...@@ -101,7 +101,6 @@ set(SRC ...@@ -101,7 +101,6 @@ set(SRC
op/group_conv_bias.cpp op/group_conv_bias.cpp
op/halide_op.cpp op/halide_op.cpp
op/leaky_relu.cpp op/leaky_relu.cpp
op/loop_kernel.cpp
op/lstm.cpp op/lstm.cpp
op/matmul_bias.cpp op/matmul_bias.cpp
op/max_pool_with_indices.cpp op/max_pool_with_indices.cpp
...@@ -111,10 +110,10 @@ set(SRC ...@@ -111,10 +110,10 @@ set(SRC
op/update_slice.cpp op/update_slice.cpp
pass/cpu_assignment.cpp pass/cpu_assignment.cpp
pass/cpu_collapse_dims.cpp pass/cpu_collapse_dims.cpp
pass/cpu_compiled_kernel_fusion.cpp
pass/cpu_fusion.cpp pass/cpu_fusion.cpp
pass/cpu_horizontal_fusion.cpp pass/cpu_horizontal_fusion.cpp
pass/cpu_layout.cpp pass/cpu_layout.cpp
pass/cpu_loop_kernel_fusion.cpp
pass/cpu_mat_fusion.cpp pass/cpu_mat_fusion.cpp
pass/cpu_memory_assignment.cpp pass/cpu_memory_assignment.cpp
pass/cpu_memory_optimization.cpp pass/cpu_memory_optimization.cpp
...@@ -137,8 +136,8 @@ endif() ...@@ -137,8 +136,8 @@ endif()
if (NGRAPH_HALIDE) if (NGRAPH_HALIDE)
set(SRC set(SRC
${SRC} ${SRC}
builder/compiled_kernel.cpp
builder/halide_op.cpp builder/halide_op.cpp
builder/loop_kernel.cpp
builder/halide_generators.cpp builder/halide_generators.cpp
pass/halide_subgraph_extraction.cpp pass/halide_subgraph_extraction.cpp
) )
......
...@@ -35,7 +35,7 @@ ...@@ -35,7 +35,7 @@
#include "halide_generators.hpp" #include "halide_generators.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp" #include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/op/loop_kernel.hpp" #include "ngraph/runtime/cpu/op/compiled_kernel.hpp"
using namespace std; using namespace std;
using namespace ngraph; using namespace ngraph;
...@@ -49,10 +49,10 @@ namespace ngraph ...@@ -49,10 +49,10 @@ namespace ngraph
namespace cpu namespace cpu
{ {
template <> template <>
void Builder::BUILDER_DECL(ngraph::runtime::cpu::op::LoopKernel) void Builder::BUILDER_DECL(ngraph::op::CompiledKernel)
{ {
const ngraph::runtime::cpu::op::LoopKernel* hs = const ngraph::op::CompiledKernel* hs =
static_cast<const ngraph::runtime::cpu::op::LoopKernel*>(node); static_cast<const ngraph::op::CompiledKernel*>(node);
const auto& generators = ngraph::runtime::cpu::halide::get_halide_generators(); const auto& generators = ngraph::runtime::cpu::halide::get_halide_generators();
...@@ -99,7 +99,7 @@ namespace ngraph ...@@ -99,7 +99,7 @@ namespace ngraph
//a subgraph //a subgraph
if (op->get_outputs().size() > 1) if (op->get_outputs().size() > 1)
{ {
throw ngraph_error("no multi-output ops in a LoopKernel"); throw ngraph_error("no multi-output ops in a CompiledKernel");
} }
halide_functions[op->get_output_tensor_ptr()->get_name()] = halide_functions[op->get_output_tensor_ptr()->get_name()] =
generators.at(TI(*op))(inputs); generators.at(TI(*op))(inputs);
......
...@@ -38,6 +38,7 @@ ...@@ -38,6 +38,7 @@
#include "ngraph/op/divide.hpp" #include "ngraph/op/divide.hpp"
#include "ngraph/op/equal.hpp" #include "ngraph/op/equal.hpp"
#include "ngraph/op/exp.hpp" #include "ngraph/op/exp.hpp"
#include "ngraph/op/experimental/compiled_kernel.hpp"
#include "ngraph/op/floor.hpp" #include "ngraph/op/floor.hpp"
#include "ngraph/op/get_output_element.hpp" #include "ngraph/op/get_output_element.hpp"
#include "ngraph/op/greater.hpp" #include "ngraph/op/greater.hpp"
...@@ -105,7 +106,6 @@ ...@@ -105,7 +106,6 @@
#include "ngraph/runtime/cpu/mlir/compiler.hpp" #include "ngraph/runtime/cpu/mlir/compiler.hpp"
#include "ngraph/runtime/cpu/op/convert_layout.hpp" #include "ngraph/runtime/cpu/op/convert_layout.hpp"
#include "ngraph/runtime/cpu/op/halide_op.hpp" #include "ngraph/runtime/cpu/op/halide_op.hpp"
#include "ngraph/runtime/cpu/op/loop_kernel.hpp"
#include "ngraph/type/element_type.hpp" #include "ngraph/type/element_type.hpp"
#include "ngraph/util.hpp" #include "ngraph/util.hpp"
...@@ -444,8 +444,8 @@ namespace ngraph ...@@ -444,8 +444,8 @@ namespace ngraph
{ {
static BuildOpMap build_dispatcher{ static BuildOpMap build_dispatcher{
{TI(ngraph::op::Parameter), &runtime::cpu::Builder::nop}, {TI(ngraph::op::Parameter), &runtime::cpu::Builder::nop},
{TI(ngraph::runtime::cpu::op::LoopKernel), {TI(ngraph::op::CompiledKernel),
&runtime::cpu::Builder::build<ngraph::runtime::cpu::op::LoopKernel>}, &runtime::cpu::Builder::build<ngraph::op::CompiledKernel>},
{TI(ngraph::runtime::cpu::op::HalideOp), {TI(ngraph::runtime::cpu::op::HalideOp),
&runtime::cpu::Builder::build<ngraph::runtime::cpu::op::HalideOp>}}; &runtime::cpu::Builder::build<ngraph::runtime::cpu::op::HalideOp>}};
......
...@@ -117,13 +117,13 @@ ...@@ -117,13 +117,13 @@
#include "ngraph/runtime/cpu/op/batch_mat_mul_transpose.hpp" #include "ngraph/runtime/cpu/op/batch_mat_mul_transpose.hpp"
#include "ngraph/runtime/cpu/op/batch_norm_relu.hpp" #include "ngraph/runtime/cpu/op/batch_norm_relu.hpp"
#include "ngraph/runtime/cpu/op/bounded_relu.hpp" #include "ngraph/runtime/cpu/op/bounded_relu.hpp"
#include "ngraph/op/experimental/compiled_kernel.hpp"
#include "ngraph/runtime/cpu/op/conv_add.hpp" #include "ngraph/runtime/cpu/op/conv_add.hpp"
#include "ngraph/runtime/cpu/op/conv_relu.hpp" #include "ngraph/runtime/cpu/op/conv_relu.hpp"
#include "ngraph/runtime/cpu/op/convert_layout.hpp" #include "ngraph/runtime/cpu/op/convert_layout.hpp"
#include "ngraph/runtime/cpu/op/deconv.hpp" #include "ngraph/runtime/cpu/op/deconv.hpp"
#include "ngraph/runtime/cpu/op/group_conv_bias.hpp" #include "ngraph/runtime/cpu/op/group_conv_bias.hpp"
#include "ngraph/runtime/cpu/op/leaky_relu.hpp" #include "ngraph/runtime/cpu/op/leaky_relu.hpp"
#include "ngraph/runtime/cpu/op/loop_kernel.hpp"
#include "ngraph/runtime/cpu/op/lstm.hpp" #include "ngraph/runtime/cpu/op/lstm.hpp"
#include "ngraph/runtime/cpu/op/matmul_bias.hpp" #include "ngraph/runtime/cpu/op/matmul_bias.hpp"
#include "ngraph/runtime/cpu/op/max_pool_with_indices.hpp" #include "ngraph/runtime/cpu/op/max_pool_with_indices.hpp"
...@@ -3818,7 +3818,7 @@ namespace ngraph ...@@ -3818,7 +3818,7 @@ namespace ngraph
std::function<std::string(const std::vector<std::string>&)>> std::function<std::string(const std::vector<std::string>&)>>
inline_emitters = initialize_inline_emitters(); inline_emitters = initialize_inline_emitters();
// GOEE doesn't see GOEs in subgraphs that are hidden inside LoopKernels // GOEE doesn't see GOEs in subgraphs that are hidden inside CompiledKernels
// we have to manually propagate the source output // we have to manually propagate the source output
static const ngraph::descriptor::Output* static const ngraph::descriptor::Output*
get_goe_input_output(ngraph::descriptor::Output* output) get_goe_input_output(ngraph::descriptor::Output* output)
...@@ -3833,22 +3833,22 @@ namespace ngraph ...@@ -3833,22 +3833,22 @@ namespace ngraph
} }
template <> template <>
void CPU_Emitter::EMITTER_DECL(ngraph::runtime::cpu::op::LoopKernel) void CPU_Emitter::EMITTER_DECL(ngraph::op::CompiledKernel)
{ {
std::unordered_map<const ngraph::descriptor::Output*, std::string> std::unordered_map<const ngraph::descriptor::Output*, std::string>
loop_symbol_table; loop_symbol_table;
// pre-fill symbol table with inputs // pre-fill symbol table with inputs
const ngraph::runtime::cpu::op::LoopKernel* clk = const ngraph::op::CompiledKernel* ck =
static_cast<const ngraph::runtime::cpu::op::LoopKernel*>(node); static_cast<const ngraph::op::CompiledKernel*>(node);
NodeVector output_nodes = clk->get_kernel_outputs(); NodeVector output_nodes = ck->get_kernel_outputs();
NodeVector node_list = clk->get_node_list(); NodeVector node_list = ck->get_node_list();
for (size_t i = 0; i < args.size(); i++) for (size_t i = 0; i < args.size(); i++)
{ {
std::string sname = std::string(args[i].get_name()) + "[i]"; std::string sname = std::string(args[i].get_name()) + "[i]";
auto entry = std::make_pair(&clk->get_inputs().at(i).get_output(), sname); auto entry = std::make_pair(&ck->get_inputs().at(i).get_output(), sname);
loop_symbol_table.insert(entry); loop_symbol_table.insert(entry);
} }
......
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#include "ngraph/log.hpp" #include "ngraph/log.hpp"
#include "ngraph/op/abs.hpp" #include "ngraph/op/abs.hpp"
#include "ngraph/op/add.hpp" #include "ngraph/op/add.hpp"
#include "ngraph/op/experimental/compiled_kernel.hpp"
#include "ngraph/op/get_output_element.hpp" #include "ngraph/op/get_output_element.hpp"
#include "ngraph/op/maximum.hpp" #include "ngraph/op/maximum.hpp"
#include "ngraph/op/minimum.hpp" #include "ngraph/op/minimum.hpp"
...@@ -31,8 +32,7 @@ ...@@ -31,8 +32,7 @@
#include "ngraph/op/subtract.hpp" #include "ngraph/op/subtract.hpp"
#include "ngraph/op/util/binary_elementwise_arithmetic.hpp" #include "ngraph/op/util/binary_elementwise_arithmetic.hpp"
#include "ngraph/op/util/unary_elementwise_arithmetic.hpp" #include "ngraph/op/util/unary_elementwise_arithmetic.hpp"
#include "ngraph/runtime/cpu/op/loop_kernel.hpp" #include "ngraph/runtime/cpu/pass/cpu_compiled_kernel_fusion.hpp"
#include "ngraph/runtime/cpu/pass/cpu_loop_kernel_fusion.hpp"
#define TI(x) std::type_index(typeid(x)) #define TI(x) std::type_index(typeid(x))
...@@ -49,10 +49,10 @@ struct LKGraph ...@@ -49,10 +49,10 @@ struct LKGraph
NodeVector m_nodes; NodeVector m_nodes;
}; };
class LoopKernelCollector class CompiledKernelCollector
{ {
public: public:
LoopKernelCollector(std::shared_ptr<Function> f, size_t min_nodes_to_fuse) CompiledKernelCollector(std::shared_ptr<Function> f, size_t min_nodes_to_fuse)
{ {
for (auto n : f->get_ordered_ops()) for (auto n : f->get_ordered_ops())
{ {
...@@ -70,13 +70,13 @@ public: ...@@ -70,13 +70,13 @@ public:
else else
{ {
auto smallest_head = m_heads.at(arg_from_fusible_group); auto smallest_head = m_heads.at(arg_from_fusible_group);
auto& lkgraph = m_graphs.at(smallest_head); auto& ckgraph = m_graphs.at(smallest_head);
lkgraph.m_nodes.push_back(n); ckgraph.m_nodes.push_back(n);
for (auto arg : n->get_arguments()) for (auto arg : n->get_arguments())
{ {
if (is_leaf(arg)) if (is_leaf(arg))
{ {
lkgraph.m_inputs.push_back(arg); ckgraph.m_inputs.push_back(arg);
} }
} }
m_heads.insert(std::make_pair(n, smallest_head)); m_heads.insert(std::make_pair(n, smallest_head));
...@@ -88,18 +88,18 @@ public: ...@@ -88,18 +88,18 @@ public:
prune_graphs(min_nodes_to_fuse); prune_graphs(min_nodes_to_fuse);
} }
const std::vector<std::shared_ptr<runtime::cpu::op::LoopKernel>> get_loop_kernels() const const std::vector<std::shared_ptr<op::CompiledKernel>> get_compiled_kernels() const
{ {
std::vector<std::shared_ptr<runtime::cpu::op::LoopKernel>> lks; std::vector<std::shared_ptr<op::CompiledKernel>> cks;
for (auto e : m_graphs) for (auto e : m_graphs)
{ {
auto& lkg = e.second; auto& ckg = e.second;
NodeVector member_outputs = ngraph::get_subgraph_outputs(lkg.m_nodes, NodeVector{}); NodeVector member_outputs = ngraph::get_subgraph_outputs(ckg.m_nodes, NodeVector{});
auto lk = std::make_shared<runtime::cpu::op::LoopKernel>( auto ck =
lkg.m_nodes, member_outputs, lkg.m_inputs); std::make_shared<op::CompiledKernel>(ckg.m_nodes, member_outputs, ckg.m_inputs);
lks.push_back(lk); cks.push_back(ck);
} }
return lks; return cks;
} }
private: private:
...@@ -172,20 +172,20 @@ private: ...@@ -172,20 +172,20 @@ private:
std::unordered_map<std::shared_ptr<Node>, std::shared_ptr<Node>> m_heads; std::unordered_map<std::shared_ptr<Node>, std::shared_ptr<Node>> m_heads;
}; };
bool ngraph::runtime::cpu::pass::CPULoopKernelFusion::run_on_function( bool ngraph::runtime::cpu::pass::CPUCompiledKernelFusion::run_on_function(
std::shared_ptr<ngraph::Function> function) std::shared_ptr<ngraph::Function> function)
{ {
LoopKernelCollector lkc(function, m_min_kernel_size); CompiledKernelCollector ckc(function, m_min_kernel_size);
auto loop_kernels = lkc.get_loop_kernels(); auto compiled_kernels = ckc.get_compiled_kernels();
for (auto lk : loop_kernels) for (auto ck : compiled_kernels)
{ {
auto outputs = lk->get_kernel_outputs(); auto outputs = ck->get_kernel_outputs();
std::set<std::shared_ptr<Node>> lk_nodes_set(lk->get_node_list().begin(), std::set<std::shared_ptr<Node>> ck_nodes_set(ck->get_node_list().begin(),
lk->get_node_list().end()); ck->get_node_list().end());
for (size_t i = 0; i < outputs.size(); i++) for (size_t i = 0; i < outputs.size(); i++)
{ {
auto ith_goe = std::make_shared<ngraph::op::GetOutputElement>(lk, i); auto ith_goe = std::make_shared<ngraph::op::GetOutputElement>(ck, i);
auto& ith_output = ith_goe->get_outputs().at(0); auto& ith_output = ith_goe->get_outputs().at(0);
if (outputs.at(i)->get_outputs().size() > 1) if (outputs.at(i)->get_outputs().size() > 1)
...@@ -203,8 +203,8 @@ bool ngraph::runtime::cpu::pass::CPULoopKernelFusion::run_on_function( ...@@ -203,8 +203,8 @@ bool ngraph::runtime::cpu::pass::CPULoopKernelFusion::run_on_function(
for (auto input : inputs_copy) for (auto input : inputs_copy)
{ {
// this user is NOT internal to this loop kernel // this user is NOT internal to this loop kernel
// so it needs to be replaced with corresponding lk's GOE // so it needs to be replaced with corresponding ck's GOE
if (lk_nodes_set.count(input->get_node()) == 0) if (ck_nodes_set.count(input->get_node()) == 0)
{ {
input->replace_output(ith_output); input->replace_output(ith_output);
} }
...@@ -212,5 +212,5 @@ bool ngraph::runtime::cpu::pass::CPULoopKernelFusion::run_on_function( ...@@ -212,5 +212,5 @@ bool ngraph::runtime::cpu::pass::CPULoopKernelFusion::run_on_function(
} }
} }
return !loop_kernels.empty(); return !compiled_kernels.empty();
} }
...@@ -26,10 +26,10 @@ namespace ngraph ...@@ -26,10 +26,10 @@ namespace ngraph
{ {
namespace pass namespace pass
{ {
class CPULoopKernelFusion : public ngraph::pass::FunctionPass class CPUCompiledKernelFusion : public ngraph::pass::FunctionPass
{ {
public: public:
CPULoopKernelFusion(size_t min_kernel_size = 2) CPUCompiledKernelFusion(size_t min_kernel_size = 2)
: FunctionPass() : FunctionPass()
, m_min_kernel_size(min_kernel_size) , m_min_kernel_size(min_kernel_size)
{ {
......
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment