Commit 978691b4 authored by Diego Caballero, committed by nmostafa

[MLIR] Rename LoopKernel->CompiledKernel. Move it to experimental core ops (#12)

We want to use CompiledKernel for any target to delimit sub-graphs to be
compiled and executed with MLIR.
parent d9dd03ce
@@ -168,6 +168,8 @@ set (SRC
     op/experimental/quantized_dot.hpp
     op/experimental/quantized_dot_bias.cpp
     op/experimental/quantized_dot_bias.hpp
+    op/experimental/compiled_kernel.cpp
+    op/experimental/compiled_kernel.hpp
     op/experimental/transpose.cpp
     op/experimental/transpose.hpp
     op/experimental/layers/ctc_greedy_decoder.cpp
......
@@ -14,15 +14,16 @@
 // limitations under the License.
 //*****************************************************************************
-#include "ngraph/runtime/cpu/op/loop_kernel.hpp"
+#include "ngraph/op/experimental/compiled_kernel.hpp"
 #include "ngraph/graph_util.hpp"
 #include "ngraph/log.hpp"
 #include "ngraph/util.hpp"
 using namespace std;
 using namespace ngraph;
-shared_ptr<Node>
-    ngraph::runtime::cpu::op::LoopKernel::copy_with_new_args(const NodeVector& new_args) const
+shared_ptr<Node> ngraph::op::CompiledKernel::copy_with_new_args(const NodeVector& new_args) const
 {
     auto args = get_arguments();
     if (new_args.size() != args.size())
@@ -56,13 +57,13 @@ shared_ptr<Node>
         new_outputs.push_back(nm.at(o.get()));
     }
-    return std::make_shared<LoopKernel>(new_node_list, new_outputs, new_args);
+    return std::make_shared<CompiledKernel>(new_node_list, new_outputs, new_args);
 }
-ngraph::runtime::cpu::op::LoopKernel::LoopKernel(const NodeVector& node_list,
-                                                 const NodeVector& outputs,
-                                                 const NodeVector& args)
-    : Op("LoopKernel", check_single_output_args({args}))
+ngraph::op::CompiledKernel::CompiledKernel(const NodeVector& node_list,
+                                           const NodeVector& outputs,
+                                           const NodeVector& args)
+    : Op("CompiledKernel", check_single_output_args({args}))
     , m_node_list(node_list)
     , m_output_nodes(outputs)
 {
......
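Aside (not part of the diff): copy_with_new_args clones the kernel against replacement inputs, rebuilding the interior node list as shown above. A minimal, hypothetical call site, where `ck` is an existing CompiledKernel and `new_args` a NodeVector of replacement inputs:

    // Sketch only: the clone maps old interior nodes to new ones
    // internally (see the implementation above).
    std::shared_ptr<ngraph::Node> clone = ck->copy_with_new_args(new_args);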
@@ -21,30 +21,28 @@
 namespace ngraph
 {
-    namespace runtime
+    namespace op
     {
-        namespace cpu
+        /// \brief CompiledKernel represents a sub-graph that can be compiled and executed
+        /// independently.
+        ///
+        /// This op can be used to delimit sub-graphs with special compilation requirements
+        /// within a function. For example, we currently use it to delimit sub-graphs that
+        /// will be independently compiled and executed by the MLIR backend.
+        class CompiledKernel : public ngraph::op::Op
         {
-            namespace op
-            {
-                /// \brief LoopKernel represents graphs consisting
-                /// of arithmetic operations that can be executed in the same loop
-                class LoopKernel : public ngraph::op::Op
-                {
-                public:
-                    LoopKernel(const NodeVector& node_list,
-                               const NodeVector& outputs,
-                               const NodeVector& args);
-                    virtual std::shared_ptr<Node>
-                        copy_with_new_args(const NodeVector& new_args) const override;
+        public:
+            CompiledKernel(const NodeVector& node_list,
+                           const NodeVector& outputs,
+                           const NodeVector& args);
+            virtual std::shared_ptr<Node>
+                copy_with_new_args(const NodeVector& new_args) const override;
-                    const NodeVector& get_node_list() const { return m_node_list; }
-                    const NodeVector& get_kernel_outputs() const { return m_output_nodes; }
-                private:
-                    NodeVector m_node_list;
-                    NodeVector m_output_nodes;
-                };
-            }
-        }
+            const NodeVector& get_node_list() const { return m_node_list; }
+            const NodeVector& get_kernel_outputs() const { return m_output_nodes; }
+        private:
+            NodeVector m_node_list;
+            NodeVector m_output_nodes;
+        };
     }
 }
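Aside (not part of the diff): a minimal sketch of how a sub-graph is delimited with the op declared above. The parameter shapes and the Add/Abs ops are hypothetical; the three NodeVector arguments follow the constructor signature shown.

    #include "ngraph/ngraph.hpp"
    #include "ngraph/op/experimental/compiled_kernel.hpp"

    // Sketch only: wrap a two-op sub-graph so a backend (e.g. the MLIR
    // backend) can compile and execute it as a single unit.
    auto a = std::make_shared<ngraph::op::Parameter>(ngraph::element::f32, ngraph::Shape{4});
    auto b = std::make_shared<ngraph::op::Parameter>(ngraph::element::f32, ngraph::Shape{4});
    auto sum = std::make_shared<ngraph::op::Add>(a, b);
    auto abs = std::make_shared<ngraph::op::Abs>(sum);
    auto ck = std::make_shared<ngraph::op::CompiledKernel>(
        ngraph::NodeVector{sum, abs}, // node_list: ops inside the kernel
        ngraph::NodeVector{abs},      // outputs: results of the sub-graph
        ngraph::NodeVector{a, b});    // args: inputs coming from outside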
@@ -101,7 +101,6 @@ set(SRC
     op/group_conv_bias.cpp
     op/halide_op.cpp
     op/leaky_relu.cpp
-    op/loop_kernel.cpp
     op/lstm.cpp
     op/matmul_bias.cpp
     op/max_pool_with_indices.cpp
@@ -111,10 +110,10 @@ set(SRC
     op/update_slice.cpp
     pass/cpu_assignment.cpp
     pass/cpu_collapse_dims.cpp
+    pass/cpu_compiled_kernel_fusion.cpp
     pass/cpu_fusion.cpp
     pass/cpu_horizontal_fusion.cpp
     pass/cpu_layout.cpp
-    pass/cpu_loop_kernel_fusion.cpp
     pass/cpu_mat_fusion.cpp
     pass/cpu_memory_assignment.cpp
     pass/cpu_memory_optimization.cpp
@@ -137,8 +136,8 @@ endif()
 if (NGRAPH_HALIDE)
     set(SRC
         ${SRC}
+        builder/compiled_kernel.cpp
         builder/halide_op.cpp
-        builder/loop_kernel.cpp
         builder/halide_generators.cpp
         pass/halide_subgraph_extraction.cpp
     )
......
@@ -35,7 +35,7 @@
 #include "halide_generators.hpp"
 #include "ngraph/runtime/cpu/cpu_builder.hpp"
-#include "ngraph/runtime/cpu/op/loop_kernel.hpp"
+#include "ngraph/op/experimental/compiled_kernel.hpp"
 using namespace std;
 using namespace ngraph;
@@ -49,10 +49,10 @@ namespace ngraph
         namespace cpu
         {
             template <>
-            void Builder::BUILDER_DECL(ngraph::runtime::cpu::op::LoopKernel)
+            void Builder::BUILDER_DECL(ngraph::op::CompiledKernel)
             {
-                const ngraph::runtime::cpu::op::LoopKernel* hs =
-                    static_cast<const ngraph::runtime::cpu::op::LoopKernel*>(node);
+                const ngraph::op::CompiledKernel* hs =
+                    static_cast<const ngraph::op::CompiledKernel*>(node);
                 const auto& generators = ngraph::runtime::cpu::halide::get_halide_generators();
@@ -99,7 +99,7 @@ namespace ngraph
                 // a subgraph
                 if (op->get_outputs().size() > 1)
                 {
-                    throw ngraph_error("no multi-output ops in a LoopKernel");
+                    throw ngraph_error("no multi-output ops in a CompiledKernel");
                 }
                 halide_functions[op->get_output_tensor_ptr()->get_name()] =
                     generators.at(TI(*op))(inputs);
......
@@ -38,6 +38,7 @@
 #include "ngraph/op/divide.hpp"
 #include "ngraph/op/equal.hpp"
 #include "ngraph/op/exp.hpp"
+#include "ngraph/op/experimental/compiled_kernel.hpp"
 #include "ngraph/op/floor.hpp"
 #include "ngraph/op/get_output_element.hpp"
 #include "ngraph/op/greater.hpp"
@@ -105,7 +106,6 @@
 #include "ngraph/runtime/cpu/mlir/compiler.hpp"
 #include "ngraph/runtime/cpu/op/convert_layout.hpp"
 #include "ngraph/runtime/cpu/op/halide_op.hpp"
-#include "ngraph/runtime/cpu/op/loop_kernel.hpp"
 #include "ngraph/type/element_type.hpp"
 #include "ngraph/util.hpp"
@@ -444,8 +444,8 @@ namespace ngraph
                 {
                     static BuildOpMap build_dispatcher{
                         {TI(ngraph::op::Parameter), &runtime::cpu::Builder::nop},
-                        {TI(ngraph::runtime::cpu::op::LoopKernel),
-                         &runtime::cpu::Builder::build<ngraph::runtime::cpu::op::LoopKernel>},
+                        {TI(ngraph::op::CompiledKernel),
+                         &runtime::cpu::Builder::build<ngraph::op::CompiledKernel>},
                         {TI(ngraph::runtime::cpu::op::HalideOp),
                          &runtime::cpu::Builder::build<ngraph::runtime::cpu::op::HalideOp>}};
......
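Aside (not part of the diff): BuildOpMap is keyed by std::type_index, so after this change a CompiledKernel node resolves to Builder::build<ngraph::op::CompiledKernel>. A hedged lookup sketch with an assumed `node` pointer:

    // Sketch only: how the dispatcher above is consulted for a node.
    auto it = build_dispatcher.find(std::type_index(typeid(*node)));
    if (it != build_dispatcher.end())
    {
        // it->second is e.g. &Builder::build<ngraph::op::CompiledKernel>
        // for a CompiledKernel node.
    }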
@@ -117,13 +117,13 @@
 #include "ngraph/runtime/cpu/op/batch_mat_mul_transpose.hpp"
 #include "ngraph/runtime/cpu/op/batch_norm_relu.hpp"
 #include "ngraph/runtime/cpu/op/bounded_relu.hpp"
+#include "ngraph/op/experimental/compiled_kernel.hpp"
 #include "ngraph/runtime/cpu/op/conv_add.hpp"
 #include "ngraph/runtime/cpu/op/conv_relu.hpp"
 #include "ngraph/runtime/cpu/op/convert_layout.hpp"
 #include "ngraph/runtime/cpu/op/deconv.hpp"
 #include "ngraph/runtime/cpu/op/group_conv_bias.hpp"
 #include "ngraph/runtime/cpu/op/leaky_relu.hpp"
-#include "ngraph/runtime/cpu/op/loop_kernel.hpp"
 #include "ngraph/runtime/cpu/op/lstm.hpp"
 #include "ngraph/runtime/cpu/op/matmul_bias.hpp"
 #include "ngraph/runtime/cpu/op/max_pool_with_indices.hpp"
@@ -3818,7 +3818,7 @@ namespace ngraph
                 std::function<std::string(const std::vector<std::string>&)>>
                 inline_emitters = initialize_inline_emitters();
-            // GOEE doesn't see GOEs in subgraphs that are hidden inside LoopKernels
+            // GOEE doesn't see GOEs in subgraphs that are hidden inside CompiledKernels
             // we have to manually propagate the source output
             static const ngraph::descriptor::Output*
                 get_goe_input_output(ngraph::descriptor::Output* output)
@@ -3833,22 +3833,22 @@ namespace ngraph
             }
             template <>
-            void CPU_Emitter::EMITTER_DECL(ngraph::runtime::cpu::op::LoopKernel)
+            void CPU_Emitter::EMITTER_DECL(ngraph::op::CompiledKernel)
             {
                 std::unordered_map<const ngraph::descriptor::Output*, std::string>
                     loop_symbol_table;
                 // pre-fill symbol table with inputs
-                const ngraph::runtime::cpu::op::LoopKernel* clk =
-                    static_cast<const ngraph::runtime::cpu::op::LoopKernel*>(node);
+                const ngraph::op::CompiledKernel* ck =
+                    static_cast<const ngraph::op::CompiledKernel*>(node);
-                NodeVector output_nodes = clk->get_kernel_outputs();
-                NodeVector node_list = clk->get_node_list();
+                NodeVector output_nodes = ck->get_kernel_outputs();
+                NodeVector node_list = ck->get_node_list();
                 for (size_t i = 0; i < args.size(); i++)
                 {
                     std::string sname = std::string(args[i].get_name()) + "[i]";
-                    auto entry = std::make_pair(&clk->get_inputs().at(i).get_output(), sname);
+                    auto entry = std::make_pair(&ck->get_inputs().at(i).get_output(), sname);
                     loop_symbol_table.insert(entry);
                 }
......
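Aside (not part of the diff): the pre-fill loop above maps the source descriptor::Output of each kernel input to an element-wise access string, so code emitted for interior nodes can substitute kernel inputs directly. Illustrative names only:

    // Sketch only: input 0 of the CompiledKernel is read as "arg0[i]"
    // inside the single loop the emitter generates for the fused ops.
    std::unordered_map<const ngraph::descriptor::Output*, std::string> symbols;
    symbols.insert({&ck->get_inputs().at(0).get_output(), "arg0[i]"});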
@@ -23,6 +23,7 @@
 #include "ngraph/log.hpp"
 #include "ngraph/op/abs.hpp"
 #include "ngraph/op/add.hpp"
+#include "ngraph/op/experimental/compiled_kernel.hpp"
 #include "ngraph/op/get_output_element.hpp"
 #include "ngraph/op/maximum.hpp"
 #include "ngraph/op/minimum.hpp"
@@ -31,8 +32,7 @@
 #include "ngraph/op/subtract.hpp"
 #include "ngraph/op/util/binary_elementwise_arithmetic.hpp"
 #include "ngraph/op/util/unary_elementwise_arithmetic.hpp"
-#include "ngraph/runtime/cpu/op/loop_kernel.hpp"
-#include "ngraph/runtime/cpu/pass/cpu_loop_kernel_fusion.hpp"
+#include "ngraph/runtime/cpu/pass/cpu_compiled_kernel_fusion.hpp"
 #define TI(x) std::type_index(typeid(x))
@@ -49,10 +49,10 @@ struct LKGraph
     NodeVector m_nodes;
 };
-class LoopKernelCollector
+class CompiledKernelCollector
 {
 public:
-    LoopKernelCollector(std::shared_ptr<Function> f, size_t min_nodes_to_fuse)
+    CompiledKernelCollector(std::shared_ptr<Function> f, size_t min_nodes_to_fuse)
     {
         for (auto n : f->get_ordered_ops())
         {
@@ -70,13 +70,13 @@ public:
             else
             {
                 auto smallest_head = m_heads.at(arg_from_fusible_group);
-                auto& lkgraph = m_graphs.at(smallest_head);
-                lkgraph.m_nodes.push_back(n);
+                auto& ckgraph = m_graphs.at(smallest_head);
+                ckgraph.m_nodes.push_back(n);
                 for (auto arg : n->get_arguments())
                 {
                     if (is_leaf(arg))
                     {
-                        lkgraph.m_inputs.push_back(arg);
+                        ckgraph.m_inputs.push_back(arg);
                     }
                 }
                 m_heads.insert(std::make_pair(n, smallest_head));
@@ -88,18 +88,18 @@ public:
         prune_graphs(min_nodes_to_fuse);
     }
-    const std::vector<std::shared_ptr<runtime::cpu::op::LoopKernel>> get_loop_kernels() const
+    const std::vector<std::shared_ptr<op::CompiledKernel>> get_compiled_kernels() const
     {
-        std::vector<std::shared_ptr<runtime::cpu::op::LoopKernel>> lks;
+        std::vector<std::shared_ptr<op::CompiledKernel>> cks;
         for (auto e : m_graphs)
        {
-            auto& lkg = e.second;
-            NodeVector member_outputs = ngraph::get_subgraph_outputs(lkg.m_nodes, NodeVector{});
-            auto lk = std::make_shared<runtime::cpu::op::LoopKernel>(
-                lkg.m_nodes, member_outputs, lkg.m_inputs);
-            lks.push_back(lk);
+            auto& ckg = e.second;
+            NodeVector member_outputs = ngraph::get_subgraph_outputs(ckg.m_nodes, NodeVector{});
+            auto ck =
+                std::make_shared<op::CompiledKernel>(ckg.m_nodes, member_outputs, ckg.m_inputs);
+            cks.push_back(ck);
         }
-        return lks;
+        return cks;
     }
 private:
@@ -172,20 +172,20 @@ private:
     std::unordered_map<std::shared_ptr<Node>, std::shared_ptr<Node>> m_heads;
 };
-bool ngraph::runtime::cpu::pass::CPULoopKernelFusion::run_on_function(
+bool ngraph::runtime::cpu::pass::CPUCompiledKernelFusion::run_on_function(
     std::shared_ptr<ngraph::Function> function)
 {
-    LoopKernelCollector lkc(function, m_min_kernel_size);
-    auto loop_kernels = lkc.get_loop_kernels();
+    CompiledKernelCollector ckc(function, m_min_kernel_size);
+    auto compiled_kernels = ckc.get_compiled_kernels();
-    for (auto lk : loop_kernels)
+    for (auto ck : compiled_kernels)
     {
-        auto outputs = lk->get_kernel_outputs();
-        std::set<std::shared_ptr<Node>> lk_nodes_set(lk->get_node_list().begin(),
-                                                     lk->get_node_list().end());
+        auto outputs = ck->get_kernel_outputs();
+        std::set<std::shared_ptr<Node>> ck_nodes_set(ck->get_node_list().begin(),
+                                                     ck->get_node_list().end());
         for (size_t i = 0; i < outputs.size(); i++)
         {
-            auto ith_goe = std::make_shared<ngraph::op::GetOutputElement>(lk, i);
+            auto ith_goe = std::make_shared<ngraph::op::GetOutputElement>(ck, i);
             auto& ith_output = ith_goe->get_outputs().at(0);
             if (outputs.at(i)->get_outputs().size() > 1)
@@ -203,8 +203,8 @@ bool ngraph::runtime::cpu::pass::CPULoopKernelFusion::run_on_function(
             for (auto input : inputs_copy)
             {
                 // this user is NOT internal to this loop kernel
-                // so it needs to be replaced with corresponding lk's GOE
-                if (lk_nodes_set.count(input->get_node()) == 0)
+                // so it needs to be replaced with corresponding ck's GOE
+                if (ck_nodes_set.count(input->get_node()) == 0)
                 {
                     input->replace_output(ith_output);
                 }
@@ -212,5 +212,5 @@ bool ngraph::runtime::cpu::pass::CPULoopKernelFusion::run_on_function(
         }
     }
-    return !loop_kernels.empty();
+    return !compiled_kernels.empty();
 }
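Aside (not part of the diff): the rewiring above exposes result i of a fused CompiledKernel through a GetOutputElement and repoints only users outside the kernel's node set. A condensed, hypothetical view:

    // Sketch only: external consumers of a fused node now read result i
    // through a GOE on the CompiledKernel rather than the interior node.
    auto goe_i = std::make_shared<ngraph::op::GetOutputElement>(ck, i);
    // inputs of users outside ck_nodes_set are redirected to goe_i's output.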
@@ -26,10 +26,10 @@ namespace ngraph
        {
            namespace pass
            {
-                class CPULoopKernelFusion : public ngraph::pass::FunctionPass
+                class CPUCompiledKernelFusion : public ngraph::pass::FunctionPass
                {
                public:
-                    CPULoopKernelFusion(size_t min_kernel_size = 2)
+                    CPUCompiledKernelFusion(size_t min_kernel_size = 2)
                        : FunctionPass()
                        , m_min_kernel_size(min_kernel_size)
                    {
......
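Usage note (not part of the diff): a sketch of running the renamed pass, assuming nGraph's standard pass-manager API, which this commit does not change:

    #include "ngraph/pass/manager.hpp"
    #include "ngraph/runtime/cpu/pass/cpu_compiled_kernel_fusion.hpp"

    // Sketch only: fuse sub-graphs of at least two nodes (the default
    // min_kernel_size declared above) into CompiledKernel ops.
    ngraph::pass::Manager pass_manager;
    pass_manager.register_pass<ngraph::runtime::cpu::pass::CPUCompiledKernelFusion>(2);
    pass_manager.run_passes(func); // func: std::shared_ptr<ngraph::Function>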
This diff is collapsed.
This diff is collapsed.