Commit 978691b4 authored by Diego Caballero, committed by nmostafa

[MLIR] Rename LoopKernel->CompiledKernel. Move it to experimental core ops (#12)

We want to use CompiledKernel for any target to delimit sub-graphs to be
compiled and executed with MLIR.
parent d9dd03ce
@@ -168,6 +168,8 @@ set (SRC
     op/experimental/quantized_dot.hpp
     op/experimental/quantized_dot_bias.cpp
     op/experimental/quantized_dot_bias.hpp
+    op/experimental/compiled_kernel.cpp
+    op/experimental/compiled_kernel.hpp
     op/experimental/transpose.cpp
     op/experimental/transpose.hpp
     op/experimental/layers/ctc_greedy_decoder.cpp
......
@@ -14,15 +14,16 @@
 // limitations under the License.
 //*****************************************************************************
-#include "ngraph/runtime/cpu/op/loop_kernel.hpp"
+#include "ngraph/op/experimental/compiled_kernel.hpp"
 #include "ngraph/graph_util.hpp"
 #include "ngraph/log.hpp"
 #include "ngraph/util.hpp"
 using namespace std;
 using namespace ngraph;
-shared_ptr<Node>
-    ngraph::runtime::cpu::op::LoopKernel::copy_with_new_args(const NodeVector& new_args) const
+shared_ptr<Node> ngraph::op::CompiledKernel::copy_with_new_args(const NodeVector& new_args) const
 {
     auto args = get_arguments();
     if (new_args.size() != args.size())
@@ -56,13 +57,13 @@ shared_ptr<Node>
         new_outputs.push_back(nm.at(o.get()));
     }
-    return std::make_shared<LoopKernel>(new_node_list, new_outputs, new_args);
+    return std::make_shared<CompiledKernel>(new_node_list, new_outputs, new_args);
 }
-ngraph::runtime::cpu::op::LoopKernel::LoopKernel(const NodeVector& node_list,
-                                                 const NodeVector& outputs,
-                                                 const NodeVector& args)
-    : Op("LoopKernel", check_single_output_args({args}))
+ngraph::op::CompiledKernel::CompiledKernel(const NodeVector& node_list,
+                                           const NodeVector& outputs,
+                                           const NodeVector& args)
+    : Op("CompiledKernel", check_single_output_args({args}))
     , m_node_list(node_list)
     , m_output_nodes(outputs)
 {
......
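Aside (not part of the diff): copy_with_new_args clones the kernel against replacement inputs, rebuilding the interior node list as shown above. A minimal, hypothetical call site, where `ck` is an existing CompiledKernel and `new_args` a NodeVector of replacement inputs:

    // Sketch only: the clone maps old interior nodes to new ones
    // internally (see the implementation above).
    std::shared_ptr<ngraph::Node> clone = ck->copy_with_new_args(new_args);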
@@ -21,30 +21,28 @@
 namespace ngraph
 {
-    namespace runtime
+    namespace op
     {
-        namespace cpu
+        /// \brief CompiledKernel represents a sub-graph that can be compiled and executed
+        /// independently.
+        ///
+        /// This op can be used to delimit sub-graphs with special compilation requirements
+        /// within a function. For example, we currently use it to delimit sub-graphs that
+        /// will be independently compiled and executed by the MLIR backend.
+        class CompiledKernel : public ngraph::op::Op
         {
-            namespace op
-            {
-                /// \brief LoopKernel represents graphs consisting
-                /// of arithmetic operations that can be executed in the same loop
-                class LoopKernel : public ngraph::op::Op
-                {
-                public:
-                    LoopKernel(const NodeVector& node_list,
-                               const NodeVector& outputs,
-                               const NodeVector& args);
-                    virtual std::shared_ptr<Node>
-                        copy_with_new_args(const NodeVector& new_args) const override;
+        public:
+            CompiledKernel(const NodeVector& node_list,
+                           const NodeVector& outputs,
+                           const NodeVector& args);
+            virtual std::shared_ptr<Node>
+                copy_with_new_args(const NodeVector& new_args) const override;
-                    const NodeVector& get_node_list() const { return m_node_list; }
-                    const NodeVector& get_kernel_outputs() const { return m_output_nodes; }
-                private:
-                    NodeVector m_node_list;
-                    NodeVector m_output_nodes;
-                };
-            }
-        }
+            const NodeVector& get_node_list() const { return m_node_list; }
+            const NodeVector& get_kernel_outputs() const { return m_output_nodes; }
+        private:
+            NodeVector m_node_list;
+            NodeVector m_output_nodes;
+        };
     }
 }
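Aside (not part of the diff): a minimal sketch of how a sub-graph is delimited with the op declared above. The parameter shapes and the Add/Abs ops are hypothetical; the three NodeVector arguments follow the constructor signature shown.

    #include "ngraph/ngraph.hpp"
    #include "ngraph/op/experimental/compiled_kernel.hpp"

    // Sketch only: wrap a two-op sub-graph so a backend (e.g. the MLIR
    // backend) can compile and execute it as a single unit.
    auto a = std::make_shared<ngraph::op::Parameter>(ngraph::element::f32, ngraph::Shape{4});
    auto b = std::make_shared<ngraph::op::Parameter>(ngraph::element::f32, ngraph::Shape{4});
    auto sum = std::make_shared<ngraph::op::Add>(a, b);
    auto abs = std::make_shared<ngraph::op::Abs>(sum);
    auto ck = std::make_shared<ngraph::op::CompiledKernel>(
        ngraph::NodeVector{sum, abs}, // node_list: ops inside the kernel
        ngraph::NodeVector{abs},      // outputs: results of the sub-graph
        ngraph::NodeVector{a, b});    // args: inputs coming from outside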
@@ -101,7 +101,6 @@ set(SRC
     op/group_conv_bias.cpp
     op/halide_op.cpp
     op/leaky_relu.cpp
-    op/loop_kernel.cpp
     op/lstm.cpp
     op/matmul_bias.cpp
     op/max_pool_with_indices.cpp
@@ -111,10 +110,10 @@ set(SRC
     op/update_slice.cpp
     pass/cpu_assignment.cpp
     pass/cpu_collapse_dims.cpp
+    pass/cpu_compiled_kernel_fusion.cpp
     pass/cpu_fusion.cpp
     pass/cpu_horizontal_fusion.cpp
     pass/cpu_layout.cpp
-    pass/cpu_loop_kernel_fusion.cpp
     pass/cpu_mat_fusion.cpp
     pass/cpu_memory_assignment.cpp
     pass/cpu_memory_optimization.cpp
@@ -137,8 +136,8 @@ endif()
 if (NGRAPH_HALIDE)
     set(SRC
         ${SRC}
+        builder/compiled_kernel.cpp
         builder/halide_op.cpp
-        builder/loop_kernel.cpp
         builder/halide_generators.cpp
         pass/halide_subgraph_extraction.cpp
     )
......
@@ -35,7 +35,7 @@
 #include "halide_generators.hpp"
 #include "ngraph/runtime/cpu/cpu_builder.hpp"
-#include "ngraph/runtime/cpu/op/loop_kernel.hpp"
+#include "ngraph/op/experimental/compiled_kernel.hpp"
 using namespace std;
 using namespace ngraph;
@@ -49,10 +49,10 @@ namespace ngraph
         namespace cpu
         {
             template <>
-            void Builder::BUILDER_DECL(ngraph::runtime::cpu::op::LoopKernel)
+            void Builder::BUILDER_DECL(ngraph::op::CompiledKernel)
             {
-                const ngraph::runtime::cpu::op::LoopKernel* hs =
-                    static_cast<const ngraph::runtime::cpu::op::LoopKernel*>(node);
+                const ngraph::op::CompiledKernel* hs =
+                    static_cast<const ngraph::op::CompiledKernel*>(node);
                 const auto& generators = ngraph::runtime::cpu::halide::get_halide_generators();
@@ -99,7 +99,7 @@ namespace ngraph
                 // a subgraph
                 if (op->get_outputs().size() > 1)
                 {
-                    throw ngraph_error("no multi-output ops in a LoopKernel");
+                    throw ngraph_error("no multi-output ops in a CompiledKernel");
                 }
                 halide_functions[op->get_output_tensor_ptr()->get_name()] =
                     generators.at(TI(*op))(inputs);
......
@@ -38,6 +38,7 @@
 #include "ngraph/op/divide.hpp"
 #include "ngraph/op/equal.hpp"
 #include "ngraph/op/exp.hpp"
+#include "ngraph/op/experimental/compiled_kernel.hpp"
 #include "ngraph/op/floor.hpp"
 #include "ngraph/op/get_output_element.hpp"
 #include "ngraph/op/greater.hpp"
@@ -105,7 +106,6 @@
 #include "ngraph/runtime/cpu/mlir/compiler.hpp"
 #include "ngraph/runtime/cpu/op/convert_layout.hpp"
 #include "ngraph/runtime/cpu/op/halide_op.hpp"
-#include "ngraph/runtime/cpu/op/loop_kernel.hpp"
 #include "ngraph/type/element_type.hpp"
 #include "ngraph/util.hpp"
@@ -444,8 +444,8 @@ namespace ngraph
                 {
                     static BuildOpMap build_dispatcher{
                         {TI(ngraph::op::Parameter), &runtime::cpu::Builder::nop},
-                        {TI(ngraph::runtime::cpu::op::LoopKernel),
-                         &runtime::cpu::Builder::build<ngraph::runtime::cpu::op::LoopKernel>},
+                        {TI(ngraph::op::CompiledKernel),
+                         &runtime::cpu::Builder::build<ngraph::op::CompiledKernel>},
                         {TI(ngraph::runtime::cpu::op::HalideOp),
                          &runtime::cpu::Builder::build<ngraph::runtime::cpu::op::HalideOp>}};
......
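Aside (not part of the diff): BuildOpMap is keyed by std::type_index, so after this change a CompiledKernel node resolves to Builder::build<ngraph::op::CompiledKernel>. A hedged lookup sketch with an assumed `node` pointer:

    // Sketch only: how the dispatcher above is consulted for a node.
    auto it = build_dispatcher.find(std::type_index(typeid(*node)));
    if (it != build_dispatcher.end())
    {
        // it->second is e.g. &Builder::build<ngraph::op::CompiledKernel>
        // for a CompiledKernel node.
    }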
@@ -117,13 +117,13 @@
 #include "ngraph/runtime/cpu/op/batch_mat_mul_transpose.hpp"
 #include "ngraph/runtime/cpu/op/batch_norm_relu.hpp"
 #include "ngraph/runtime/cpu/op/bounded_relu.hpp"
+#include "ngraph/op/experimental/compiled_kernel.hpp"
 #include "ngraph/runtime/cpu/op/conv_add.hpp"
 #include "ngraph/runtime/cpu/op/conv_relu.hpp"
 #include "ngraph/runtime/cpu/op/convert_layout.hpp"
 #include "ngraph/runtime/cpu/op/deconv.hpp"
 #include "ngraph/runtime/cpu/op/group_conv_bias.hpp"
 #include "ngraph/runtime/cpu/op/leaky_relu.hpp"
-#include "ngraph/runtime/cpu/op/loop_kernel.hpp"
 #include "ngraph/runtime/cpu/op/lstm.hpp"
 #include "ngraph/runtime/cpu/op/matmul_bias.hpp"
 #include "ngraph/runtime/cpu/op/max_pool_with_indices.hpp"
@@ -3818,7 +3818,7 @@ namespace ngraph
                 std::function<std::string(const std::vector<std::string>&)>>
                 inline_emitters = initialize_inline_emitters();
-            // GOEE doesn't see GOEs in subgraphs that are hidden inside LoopKernels
+            // GOEE doesn't see GOEs in subgraphs that are hidden inside CompiledKernels
             // we have to manually propagate the source output
             static const ngraph::descriptor::Output*
                 get_goe_input_output(ngraph::descriptor::Output* output)
@@ -3833,22 +3833,22 @@ namespace ngraph
             }
             template <>
-            void CPU_Emitter::EMITTER_DECL(ngraph::runtime::cpu::op::LoopKernel)
+            void CPU_Emitter::EMITTER_DECL(ngraph::op::CompiledKernel)
             {
                 std::unordered_map<const ngraph::descriptor::Output*, std::string>
                     loop_symbol_table;
                 // pre-fill symbol table with inputs
-                const ngraph::runtime::cpu::op::LoopKernel* clk =
-                    static_cast<const ngraph::runtime::cpu::op::LoopKernel*>(node);
+                const ngraph::op::CompiledKernel* ck =
+                    static_cast<const ngraph::op::CompiledKernel*>(node);
-                NodeVector output_nodes = clk->get_kernel_outputs();
-                NodeVector node_list = clk->get_node_list();
+                NodeVector output_nodes = ck->get_kernel_outputs();
+                NodeVector node_list = ck->get_node_list();
                 for (size_t i = 0; i < args.size(); i++)
                 {
                     std::string sname = std::string(args[i].get_name()) + "[i]";
-                    auto entry = std::make_pair(&clk->get_inputs().at(i).get_output(), sname);
+                    auto entry = std::make_pair(&ck->get_inputs().at(i).get_output(), sname);
                     loop_symbol_table.insert(entry);
                 }
......
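Aside (not part of the diff): the pre-fill loop above maps the source descriptor::Output of each kernel input to an element-wise access string, so code emitted for interior nodes can substitute kernel inputs directly. Illustrative names only:

    // Sketch only: input 0 of the CompiledKernel is read as "arg0[i]"
    // inside the single loop the emitter generates for the fused ops.
    std::unordered_map<const ngraph::descriptor::Output*, std::string> symbols;
    symbols.insert({&ck->get_inputs().at(0).get_output(), "arg0[i]"});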
@@ -23,6 +23,7 @@
 #include "ngraph/log.hpp"
 #include "ngraph/op/abs.hpp"
 #include "ngraph/op/add.hpp"
+#include "ngraph/op/experimental/compiled_kernel.hpp"
 #include "ngraph/op/get_output_element.hpp"
 #include "ngraph/op/maximum.hpp"
 #include "ngraph/op/minimum.hpp"
@@ -31,8 +32,7 @@
 #include "ngraph/op/subtract.hpp"
 #include "ngraph/op/util/binary_elementwise_arithmetic.hpp"
 #include "ngraph/op/util/unary_elementwise_arithmetic.hpp"
-#include "ngraph/runtime/cpu/op/loop_kernel.hpp"
-#include "ngraph/runtime/cpu/pass/cpu_loop_kernel_fusion.hpp"
+#include "ngraph/runtime/cpu/pass/cpu_compiled_kernel_fusion.hpp"
 #define TI(x) std::type_index(typeid(x))
@@ -49,10 +49,10 @@ struct LKGraph
     NodeVector m_nodes;
 };
-class LoopKernelCollector
+class CompiledKernelCollector
 {
 public:
-    LoopKernelCollector(std::shared_ptr<Function> f, size_t min_nodes_to_fuse)
+    CompiledKernelCollector(std::shared_ptr<Function> f, size_t min_nodes_to_fuse)
     {
         for (auto n : f->get_ordered_ops())
         {
@@ -70,13 +70,13 @@ public:
             else
             {
                 auto smallest_head = m_heads.at(arg_from_fusible_group);
-                auto& lkgraph = m_graphs.at(smallest_head);
-                lkgraph.m_nodes.push_back(n);
+                auto& ckgraph = m_graphs.at(smallest_head);
+                ckgraph.m_nodes.push_back(n);
                 for (auto arg : n->get_arguments())
                 {
                     if (is_leaf(arg))
                     {
-                        lkgraph.m_inputs.push_back(arg);
+                        ckgraph.m_inputs.push_back(arg);
                     }
                 }
                 m_heads.insert(std::make_pair(n, smallest_head));
@@ -88,18 +88,18 @@ public:
         prune_graphs(min_nodes_to_fuse);
     }
-    const std::vector<std::shared_ptr<runtime::cpu::op::LoopKernel>> get_loop_kernels() const
+    const std::vector<std::shared_ptr<op::CompiledKernel>> get_compiled_kernels() const
     {
-        std::vector<std::shared_ptr<runtime::cpu::op::LoopKernel>> lks;
+        std::vector<std::shared_ptr<op::CompiledKernel>> cks;
         for (auto e : m_graphs)
        {
-            auto& lkg = e.second;
-            NodeVector member_outputs = ngraph::get_subgraph_outputs(lkg.m_nodes, NodeVector{});
-            auto lk = std::make_shared<runtime::cpu::op::LoopKernel>(
-                lkg.m_nodes, member_outputs, lkg.m_inputs);
-            lks.push_back(lk);
+            auto& ckg = e.second;
+            NodeVector member_outputs = ngraph::get_subgraph_outputs(ckg.m_nodes, NodeVector{});
+            auto ck =
+                std::make_shared<op::CompiledKernel>(ckg.m_nodes, member_outputs, ckg.m_inputs);
+            cks.push_back(ck);
         }
-        return lks;
+        return cks;
     }
 private:
@@ -172,20 +172,20 @@ private:
     std::unordered_map<std::shared_ptr<Node>, std::shared_ptr<Node>> m_heads;
 };
-bool ngraph::runtime::cpu::pass::CPULoopKernelFusion::run_on_function(
+bool ngraph::runtime::cpu::pass::CPUCompiledKernelFusion::run_on_function(
     std::shared_ptr<ngraph::Function> function)
 {
-    LoopKernelCollector lkc(function, m_min_kernel_size);
-    auto loop_kernels = lkc.get_loop_kernels();
+    CompiledKernelCollector ckc(function, m_min_kernel_size);
+    auto compiled_kernels = ckc.get_compiled_kernels();
-    for (auto lk : loop_kernels)
+    for (auto ck : compiled_kernels)
     {
-        auto outputs = lk->get_kernel_outputs();
-        std::set<std::shared_ptr<Node>> lk_nodes_set(lk->get_node_list().begin(),
-                                                     lk->get_node_list().end());
+        auto outputs = ck->get_kernel_outputs();
+        std::set<std::shared_ptr<Node>> ck_nodes_set(ck->get_node_list().begin(),
+                                                     ck->get_node_list().end());
         for (size_t i = 0; i < outputs.size(); i++)
         {
-            auto ith_goe = std::make_shared<ngraph::op::GetOutputElement>(lk, i);
+            auto ith_goe = std::make_shared<ngraph::op::GetOutputElement>(ck, i);
             auto& ith_output = ith_goe->get_outputs().at(0);
             if (outputs.at(i)->get_outputs().size() > 1)
@@ -203,8 +203,8 @@ bool ngraph::runtime::cpu::pass::CPULoopKernelFusion::run_on_function(
             for (auto input : inputs_copy)
             {
                 // this user is NOT internal to this loop kernel
-                // so it needs to be replaced with corresponding lk's GOE
-                if (lk_nodes_set.count(input->get_node()) == 0)
+                // so it needs to be replaced with corresponding ck's GOE
+                if (ck_nodes_set.count(input->get_node()) == 0)
                 {
                     input->replace_output(ith_output);
                 }
@@ -212,5 +212,5 @@ bool ngraph::runtime::cpu::pass::CPULoopKernelFusion::run_on_function(
         }
     }
-    return !loop_kernels.empty();
+    return !compiled_kernels.empty();
 }
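Aside (not part of the diff): the rewiring above exposes result i of a fused CompiledKernel through a GetOutputElement and repoints only users outside the kernel's node set. A condensed, hypothetical view:

    // Sketch only: external consumers of a fused node now read result i
    // through a GOE on the CompiledKernel rather than the interior node.
    auto goe_i = std::make_shared<ngraph::op::GetOutputElement>(ck, i);
    // inputs of users outside ck_nodes_set are redirected to goe_i's output.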
@@ -26,10 +26,10 @@ namespace ngraph
        {
            namespace pass
            {
-                class CPULoopKernelFusion : public ngraph::pass::FunctionPass
+                class CPUCompiledKernelFusion : public ngraph::pass::FunctionPass
                {
                public:
-                    CPULoopKernelFusion(size_t min_kernel_size = 2)
+                    CPUCompiledKernelFusion(size_t min_kernel_size = 2)
                        : FunctionPass()
                        , m_min_kernel_size(min_kernel_size)
                    {
......
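Usage note (not part of the diff): a sketch of running the renamed pass, assuming nGraph's standard pass-manager API, which this commit does not change:

    #include "ngraph/pass/manager.hpp"
    #include "ngraph/runtime/cpu/pass/cpu_compiled_kernel_fusion.hpp"

    // Sketch only: fuse sub-graphs of at least two nodes (the default
    // min_kernel_size declared above) into CompiledKernel ops.
    ngraph::pass::Manager pass_manager;
    pass_manager.register_pass<ngraph::runtime::cpu::pass::CPUCompiledKernelFusion>(2);
    pass_manager.run_passes(func); // func: std::shared_ptr<ngraph::Function>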
This diff is collapsed.
This diff is collapsed.