Unverified Commit 96295aaa authored by Nick Korovaiko, committed by GitHub

Loop Kernel Op + Tests (#1028)

* loop kernel + tests

* remove commented out code

* remove commented code; add comments

* copy_with_new_args +test

* add comment

* fix comp errors
parent 5203a301
......@@ -41,6 +41,7 @@ set(SRC
op/conv_bias.cpp
op/conv_relu.cpp
op/convert_layout.cpp
op/loop_kernel.cpp
op/lstm.cpp
op/matmul_bias.cpp
op/max_pool_with_indices.cpp
......
......@@ -98,6 +98,7 @@
#include "ngraph/runtime/cpu/op/conv_relu.hpp"
#include "ngraph/runtime/cpu/op/convert_layout.hpp"
#include "ngraph/runtime/cpu/op/group_conv.hpp"
#include "ngraph/runtime/cpu/op/loop_kernel.hpp"
#include "ngraph/runtime/cpu/op/lstm.hpp"
#include "ngraph/runtime/cpu/op/matmul_bias.hpp"
#include "ngraph/runtime/cpu/op/max_pool_with_indices.hpp"
......@@ -371,6 +372,7 @@ namespace ngraph
writer.block_end();
}
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::BatchDot)
{
......@@ -4485,6 +4487,130 @@ namespace ngraph
<< " " << out[0].get_name() << ",\n"
<< " " << out[0].get_size() << ");\n";
}
#define TI(x) std::type_index(typeid(x))
static std::string emit_infix_operator(const std::string& opname,
const std::vector<std::string>& args)
{
if (args.size() != 2)
{
throw ngraph_error("args must be equal to 2");
}
return args.at(0) + " " + opname + " " + args.at(1);
}
static std::string emit_prefix_operator(const std::string& opname,
const std::vector<std::string>& args)
{
if (args.size() != 1)
{
throw ngraph_error("args must be equal to 1");
}
return opname + args.at(0);
}
static std::string emit_function_call(const std::string& opname,
const std::vector<std::string>& args)
{
return opname + "(" + join(args) + ")";
}
static std::unordered_map<std::type_index,
std::function<std::string(const std::vector<std::string>&)>>
initialize_inline_emitters()
{
auto abse =
std::bind(emit_function_call, std::string("std::abs"), std::placeholders::_1);
auto adde = std::bind(emit_infix_operator, std::string("+"), std::placeholders::_1);
auto nege =
std::bind(emit_prefix_operator, std::string("-"), std::placeholders::_1);
auto sube = std::bind(emit_infix_operator, std::string("-"), std::placeholders::_1);
return std::unordered_map<
std::type_index,
std::function<std::string(const std::vector<std::string>&)>>{
{TI(ngraph::op::Abs), abse},
{TI(ngraph::op::Add), adde},
{TI(ngraph::op::Negative), nege},
{TI(ngraph::op::Subtract), sube},
};
}
static std::unordered_map<std::type_index,
std::function<std::string(const std::vector<std::string>&)>>
inline_emitters = initialize_inline_emitters();
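// Example of the strings these emitters produce (argument names here are hypothetical):
// inline_emitters.at(TI(ngraph::op::Add))({"tmp0", "arg1[i]"}) returns "tmp0 + arg1[i]",
// inline_emitters.at(TI(ngraph::op::Negative))({"arg0[i]"}) returns "-arg0[i]", and
// inline_emitters.at(TI(ngraph::op::Abs))({"tmp0"}) returns "std::abs(tmp0)".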
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::runtime::cpu::op::LoopKernel)
{
std::unordered_map<std::shared_ptr<Node>, std::string> loop_symbol_table;
//pre-fill symbol table with inputs
const ngraph::runtime::cpu::op::LoopKernel* clk =
static_cast<const ngraph::runtime::cpu::op::LoopKernel*>(node);
NodeVector output_nodes = clk->get_kernel_outputs();
NodeVector node_list = clk->get_node_list();
for (size_t i = 0; i < args.size(); i++)
{
std::string sname = std::string(args[i].get_name()) + "[i]";
auto entry = std::make_pair(clk->get_argument(i), sname);
loop_symbol_table.insert(entry);
}
//add outputs so we write output values directly into their
//corresponding tensors
for (size_t i = 0; i < out.size(); i++)
{
std::string sname = std::string(out[i].get_name()) + "[i]";
auto entry = std::make_pair(output_nodes.at(i), sname);
loop_symbol_table.insert(entry);
}
std::string tmp_prefix{"tmp"};
writer << "#pragma omp parallel for\n";
writer << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
writer.block_begin();
for (size_t i = 0; i < node_list.size(); i++)
{
auto op = node_list[i];
std::string tmp;
if (loop_symbol_table.count(op) == 0)
{
//"allocate" a new temp
tmp = tmp_prefix + std::to_string(i);
//remember the new temp in the symbol table
auto entry = std::make_pair(op, tmp);
loop_symbol_table.insert(entry);
//declare a new tmp
writer << op->get_element_type().c_type_string() << " ";
}
else
{
//this means we are dealing with an output
tmp = loop_symbol_table.at(op);
}
//prepare arguments
std::vector<std::string> sargs;
for (auto arg : op->get_arguments())
{
//args are expected to be in a map already
sargs.push_back(loop_symbol_table.at(arg));
}
const Node& n = *op;
auto emitter = inline_emitters.at(TI(n));
writer << tmp << " = " << emitter(sargs) << ";\n";
}
writer.block_end();
}
#undef TI
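// For illustration only (tensor names and element type are hypothetical): for a
// LoopKernel whose node_list is {neg(arg0), add(neg(arg0), arg1)}, whose single
// kernel output is the add node, and whose element type is float, the emitter
// above produces code along the lines of:
//
//   #pragma omp parallel for
//   for (size_t i = 0; i < N; i++)   // N is emitted as out[0].get_size()
//   {
//       float tmp0 = -arg0[i];
//       out0[i] = tmp0 + arg1[i];
//   }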
}
}
}
......
......@@ -124,6 +124,7 @@
#include "ngraph/runtime/cpu/op/conv_relu.hpp"
#include "ngraph/runtime/cpu/op/convert_layout.hpp"
#include "ngraph/runtime/cpu/op/group_conv.hpp"
#include "ngraph/runtime/cpu/op/loop_kernel.hpp"
#include "ngraph/runtime/cpu/op/lstm.hpp"
#include "ngraph/runtime/cpu/op/matmul_bias.hpp"
#include "ngraph/runtime/cpu/op/max_pool_with_indices.hpp"
......@@ -304,6 +305,8 @@ static const runtime::cpu::OpMap dispatcher{
{TI(ngraph::op::SigmoidBackprop), &runtime::cpu::CPU_Emitter::emit<op::SigmoidBackprop>},
{TI(ngraph::op::And), &runtime::cpu::CPU_Emitter::emit<op::And>},
{TI(ngraph::op::Or), &runtime::cpu::CPU_Emitter::emit<op::Or>},
{TI(ngraph::runtime::cpu::op::LoopKernel),
&runtime::cpu::CPU_Emitter::emit<runtime::cpu::op::LoopKernel>},
};
const size_t runtime::cpu::CPU_ExternalFunction::CPU_ExternalFunction::s_memory_pool_alignment =
......
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "ngraph/runtime/cpu/op/loop_kernel.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/log.hpp"
#include "ngraph/util.hpp"
using namespace std;
using namespace ngraph;
shared_ptr<Node>
ngraph::runtime::cpu::op::LoopKernel::copy_with_new_args(const NodeVector& new_args) const
{
auto args = get_arguments();
if (new_args.size() != args.size())
{
throw ngraph_error("number of arguments don't match");
}
//map the original arguments to the new arguments
NodeMap nm;
for (size_t i = 0; i < args.size(); i++)
{
nm.add(args.at(i), new_args.at(i));
}
NodeVector new_node_list;
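//clone each node in order, remapping its arguments through nm,
//and record the clone so later nodes (and the outputs) can resolve to it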
for (auto n : m_node_list)
{
NodeVector cur_args;
for (auto a : n->get_arguments())
{
cur_args.push_back(nm.get(a));
}
auto new_n = n->copy_with_new_args(cur_args);
nm.add(n, new_n);
new_node_list.push_back(new_n);
}
NodeVector new_outputs;
for (auto o : m_outputs)
{
new_outputs.push_back(nm.get(o));
}
return std::make_shared<LoopKernel>(new_node_list, new_outputs, new_args);
}
ngraph::runtime::cpu::op::LoopKernel::LoopKernel(const NodeVector& node_list,
const NodeVector& outputs,
const NodeVector& args)
: RequiresTensorViewArgs("LoopKernel", {args})
, m_node_list(node_list)
, m_outputs(outputs)
{
auto ref = node_list.at(0);
for (auto n : node_list)
{
if (n->get_shape() != ref->get_shape() || n->get_element_type() != ref->get_element_type())
{
throw ngraph_error("types and shapes of the nodes in node_list are different");
}
}
for (auto o : outputs)
{
if (std::find(node_list.begin(), node_list.end(), o) == node_list.end())
{
throw ngraph_error(o->get_name() + " isn't in node_list");
}
add_output(o->get_element_type(), o->get_shape());
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include "ngraph/op/util/requires_tensor_view_args.hpp"
#include "ngraph/util.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace op
{
/// \brief LoopKernel represents a sub-graph of arithmetic operations
/// that can be executed in the same loop
class LoopKernel : public ngraph::op::util::RequiresTensorViewArgs
{
public:
LoopKernel(const NodeVector& node_list,
const NodeVector& outputs,
const NodeVector& args);
virtual std::shared_ptr<Node>
copy_with_new_args(const NodeVector& new_args) const override;
const NodeVector& get_node_list() const { return m_node_list; }
const NodeVector& get_kernel_outputs() const { return m_outputs; }
private:
NodeVector m_node_list;
NodeVector m_outputs;
};
}
}
}
}
......@@ -50,6 +50,7 @@
#include "ngraph/runtime/cpu/op/conv_relu.hpp"
#include "ngraph/runtime/cpu/op/convert_layout.hpp"
#include "ngraph/runtime/cpu/op/group_conv.hpp"
#include "ngraph/runtime/cpu/op/loop_kernel.hpp"
#include "ngraph/runtime/cpu/op/lstm.hpp"
#include "ngraph/runtime/cpu/op/matmul_bias.hpp"
#include "ngraph/runtime/cpu/op/rnn.hpp"
......@@ -1237,6 +1238,196 @@ TEST(cpu_fusion, backwards_maxpool_with_indices_n4_c1_hw4_2x2_max)
ASSERT_TRUE(read_vector<float>(output) == expected);
}
TEST(cpu_fusion, loop_kernel_one_input_one_output)
{
Shape shapeA{2, 2};
auto A = make_shared<op::Parameter>(element::i32, shapeA);
auto neg_a = make_shared<op::Negative>(A);
auto lk = make_shared<runtime::cpu::op::LoopKernel>(
NodeVector{neg_a}, NodeVector{neg_a}, NodeVector{A});
auto f = make_shared<Function>(NodeVector{lk}, op::ParameterVector{A});
auto backend = runtime::Backend::create("CPU");
shared_ptr<runtime::TensorView> a = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> result = backend->create_tensor(element::i32, shapeA);
vector<int> dataA{1, 4, 1, 4};
copy_data(a, dataA);
vector<int> expected{-1, -4, -1, -4};
backend->call(f, {result}, {a});
EXPECT_EQ(read_vector<int>(result), expected);
}
TEST(cpu_fusion, loop_kernel_embedded_graph)
{
Shape shapeA{2, 2};
auto A = make_shared<op::Parameter>(element::i32, shapeA);
auto B = make_shared<op::Parameter>(element::i32, shapeA);
auto neg_a = make_shared<op::Negative>(A);
auto neg_b = make_shared<op::Negative>(B);
auto add = neg_a + neg_b;
auto lk = make_shared<runtime::cpu::op::LoopKernel>(
NodeVector{add}, NodeVector{add}, NodeVector{neg_a, neg_b});
auto f = make_shared<Function>(NodeVector{lk}, op::ParameterVector{A, B});
auto backend = runtime::Backend::create("CPU");
shared_ptr<runtime::TensorView> a = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> b = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> result = backend->create_tensor(element::i32, shapeA);
vector<int> dataA{1, 4, 1, 4};
copy_data(a, dataA);
vector<int> dataB{1, 2, 3, 4};
copy_data(b, dataB);
vector<int> expected{-2, -6, -4, -8};
backend->call(f, {result}, {a, b});
EXPECT_EQ(read_vector<int>(result), expected);
}
TEST(cpu_fusion, loop_kernel_two_inputs_one_output)
{
Shape shapeA{2, 2};
auto A = make_shared<op::Parameter>(element::i32, shapeA);
auto B = make_shared<op::Parameter>(element::i32, shapeA);
auto add = A + B;
auto lk = make_shared<runtime::cpu::op::LoopKernel>(
NodeVector{add}, NodeVector{add}, NodeVector{A, B});
auto f = make_shared<Function>(NodeVector{lk}, op::ParameterVector{A, B});
auto backend = runtime::Backend::create("CPU");
shared_ptr<runtime::TensorView> a = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> b = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> result = backend->create_tensor(element::i32, shapeA);
vector<int> dataA{1, 4, 1, 4};
copy_data(a, dataA);
vector<int> dataB{1, 2, 3, 4};
copy_data(b, dataB);
vector<int> expected{2, 6, 4, 8};
backend->call(f, {result}, {a, b});
EXPECT_EQ(read_vector<int>(result), expected);
}
TEST(cpu_fusion, loop_kernel_multiple_outputs)
{
Shape shapeA{2, 2};
auto A = make_shared<op::Parameter>(element::i32, shapeA);
auto B = make_shared<op::Parameter>(element::i32, shapeA);
auto C = make_shared<op::Parameter>(element::i32, shapeA);
auto D = make_shared<op::Parameter>(element::i32, shapeA);
auto neg_a = make_shared<op::Negative>(A);
auto neg_b = make_shared<op::Negative>(B);
auto add_ab = neg_a + neg_b;
auto add_cd = C + B;
auto add_cd_abs = make_shared<op::Abs>(add_cd);
auto add_ab_abs = make_shared<op::Abs>(add_ab);
auto add_aab = add_ab_abs + A;
auto add_cdd = add_cd_abs + D;
auto lk = make_shared<runtime::cpu::op::LoopKernel>(
NodeVector{neg_a, neg_b, add_ab, add_cd, add_cd_abs, add_ab_abs, add_aab, add_cdd},
NodeVector{add_aab, add_cdd, neg_b},
NodeVector{A, B, C, D});
auto add_aab_goe = std::make_shared<op::GetOutputElement>(lk, 0);
auto add_cdd_goe = std::make_shared<op::GetOutputElement>(lk, 1);
auto neg_b_goe = std::make_shared<op::GetOutputElement>(lk, 2);
auto f = make_shared<Function>(NodeVector{add_aab_goe, add_cdd_goe, neg_b_goe},
op::ParameterVector{A, B, C, D});
auto backend = runtime::Backend::create("CPU");
shared_ptr<runtime::TensorView> a = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> b = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> c = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> d = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> r1 = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> r2 = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> r3 = backend->create_tensor(element::i32, shapeA);
vector<int> dataA{1, 4, 1, 4};
vector<int> dataB{3, 3, 3, 9};
vector<int> dataC{1, 2, 3, 4};
vector<int> dataD{-2, 2, -1, 1};
copy_data(a, dataA);
copy_data(b, dataB);
copy_data(c, dataC);
copy_data(d, dataD);
backend->call(f, {r1, r2, r3}, {a, b, c, d});
vector<int> expected1{5, 11, 5, 17};
vector<int> expected2{2, 7, 5, 14};
vector<int> expected3{-3, -3, -3, -9};
EXPECT_EQ(read_vector<int>(r1), expected1);
EXPECT_EQ(read_vector<int>(r2), expected2);
EXPECT_EQ(read_vector<int>(r3), expected3);
}
TEST(cpu_fusion, loop_kernel_copy_with_new_args)
{
Shape shapeA{2, 2};
auto A = make_shared<op::Parameter>(element::i32, shapeA);
auto B = make_shared<op::Parameter>(element::i32, shapeA);
auto C = make_shared<op::Parameter>(element::i32, shapeA);
auto D = make_shared<op::Parameter>(element::i32, shapeA);
auto neg_a = make_shared<op::Negative>(A);
auto neg_b = make_shared<op::Negative>(B);
auto add_ab = neg_a + neg_b;
auto add_cd = C + B;
auto add_cd_abs = make_shared<op::Abs>(add_cd);
auto add_ab_abs = make_shared<op::Abs>(add_ab);
auto add_aab = add_ab_abs + A;
auto add_cdd = add_cd_abs + D;
auto lk = make_shared<runtime::cpu::op::LoopKernel>(
NodeVector{neg_a, neg_b, add_ab, add_cd, add_cd_abs, add_ab_abs, add_aab, add_cdd},
NodeVector{add_aab, add_cdd, neg_b},
NodeVector{A, B, C, D});
auto add_aab_goe = std::make_shared<op::GetOutputElement>(lk, 0);
auto add_cdd_goe = std::make_shared<op::GetOutputElement>(lk, 1);
auto neg_b_goe = std::make_shared<op::GetOutputElement>(lk, 2);
auto f = make_shared<Function>(NodeVector{add_aab_goe, add_cdd_goe, neg_b_goe},
op::ParameterVector{A, B, C, D});
auto copy_f = clone_function(*f);
auto backend = runtime::Backend::create("CPU");
shared_ptr<runtime::TensorView> a = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> b = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> c = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> d = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> r1 = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> r2 = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> r3 = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> copy_r1 = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> copy_r2 = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> copy_r3 = backend->create_tensor(element::i32, shapeA);
vector<int> dataA{1, 4, 1, 4};
vector<int> dataB{3, 3, 3, 9};
vector<int> dataC{1, 2, 3, 4};
vector<int> dataD{-2, 2, -1, 1};
copy_data(a, dataA);
copy_data(b, dataB);
copy_data(c, dataC);
copy_data(d, dataD);
backend->call(f, {r1, r2, r3}, {a, b, c, d});
backend->call(copy_f, {copy_r1, copy_r2, copy_r3}, {a, b, c, d});
EXPECT_EQ(read_vector<int>(r1), read_vector<int>(copy_r1));
EXPECT_EQ(read_vector<int>(r2), read_vector<int>(copy_r2));
EXPECT_EQ(read_vector<int>(r3), read_vector<int>(copy_r3));
}
static std::shared_ptr<ngraph::Function> make_forward_function()
{
Shape shape_a{10, 3, 28, 28};
......