Unverified Commit 96295aaa authored by Nick Korovaiko, committed by GitHub

Loop Kernel Op + Tests (#1028)

* loop kernel + tests

* remove commented out code

* remove commented code; add comments

* copy_with_new_args +test

* add comment

* fix comp errors
parent 5203a301
......@@ -41,6 +41,7 @@ set(SRC
op/conv_bias.cpp
op/conv_relu.cpp
op/convert_layout.cpp
op/loop_kernel.cpp
op/lstm.cpp
op/matmul_bias.cpp
op/max_pool_with_indices.cpp
......
......@@ -98,6 +98,7 @@
#include "ngraph/runtime/cpu/op/conv_relu.hpp"
#include "ngraph/runtime/cpu/op/convert_layout.hpp"
#include "ngraph/runtime/cpu/op/group_conv.hpp"
#include "ngraph/runtime/cpu/op/loop_kernel.hpp"
#include "ngraph/runtime/cpu/op/lstm.hpp"
#include "ngraph/runtime/cpu/op/matmul_bias.hpp"
#include "ngraph/runtime/cpu/op/max_pool_with_indices.hpp"
......@@ -371,6 +372,7 @@ namespace ngraph
writer.block_end();
}
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::BatchDot)
{
......@@ -4485,6 +4487,130 @@ namespace ngraph
<< " " << out[0].get_name() << ",\n"
<< " " << out[0].get_size() << ");\n";
}
#define TI(x) std::type_index(typeid(x))
static std::string emit_infix_operator(const std::string& opname,
const std::vector<std::string>& args)
{
if (args.size() != 2)
{
throw ngraph_error("args must be equal to 2");
}
return args.at(0) + " " + opname + " " + args.at(1);
}
static std::string emit_prefix_operator(const std::string& opname,
const std::vector<std::string>& args)
{
if (args.size() != 1)
{
throw ngraph_error("args must be equal to 1");
}
return opname + args.at(0);
}
static std::string emit_function_call(const std::string& opname,
const std::vector<std::string>& args)
{
return opname + "(" + join(args) + ")";
}
static std::unordered_map<std::type_index,
std::function<std::string(const std::vector<std::string>&)>>
initialize_inline_emitters()
{
auto abse =
std::bind(emit_function_call, std::string("std::abs"), std::placeholders::_1);
auto adde = std::bind(emit_infix_operator, std::string("+"), std::placeholders::_1);
auto nege =
std::bind(emit_prefix_operator, std::string("-"), std::placeholders::_1);
auto sube = std::bind(emit_infix_operator, std::string("-"), std::placeholders::_1);
return std::unordered_map<
std::type_index,
std::function<std::string(const std::vector<std::string>&)>>{
{TI(ngraph::op::Abs), abse},
{TI(ngraph::op::Add), adde},
{TI(ngraph::op::Negative), nege},
{TI(ngraph::op::Subtract), sube},
};
}
static std::unordered_map<std::type_index,
std::function<std::string(const std::vector<std::string>&)>>
inline_emitters = initialize_inline_emitters();
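// Example of the strings these emitters produce (argument names here are hypothetical):
// inline_emitters.at(TI(ngraph::op::Add))({"tmp0", "arg1[i]"}) returns "tmp0 + arg1[i]",
// inline_emitters.at(TI(ngraph::op::Negative))({"arg0[i]"}) returns "-arg0[i]", and
// inline_emitters.at(TI(ngraph::op::Abs))({"tmp0"}) returns "std::abs(tmp0)".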
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::runtime::cpu::op::LoopKernel)
{
std::unordered_map<std::shared_ptr<Node>, std::string> loop_symbol_table;
//pre-fill symbol table with inputs
const ngraph::runtime::cpu::op::LoopKernel* clk =
static_cast<const ngraph::runtime::cpu::op::LoopKernel*>(node);
NodeVector output_nodes = clk->get_kernel_outputs();
NodeVector node_list = clk->get_node_list();
for (size_t i = 0; i < args.size(); i++)
{
std::string sname = std::string(args[i].get_name()) + "[i]";
auto entry = std::make_pair(clk->get_argument(i), sname);
loop_symbol_table.insert(entry);
}
//add outputs so we write output values directly into their
//corresponding tensors
for (size_t i = 0; i < out.size(); i++)
{
std::string sname = std::string(out[i].get_name()) + "[i]";
auto entry = std::make_pair(output_nodes.at(i), sname);
loop_symbol_table.insert(entry);
}
std::string tmp_prefix{"tmp"};
writer << "#pragma omp parallel for\n";
writer << "for (size_t i = 0; i < " << out[0].get_size() << "; i++)\n";
writer.block_begin();
for (size_t i = 0; i < node_list.size(); i++)
{
auto op = node_list[i];
std::string tmp;
if (loop_symbol_table.count(op) == 0)
{
//"allocate" a new temp
tmp = tmp_prefix + std::to_string(i);
//remember the new temp in the symbol table
auto entry = std::make_pair(op, tmp);
loop_symbol_table.insert(entry);
//declare a new tmp
writer << op->get_element_type().c_type_string() << " ";
}
else
{
//this means we are dealing with an output
tmp = loop_symbol_table.at(op);
}
//prepare arguments
std::vector<std::string> sargs;
for (auto arg : op->get_arguments())
{
//args are expected to be in a map already
sargs.push_back(loop_symbol_table.at(arg));
}
const Node& n = *op;
auto emitter = inline_emitters.at(TI(n));
writer << tmp << " = " << emitter(sargs) << ";\n";
}
writer.block_end();
}
#undef TI
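// For illustration only (tensor names and element type are hypothetical): for a
// LoopKernel whose node_list is {neg(arg0), add(neg(arg0), arg1)}, whose single
// kernel output is the add node, and whose element type is float, the emitter
// above produces code along the lines of:
//
//   #pragma omp parallel for
//   for (size_t i = 0; i < N; i++)   // N is emitted as out[0].get_size()
//   {
//       float tmp0 = -arg0[i];
//       out0[i] = tmp0 + arg1[i];
//   }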
}
}
}
......
......@@ -124,6 +124,7 @@
#include "ngraph/runtime/cpu/op/conv_relu.hpp"
#include "ngraph/runtime/cpu/op/convert_layout.hpp"
#include "ngraph/runtime/cpu/op/group_conv.hpp"
#include "ngraph/runtime/cpu/op/loop_kernel.hpp"
#include "ngraph/runtime/cpu/op/lstm.hpp"
#include "ngraph/runtime/cpu/op/matmul_bias.hpp"
#include "ngraph/runtime/cpu/op/max_pool_with_indices.hpp"
......@@ -304,6 +305,8 @@ static const runtime::cpu::OpMap dispatcher{
{TI(ngraph::op::SigmoidBackprop), &runtime::cpu::CPU_Emitter::emit<op::SigmoidBackprop>},
{TI(ngraph::op::And), &runtime::cpu::CPU_Emitter::emit<op::And>},
{TI(ngraph::op::Or), &runtime::cpu::CPU_Emitter::emit<op::Or>},
{TI(ngraph::runtime::cpu::op::LoopKernel),
&runtime::cpu::CPU_Emitter::emit<runtime::cpu::op::LoopKernel>},
};
const size_t runtime::cpu::CPU_ExternalFunction::CPU_ExternalFunction::s_memory_pool_alignment =
......
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "ngraph/runtime/cpu/op/loop_kernel.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/log.hpp"
#include "ngraph/util.hpp"
using namespace std;
using namespace ngraph;
shared_ptr<Node>
ngraph::runtime::cpu::op::LoopKernel::copy_with_new_args(const NodeVector& new_args) const
{
auto args = get_arguments();
if (new_args.size() != args.size())
{
throw ngraph_error("number of arguments don't match");
}
//map the original arguments to the new arguments
NodeMap nm;
for (size_t i = 0; i < args.size(); i++)
{
nm.add(args.at(i), new_args.at(i));
}
NodeVector new_node_list;
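//clone each node in order, remapping its arguments through nm,
//and record the clone so later nodes (and the outputs) can resolve to it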
for (auto n : m_node_list)
{
NodeVector cur_args;
for (auto a : n->get_arguments())
{
cur_args.push_back(nm.get(a));
}
auto new_n = n->copy_with_new_args(cur_args);
nm.add(n, new_n);
new_node_list.push_back(new_n);
}
NodeVector new_outputs;
for (auto o : m_outputs)
{
new_outputs.push_back(nm.get(o));
}
return std::make_shared<LoopKernel>(new_node_list, new_outputs, new_args);
}
ngraph::runtime::cpu::op::LoopKernel::LoopKernel(const NodeVector& node_list,
const NodeVector& outputs,
const NodeVector& args)
: RequiresTensorViewArgs("LoopKernel", {args})
, m_node_list(node_list)
, m_outputs(outputs)
{
auto ref = node_list.at(0);
for (auto n : node_list)
{
if (n->get_shape() != ref->get_shape() || n->get_element_type() != ref->get_element_type())
{
throw ngraph_error("types and shapes of the nodes in node_list are different");
}
}
for (auto o : outputs)
{
if (std::find(node_list.begin(), node_list.end(), o) == node_list.end())
{
throw ngraph_error(o->get_name() + " isn't in node_list");
}
add_output(o->get_element_type(), o->get_shape());
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include "ngraph/op/util/requires_tensor_view_args.hpp"
#include "ngraph/util.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace op
{
/// \brief LoopKernel represents a sub-graph of arithmetic operations
/// that can be executed in the same loop
class LoopKernel : public ngraph::op::util::RequiresTensorViewArgs
{
public:
LoopKernel(const NodeVector& node_list,
const NodeVector& outputs,
const NodeVector& args);
virtual std::shared_ptr<Node>
copy_with_new_args(const NodeVector& new_args) const override;
const NodeVector& get_node_list() const { return m_node_list; }
const NodeVector& get_kernel_outputs() const { return m_outputs; }
private:
NodeVector m_node_list;
NodeVector m_outputs;
};
}
}
}
}
......@@ -50,6 +50,7 @@
#include "ngraph/runtime/cpu/op/conv_relu.hpp"
#include "ngraph/runtime/cpu/op/convert_layout.hpp"
#include "ngraph/runtime/cpu/op/group_conv.hpp"
#include "ngraph/runtime/cpu/op/loop_kernel.hpp"
#include "ngraph/runtime/cpu/op/lstm.hpp"
#include "ngraph/runtime/cpu/op/matmul_bias.hpp"
#include "ngraph/runtime/cpu/op/rnn.hpp"
......@@ -1237,6 +1238,196 @@ TEST(cpu_fusion, backwards_maxpool_with_indices_n4_c1_hw4_2x2_max)
ASSERT_TRUE(read_vector<float>(output) == expected);
}
TEST(cpu_fusion, loop_kernel_one_input_one_output)
{
Shape shapeA{2, 2};
auto A = make_shared<op::Parameter>(element::i32, shapeA);
auto neg_a = make_shared<op::Negative>(A);
auto lk = make_shared<runtime::cpu::op::LoopKernel>(
NodeVector{neg_a}, NodeVector{neg_a}, NodeVector{A});
auto f = make_shared<Function>(NodeVector{lk}, op::ParameterVector{A});
auto backend = runtime::Backend::create("CPU");
shared_ptr<runtime::TensorView> a = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> result = backend->create_tensor(element::i32, shapeA);
vector<int> dataA{1, 4, 1, 4};
copy_data(a, dataA);
vector<int> expected{-1, -4, -1, -4};
backend->call(f, {result}, {a});
EXPECT_EQ(read_vector<int>(result), expected);
}
TEST(cpu_fusion, loop_kernel_embedded_graph)
{
Shape shapeA{2, 2};
auto A = make_shared<op::Parameter>(element::i32, shapeA);
auto B = make_shared<op::Parameter>(element::i32, shapeA);
auto neg_a = make_shared<op::Negative>(A);
auto neg_b = make_shared<op::Negative>(B);
auto add = neg_a + neg_b;
auto lk = make_shared<runtime::cpu::op::LoopKernel>(
NodeVector{add}, NodeVector{add}, NodeVector{neg_a, neg_b});
auto f = make_shared<Function>(NodeVector{lk}, op::ParameterVector{A, B});
auto backend = runtime::Backend::create("CPU");
shared_ptr<runtime::TensorView> a = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> b = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> result = backend->create_tensor(element::i32, shapeA);
vector<int> dataA{1, 4, 1, 4};
copy_data(a, dataA);
vector<int> dataB{1, 2, 3, 4};
copy_data(b, dataB);
vector<int> expected{-2, -6, -4, -8};
backend->call(f, {result}, {a, b});
EXPECT_EQ(read_vector<int>(result), expected);
}
TEST(cpu_fusion, loop_kernel_two_inputs_one_output)
{
Shape shapeA{2, 2};
auto A = make_shared<op::Parameter>(element::i32, shapeA);
auto B = make_shared<op::Parameter>(element::i32, shapeA);
auto add = A + B;
auto lk = make_shared<runtime::cpu::op::LoopKernel>(
NodeVector{add}, NodeVector{add}, NodeVector{A, B});
auto f = make_shared<Function>(NodeVector{lk}, op::ParameterVector{A, B});
auto backend = runtime::Backend::create("CPU");
shared_ptr<runtime::TensorView> a = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> b = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> result = backend->create_tensor(element::i32, shapeA);
vector<int> dataA{1, 4, 1, 4};
copy_data(a, dataA);
vector<int> dataB{1, 2, 3, 4};
copy_data(b, dataB);
vector<int> expected{2, 6, 4, 8};
backend->call(f, {result}, {a, b});
EXPECT_EQ(read_vector<int>(result), expected);
}
TEST(cpu_fusion, loop_kernel_multiple_outputs)
{
Shape shapeA{2, 2};
auto A = make_shared<op::Parameter>(element::i32, shapeA);
auto B = make_shared<op::Parameter>(element::i32, shapeA);
auto C = make_shared<op::Parameter>(element::i32, shapeA);
auto D = make_shared<op::Parameter>(element::i32, shapeA);
auto neg_a = make_shared<op::Negative>(A);
auto neg_b = make_shared<op::Negative>(B);
auto add_ab = neg_a + neg_b;
auto add_cd = C + B;
auto add_cd_abs = make_shared<op::Abs>(add_cd);
auto add_ab_abs = make_shared<op::Abs>(add_ab);
auto add_aab = add_ab_abs + A;
auto add_cdd = add_cd_abs + D;
auto lk = make_shared<runtime::cpu::op::LoopKernel>(
NodeVector{neg_a, neg_b, add_ab, add_cd, add_cd_abs, add_ab_abs, add_aab, add_cdd},
NodeVector{add_aab, add_cdd, neg_b},
NodeVector{A, B, C, D});
auto add_aab_goe = std::make_shared<op::GetOutputElement>(lk, 0);
auto add_cdd_goe = std::make_shared<op::GetOutputElement>(lk, 1);
auto neg_b_goe = std::make_shared<op::GetOutputElement>(lk, 2);
auto f = make_shared<Function>(NodeVector{add_aab_goe, add_cdd_goe, neg_b_goe},
op::ParameterVector{A, B, C, D});
auto backend = runtime::Backend::create("CPU");
shared_ptr<runtime::TensorView> a = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> b = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> c = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> d = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> r1 = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> r2 = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> r3 = backend->create_tensor(element::i32, shapeA);
vector<int> dataA{1, 4, 1, 4};
vector<int> dataB{3, 3, 3, 9};
vector<int> dataC{1, 2, 3, 4};
vector<int> dataD{-2, 2, -1, 1};
copy_data(a, dataA);
copy_data(b, dataB);
copy_data(c, dataC);
copy_data(d, dataD);
backend->call(f, {r1, r2, r3}, {a, b, c, d});
vector<int> expected1{5, 11, 5, 17};
vector<int> expected2{2, 7, 5, 14};
vector<int> expected3{-3, -3, -3, -9};
EXPECT_EQ(read_vector<int>(r1), expected1);
EXPECT_EQ(read_vector<int>(r2), expected2);
EXPECT_EQ(read_vector<int>(r3), expected3);
}
TEST(cpu_fusion, loop_kernel_copy_with_new_args)
{
Shape shapeA{2, 2};
auto A = make_shared<op::Parameter>(element::i32, shapeA);
auto B = make_shared<op::Parameter>(element::i32, shapeA);
auto C = make_shared<op::Parameter>(element::i32, shapeA);
auto D = make_shared<op::Parameter>(element::i32, shapeA);
auto neg_a = make_shared<op::Negative>(A);
auto neg_b = make_shared<op::Negative>(B);
auto add_ab = neg_a + neg_b;
auto add_cd = C + B;
auto add_cd_abs = make_shared<op::Abs>(add_cd);
auto add_ab_abs = make_shared<op::Abs>(add_ab);
auto add_aab = add_ab_abs + A;
auto add_cdd = add_cd_abs + D;
auto lk = make_shared<runtime::cpu::op::LoopKernel>(
NodeVector{neg_a, neg_b, add_ab, add_cd, add_cd_abs, add_ab_abs, add_aab, add_cdd},
NodeVector{add_aab, add_cdd, neg_b},
NodeVector{A, B, C, D});
auto add_aab_goe = std::make_shared<op::GetOutputElement>(lk, 0);
auto add_cdd_goe = std::make_shared<op::GetOutputElement>(lk, 1);
auto neg_b_goe = std::make_shared<op::GetOutputElement>(lk, 2);
auto f = make_shared<Function>(NodeVector{add_aab_goe, add_cdd_goe, neg_b_goe},
op::ParameterVector{A, B, C, D});
auto copy_f = clone_function(*f);
auto backend = runtime::Backend::create("CPU");
shared_ptr<runtime::TensorView> a = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> b = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> c = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> d = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> r1 = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> r2 = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> r3 = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> copy_r1 = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> copy_r2 = backend->create_tensor(element::i32, shapeA);
shared_ptr<runtime::TensorView> copy_r3 = backend->create_tensor(element::i32, shapeA);
vector<int> dataA{1, 4, 1, 4};
vector<int> dataB{3, 3, 3, 9};
vector<int> dataC{1, 2, 3, 4};
vector<int> dataD{-2, 2, -1, 1};
copy_data(a, dataA);
copy_data(b, dataB);
copy_data(c, dataC);
copy_data(d, dataD);
backend->call(f, {r1, r2, r3}, {a, b, c, d});
backend->call(copy_f, {copy_r1, copy_r2, copy_r3}, {a, b, c, d});
EXPECT_EQ(read_vector<int>(r1), read_vector<int>(copy_r1));
EXPECT_EQ(read_vector<int>(r2), read_vector<int>(copy_r2));
EXPECT_EQ(read_vector<int>(r3), read_vector<int>(copy_r3));
}
static std::shared_ptr<ngraph::Function> make_forward_function()
{
Shape shape_a{10, 3, 28, 28};
......