Commit 2efc0065 authored by amy.zhuang

Add in place concat optimization.

parent f5d2fa68
......@@ -19,6 +19,7 @@
#include "ngraph/log.hpp"
#include "ngraph/op/concat.hpp"
#include "ngraph/pass/liveness.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/memory_layout.hpp"
......@@ -43,20 +44,25 @@ bool pass::MemoryLayout::run_on_function(shared_ptr<ngraph::Function> function)
if (auto op = std::dynamic_pointer_cast<op::Op>(node))
{
if (auto op_annotations = op->get_op_annotations())
// concat in_place_oi should be treated differently
if (!std::dynamic_pointer_cast<op::Concat>(node))
{
for (auto oi_pair : op_annotations->get_in_place_oi_pairs())
if (auto op_annotations = op->get_op_annotations())
{
auto output = &node->get_outputs().at(oi_pair.output).get_tensor();
auto input = &node->get_inputs().at(oi_pair.input).get_tensor();
auto input_node = node->get_inputs().at(oi_pair.input).get_output().get_node();
// an input tensor can be reused if this is the last use
if (node->liveness_free_list.count(input) != 0 &&
node->liveness_new_list.count(output) != 0)
for (auto oi_pair : op_annotations->get_in_place_oi_pairs())
{
in_place_outputs.insert({output, input});
reused_inputs.insert(input);
auto output = &node->get_outputs().at(oi_pair.output).get_tensor();
auto input = &node->get_inputs().at(oi_pair.input).get_tensor();
auto input_node =
node->get_inputs().at(oi_pair.input).get_output().get_node();
// an input tensor can be reused if this is the last use
if (node->liveness_free_list.count(input) != 0 &&
node->liveness_new_list.count(output) != 0)
{
in_place_outputs.insert({output, input});
reused_inputs.insert(input);
}
}
}
}
......@@ -68,6 +74,27 @@ bool pass::MemoryLayout::run_on_function(shared_ptr<ngraph::Function> function)
? in_place_outputs.at(tensor)->get_pool_offset()
: mm.allocate(tensor->size());
tensor->set_pool_offset(offset);
// check if the op is concat
if (auto concat = std::dynamic_pointer_cast<op::Concat>(node))
{
if (auto op_annotations = concat->get_op_annotations())
{
auto in_place_oi_pairs = op_annotations->get_in_place_oi_pairs();
if (in_place_oi_pairs.size() > 0)
{
for (auto arg : concat->get_arguments())
{
auto input_node = std::dynamic_pointer_cast<op::Op>(arg);
auto input_tensor = &input_node->get_output_tensor();
auto old_offset = input_tensor->get_pool_offset();
input_tensor->set_pool_offset(offset);
NGRAPH_DEBUG << "memeory_layout: change offset, old offset is "
<< old_offset << ", new offset is " << offset << std::endl;
offset += input_tensor->size();
}
}
}
}
}
if (!m_disable_memory_sharing)
......
......@@ -106,6 +106,7 @@ set(SRC
pass/cpu_layout.cpp
pass/cpu_loop_kernel_fusion.cpp
pass/cpu_mat_fusion.cpp
pass/cpu_post_layout_assignment.cpp
pass/cpu_post_layout_optimizations.cpp
pass/cpu_rnn_fusion.cpp
pass/cpu_workspace_insertion.cpp
......
......@@ -32,8 +32,9 @@ namespace ngraph
template <>
void Builder::BUILDER_DECL(ngraph::op::Concat)
{
auto axis =
(static_cast<const ngraph::op::Concat*>(node))->get_concatenation_axis();
auto concat = static_cast<const ngraph::op::Concat*>(node);
auto axis = concat->get_concatenation_axis();
auto& functors = external_function->get_functors();
......@@ -48,10 +49,42 @@ namespace ngraph
arg_shapes.emplace_back(arg.get_shape());
}
}
auto nargs = args.size();
auto& out_tensor = external_function->get_tensor_data(out[0].get_name());
auto out_shape = out[0].get_shape();
auto element_size = concat->get_input_element_type(0).size();
if (auto op_annotations = concat->get_op_annotations())
{
auto in_place_oi_pairs = op_annotations->get_in_place_oi_pairs();
if (in_place_oi_pairs.size() > 0)
{
auto functor = [&, arg_tensors, nargs, out_shape, arg_shapes, element_size](
CPURuntimeContext* ctx) {
auto out_size = shape_size(out_shape) * element_size;
auto offset = 0;
for (size_t i = 0; i < nargs; i++)
{
auto arg_size = shape_size(arg_shapes[i]) * element_size;
if (arg_tensors[i] < out_tensor ||
arg_tensors[i] >=
reinterpret_cast<char*>(out_tensor) + out_size)
{
memcpy(reinterpret_cast<char*>(out_tensor) + offset,
arg_tensors[i],
arg_size);
}
offset += arg_size;
}
};
functors.emplace_back(functor);
return;
}
}
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
......
......@@ -993,6 +993,30 @@ namespace ngraph
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::Concat)
{
auto concat = static_cast<const ngraph::op::Concat*>(node);
if (auto op_annotations = concat->get_op_annotations())
{
auto in_place_oi_pairs = op_annotations->get_in_place_oi_pairs();
if (in_place_oi_pairs.size() > 0)
{
auto offset = 0;
for (auto i = 0; i < args.size(); i++)
{
writer << "if (" << args[i].get_name() << " < " << out[0].get_name()
<< " || " << args[i].get_name() << " >= " << out[0].get_name()
<< " + " << out[0].get_size() << ")\n";
writer.block_begin();
writer << "memcpy(" << out[0].get_name() << " + " << offset << ", "
<< args[i].get_name() << ", "
<< args[i].get_size() * out[0].get_element_type().size()
<< ");\n";
writer.block_end();
offset += args[i].get_size();
}
return;
}
}
auto result_shape = out[0].get_shape();
#if USE_EIGEN_CORE_INLINE == 1
......
......@@ -163,6 +163,7 @@
#include "ngraph/runtime/cpu/pass/cpu_horizontal_fusion.hpp"
#include "ngraph/runtime/cpu/pass/cpu_layout.hpp"
#include "ngraph/runtime/cpu/pass/cpu_mat_fusion.hpp"
#include "ngraph/runtime/cpu/pass/cpu_post_layout_assignment.hpp"
#include "ngraph/runtime/cpu/pass/cpu_post_layout_optimizations.hpp"
#include "ngraph/runtime/cpu/pass/cpu_rnn_fusion.hpp"
#include "ngraph/runtime/cpu/pass/cpu_workspace_insertion.hpp"
......@@ -395,6 +396,7 @@ void runtime::cpu::CPU_ExternalFunction::register_common_passes(ngraph::pass::Ma
pass_manager.register_pass<runtime::cpu::pass::CPUAssignment>(this);
pass_manager.register_pass<runtime::cpu::pass::CPULayout>(this);
pass_manager.register_pass<runtime::cpu::pass::CPUPostLayoutOptimizations>();
pass_manager.register_pass<runtime::cpu::pass::CPUPostLayoutAssignment>(this);
pass_manager.register_pass<ngraph::pass::GetOutputElementElimination>();
pass_manager.get_state().set_visualize_tree_ops_map(runtime::cpu::get_visualize_tree_ops_map());
}
......@@ -636,6 +638,42 @@ using namespace ngraph::runtime;
}
}
// concat
for (shared_ptr<Node> node : ordered_ops)
{
if (auto concat = std::dynamic_pointer_cast<ngraph::op::Concat>(node))
{
if (auto op_annotations = concat->get_op_annotations())
{
auto in_place_oi_pairs = op_annotations->get_in_place_oi_pairs();
if (in_place_oi_pairs.size() > 0)
{
bool found_last_concat = true;
for (auto user : concat->get_users())
{
if (dynamic_pointer_cast<ngraph::op::Concat>(user))
{
found_last_concat = false;
break;
}
}
if (found_last_concat)
{
for (auto arg : concat->get_arguments())
{
if (auto arg_concat = dynamic_pointer_cast<ngraph::op::Concat>(arg))
{
NGRAPH_DEBUG << "call propagate_in_place_concat for "
<< arg->get_name() << std::endl;
propagate_in_place_concat(arg_concat);
}
}
}
}
}
}
}
writer << "bool " << current_function->get_name() << "_t_en[" << tensor_index << "];\n";
writer << "extern \"C\" void " << current_function->get_name();
......@@ -1128,6 +1166,42 @@ void runtime::cpu::CPU_ExternalFunction::propagate_in_place_output(
} while (propagate_further);
}
// For a chain of in-place concat ops, propagate the pool offset of the
// outermost concat's output tensor down into the output tensors of its
// arguments, so each argument computes directly into its slice of the
// final concat buffer instead of into separate intermediate memory.
void runtime::cpu::CPU_ExternalFunction::propagate_in_place_concat(
    shared_ptr<ngraph::op::Concat> concat)
{
    // Iterative traversal of nested concats (avoids recursion).
    std::deque<std::shared_ptr<ngraph::op::Concat>> stack;
    stack.push_front(concat);

    while (stack.size() > 0)
    {
        auto it = stack.front();
        stack.pop_front();
        if (auto op_annotations = it->get_op_annotations())
        {
            auto in_place_oi_pairs = op_annotations->get_in_place_oi_pairs();
            // Only concats that were marked for in-place execution
            // (by the post-layout assignment pass) participate.
            if (in_place_oi_pairs.size() > 0)
            {
                auto output_tensor = &it->get_output_tensor();
                auto offset = output_tensor->get_pool_offset();
                // Arguments are laid out back-to-back starting at the
                // concat output's own pool offset.
                for (auto arg : it->get_arguments())
                {
                    // NOTE(review): assumes every argument is an op::Op;
                    // the dynamic_pointer_cast result is dereferenced
                    // unchecked — confirm upstream pass guarantees this.
                    auto input_node = std::dynamic_pointer_cast<ngraph::op::Op>(arg);
                    auto input_tensor = &input_node->get_output_tensor();
                    auto old_offset = input_tensor->get_pool_offset();
                    input_tensor->set_pool_offset(offset);
                    NGRAPH_DEBUG << "cpu_external_function: change offset, old offset is "
                                 << old_offset << ", new offset is " << offset << std::endl;
                    offset += input_tensor->size();
                    // An argument that is itself an in-place concat must
                    // forward its freshly assigned offset to its own
                    // arguments, so push it for processing.
                    if (auto arg_concat = std::dynamic_pointer_cast<ngraph::op::Concat>(arg))
                    {
                        stack.push_front(arg_concat);
                    }
                }
            }
        }
    }
}
void runtime::cpu::CPU_ExternalFunction::build()
{
if (m_is_built)
......@@ -1181,6 +1255,43 @@ void runtime::cpu::CPU_ExternalFunction::build()
}
// Build executor
// concat
for (shared_ptr<Node> node : m_function->get_ordered_ops())
{
if (auto concat = std::dynamic_pointer_cast<ngraph::op::Concat>(node))
{
if (auto op_annotations = concat->get_op_annotations())
{
auto in_place_oi_pairs = op_annotations->get_in_place_oi_pairs();
if (in_place_oi_pairs.size() > 0)
{
bool found_last_concat = true;
for (auto user : concat->get_users())
{
if (dynamic_pointer_cast<ngraph::op::Concat>(user))
{
found_last_concat = false;
break;
}
}
if (found_last_concat)
{
for (auto arg : concat->get_arguments())
{
if (auto arg_concat = dynamic_pointer_cast<ngraph::op::Concat>(arg))
{
NGRAPH_DEBUG << "call propagate_in_place_concat for "
<< arg->get_name() << std::endl;
propagate_in_place_concat(arg_concat);
}
}
}
}
}
}
}
// Intermediates
if (m_function->get_temporary_pool_size())
{
......
......@@ -36,6 +36,7 @@
#endif
#include "ngraph/function.hpp"
#include "ngraph/op/concat.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/runtime/cpu/cpu_call_frame.hpp"
#include "ngraph/runtime/cpu/cpu_layout_descriptor.hpp"
......@@ -157,6 +158,8 @@ namespace ngraph
void propagate_in_place_output(ngraph::descriptor::Output* res_src_output,
std::string output_name,
bool dex);
// For a chain of concat ops, propagate pool offsets
void propagate_in_place_concat(std::shared_ptr<ngraph::op::Concat> concat);
bool computes_result(Node* node);
#if !defined(NGRAPH_DEX_ONLY)
......
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "ngraph/runtime/cpu/pass/cpu_post_layout_assignment.hpp"
#include <typeindex>
#include <typeinfo>
#include "ngraph/descriptor/output.hpp"
#include "ngraph/op/concat.hpp"
#include "ngraph/runtime/cpu/cpu_op_annotations.hpp"
using namespace std;
using namespace ngraph;
namespace ngraph
{
    namespace runtime
    {
        namespace cpu
        {
            namespace pass
            {
                // Decide whether this Concat can be executed "in place", i.e.
                // its arguments can write directly into disjoint slices of the
                // concat's output buffer. If so, annotate the op with a
                // non-destructive in-place output/input pair {0, 0}.
                template <>
                void CPUPostLayoutAssignment::ASSIGN_DECL(ngraph::op::Concat)
                {
                    auto concat = static_cast<op::Concat*>(node);
                    auto shape = concat->get_input_shape(0);
                    auto axis = concat->get_concatenation_axis();
                    // In-place concat requires each argument to occupy one
                    // contiguous slab of the output. That holds only when all
                    // dimensions before the concat axis have extent 1.
                    // NOTE(review): `int i` vs. (presumably unsigned) `axis`
                    // is a signed/unsigned comparison — confirm axis type.
                    auto product = 1;
                    for (int i = 0; i < axis; i++)
                    {
                        product *= shape[i];
                    }
                    if (product != 1)
                    {
                        NGRAPH_DEBUG << "cpu_post_layout_assignment: The product of Concat's shape "
                                        "before concat axis is not 1, no in place concat";
                        return;
                    }

                    bool in_place_concat = false;
                    // Validate every input; any disqualifying input aborts the
                    // whole optimization (early return, no annotation added).
                    for (descriptor::Input& input : concat->get_inputs())
                    {
                        if (shape_size(input.get_shape()) == 0)
                        {
                            NGRAPH_DEBUG << "cpu_post_layout_assignment: 0 length tensor, no in "
                                            "place concat";
                            return;
                        }
                        const auto& output = input.get_output();
                        auto arg = output.get_node();
                        // Constants/parameters own their storage; their
                        // output tensors cannot be re-pointed into the pool.
                        if (std::dynamic_pointer_cast<op::Constant>(arg) ||
                            std::dynamic_pointer_cast<op::Parameter>(arg))
                        {
                            NGRAPH_DEBUG << "cpu_post_layout_assignment: " << arg->get_name()
                                         << ": constant or parameter, no in place concat";
                            return;
                        }
                        else if (output.get_inputs().size() != 1)
                        {
                            // The argument has other users besides this
                            // concat: check whether sharing is still safe.
                            // Bail if the same output feeds two concats.
                            auto concat_count = 0;
                            for (auto input : output.get_inputs())
                            {
                                auto user = input->get_node();
                                if (std::dynamic_pointer_cast<op::Concat>(user))
                                {
                                    concat_count++;
                                    if (concat_count == 2)
                                    {
                                        NGRAPH_DEBUG << "cpu_post_layout_assignment: multiple "
                                                        "concat users, no in place concat";
                                        return;
                                    }
                                }
                            }
                            // DFS downstream from the argument: the concat
                            // must post-dominate all other uses, and no user
                            // on the way may destructively reuse the tensor.
                            std::unordered_set<Node*> visited;
                            std::deque<Node*> stack;
                            stack.push_front(arg.get());

                            while (stack.size() > 0)
                            {
                                ngraph::Node* curr = stack.front();
                                visited.insert(curr);
                                // Reaching a graph output means the value
                                // escapes; overwriting it would be observable.
                                if (curr->is_output())
                                {
                                    NGRAPH_DEBUG << "cpu_post_layout_assignment: not post "
                                                    "dominated, no in place concat";
                                    return;
                                }
                                else
                                {
                                    if (auto op = dynamic_cast<op::Op*>(curr))
                                    {
                                        if (auto op_annotations = op->get_op_annotations())
                                        {
                                            for (auto oi_pair :
                                                 op_annotations->get_in_place_oi_pairs())
                                            {
                                                // A destructive in-place user
                                                // would clobber the shared
                                                // buffer before the concat
                                                // result is consumed.
                                                if (oi_pair.destructive)
                                                {
                                                    NGRAPH_DEBUG << "cpu_post_layout_assignment: "
                                                                    "destructive in place oi, no "
                                                                    "in place concat";
                                                    return;
                                                }
                                            }
                                        }
                                    }
                                }
                                stack.pop_front();
                                // Do not traverse past the concat itself.
                                if (curr != concat)
                                {
                                    for (auto next : curr->get_users())
                                    {
                                        if (visited.count(next.get()) == 0)
                                        {
                                            stack.push_front(next.get());
                                        }
                                    }
                                }
                            }
                            in_place_concat = true;
                        }
                        else
                        {
                            // Sole user: trivially safe to share the buffer.
                            in_place_concat = true;
                        }
                    }

                    if (in_place_concat)
                    {
                        // Record the decision as a non-destructive {0, 0}
                        // oi-pair, creating annotations if none exist yet.
                        auto op_annotations = concat->get_op_annotations();
                        if (op_annotations)
                        {
                            op_annotations->add_in_place_oi_pair({0, 0, false});
                        }
                        else
                        {
                            op_annotations =
                                std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
                            op_annotations->add_in_place_oi_pair({0, 0, false});
                            concat->set_op_annotations(op_annotations);
                        }
                    }
                }
            }
        }
    }
}
#define TI(x) type_index(typeid(x))

// Dispatch table: concrete op type -> post-layout assignment routine.
// Only Concat currently has a specialized handler; all other ops are
// simply skipped by run_on_call_graph.
static const runtime::cpu::pass::PostLayoutAssignOpMap s_dispatcher{
    {TI(ngraph::op::Concat),
     &runtime::cpu::pass::CPUPostLayoutAssignment::assign<ngraph::op::Concat>},
};
bool runtime::cpu::pass::CPUPostLayoutAssignment::run_on_call_graph(
const std::list<std::shared_ptr<Node>>& nodes)
{
for (const auto& node : nodes)
{
auto& n = *node;
auto handler = s_dispatcher.find(TI(n));
if (handler != s_dispatcher.end())
{
handler->second(m_external_function, node.get());
}
}
return false;
}
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include "ngraph/pass/pass.hpp"
#include "ngraph/runtime/cpu/cpu_external_function.hpp"
#include "ngraph/runtime/cpu/cpu_tensor_view.hpp"
#define ASSIGN_DECL(op_name) \
assign<op_name>(ngraph::runtime::cpu::CPU_ExternalFunction * external_function, \
ngraph::Node * node)
namespace ngraph
{
    namespace runtime
    {
        namespace cpu
        {
            namespace pass
            {
                /// Signature of a per-op post-layout assignment routine.
                using PostLayoutAssignFunction =
                    std::function<void(CPU_ExternalFunction*, ngraph::Node*)>;
                /// Dispatch table keyed by the concrete op's type_index.
                using PostLayoutAssignOpMap =
                    std::unordered_map<std::type_index, PostLayoutAssignFunction>;

                /// Pass run after layout assignment. Annotates ops (currently
                /// only Concat) with in-place output/input pairs so that later
                /// memory-layout passes can make a concat's arguments write
                /// directly into slices of the concat's output buffer.
                class CPUPostLayoutAssignment : public ngraph::pass::CallGraphPass
                {
                public:
                    /// \param external_function Owning external function;
                    /// stored as a non-owning pointer and forwarded to each
                    /// dispatched assignment routine.
                    /// `explicit` prevents a raw CPU_ExternalFunction* from
                    /// silently converting into a pass instance.
                    explicit CPUPostLayoutAssignment(CPU_ExternalFunction* external_function)
                        : m_external_function(external_function)
                    {
                    }

                    virtual bool
                        run_on_call_graph(const std::list<std::shared_ptr<Node>>& nodes) override;

                    /// Fallback for ops without a specialized routine; only
                    /// reachable if an op is registered in the dispatch table
                    /// without a matching template specialization.
                    template <typename OP>
                    static void
                        assign(ngraph::runtime::cpu::CPU_ExternalFunction* external_function,
                               ngraph::Node* node)
                    {
                        throw std::runtime_error("Unimplemented op '" + node->description() +
                                                 "' in CPU post layout assignment");
                    }

                private:
                    CPU_ExternalFunction* m_external_function; // non-owning
                };
            }
        }
    }
}
......@@ -641,6 +641,70 @@ NGRAPH_TEST(${BACKEND_NAME}, concat_2d_tensor)
EXPECT_EQ((vector<float>{1, 2, 3}), read_vector<float>(result));
}
NGRAPH_TEST(${BACKEND_NAME}, concat_in_place_2d_tensor)
{
    // Build concat(A+B, C+D, C-A) over 1x1 tensors. With inputs 1,2,3,4
    // the expected rows are {3, 7, 2}; exercises the in-place concat
    // optimization on the CPU backend.
    Shape shape{1, 1};
    auto A = make_shared<op::Parameter>(element::f32, shape);
    auto B = make_shared<op::Parameter>(element::f32, shape);
    auto C = make_shared<op::Parameter>(element::f32, shape);
    auto D = make_shared<op::Parameter>(element::f32, shape);
    auto add1 = make_shared<op::Add>(A, B);
    auto add2 = make_shared<op::Add>(C, D);
    auto subtract = make_shared<op::Subtract>(C, A);
    Shape shape_r{3, 1};
    auto concat = make_shared<op::Concat>(NodeVector{add1, add2, subtract}, 0);
    auto f = make_shared<Function>(concat, op::ParameterVector{A, B, C, D});

    auto backend = runtime::Backend::create("${BACKEND_NAME}");

    // Input tensors initialized to 1..4, one per parameter.
    auto t_a = backend->create_tensor(element::f32, shape);
    copy_data(t_a, vector<float>{1});
    auto t_b = backend->create_tensor(element::f32, shape);
    copy_data(t_b, vector<float>{2});
    auto t_c = backend->create_tensor(element::f32, shape);
    copy_data(t_c, vector<float>{3});
    auto t_d = backend->create_tensor(element::f32, shape);
    copy_data(t_d, vector<float>{4});
    auto result = backend->create_tensor(element::f32, shape_r);

    backend->call_with_validate(f, {result}, {t_a, t_b, t_c, t_d});
    EXPECT_EQ((vector<float>{3, 7, 2}), read_vector<float>(result));
}
NGRAPH_TEST(${BACKEND_NAME}, concat_in_place_propagate_2d_tensor)
{
    // Nested concat: concat(concat(A+B, C+D), C-A). With inputs 1,2,3,4
    // the expected result is {3, 7, 2}. The inner concat feeding the outer
    // one exercises offset propagation through a chain of in-place concats
    // (propagate_in_place_concat on the CPU backend).
    Shape shape{1, 1};
    auto A = make_shared<op::Parameter>(element::f32, shape);
    auto B = make_shared<op::Parameter>(element::f32, shape);
    auto add1 = make_shared<op::Add>(A, B);
    auto C = make_shared<op::Parameter>(element::f32, shape);
    auto D = make_shared<op::Parameter>(element::f32, shape);
    auto add2 = make_shared<op::Add>(C, D);
    auto concat1 = make_shared<op::Concat>(NodeVector{add1, add2}, 0);
    auto subtract = make_shared<op::Subtract>(C, A);
    Shape shape_r{3, 1};
    auto f = make_shared<Function>(make_shared<op::Concat>(NodeVector{concat1, subtract}, 0),
                                   op::ParameterVector{A, B, C, D});

    auto backend = runtime::Backend::create("${BACKEND_NAME}");

    // Create some tensors for input/output
    auto a = backend->create_tensor(element::f32, shape);
    copy_data(a, vector<float>{1});
    auto b = backend->create_tensor(element::f32, shape);
    copy_data(b, vector<float>{2});
    auto c = backend->create_tensor(element::f32, shape);
    copy_data(c, vector<float>{3});
    auto d = backend->create_tensor(element::f32, shape);
    copy_data(d, vector<float>{4});
    auto result = backend->create_tensor(element::f32, shape_r);

    backend->call_with_validate(f, {result}, {a, b, c, d});
    EXPECT_EQ((vector<float>{3, 7, 2}), read_vector<float>(result));
}
// from numpy import *
// a=linspace(1,2*3*4*3*2,2*3*4*3*2)
// b=linspace(1000+1,1000+2*3*3*3*2,2*3*3*3*2)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment