Commit 65355a17 authored by Amy Zhuang, committed by Robert Kimball

Add in-place-slice optimization for CPU backend. (#1967)

* Add in-place-slice optimization for CPU backend.

* Modify slice emitter function for in place slice.

* Allow arg node to have multiple outputs for in place slice.

* Remove unused variable.

* Add CPUExecutionContext argument to slice builder.

* Address PR feedback: move computation out of the functor.

* Move size computation out of the functor for in place concat.
parent dfc20454
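The heart of the change: when a Slice selects a contiguous, unstrided block of its input (and the input is neither a Constant nor a Parameter), the slice output can simply alias the input buffer at a byte offset derived from the lower bounds, so no copy is needed at run time. Below is a minimal standalone sketch of that offset computation, assuming row-major layout; the helper name compute_slice_byte_offset is invented for illustration and is not part of the diff.

#include <cstddef>
#include <vector>

// Byte offset of the first element selected by a slice, for a row-major tensor.
std::size_t compute_slice_byte_offset(const std::vector<std::size_t>& arg_shape,
                                      const std::vector<std::size_t>& lower_bounds,
                                      std::size_t element_size)
{
    std::size_t start = 0, accumulated = 1;
    for (int i = static_cast<int>(arg_shape.size()) - 1; i >= 0; i--)
    {
        start += lower_bounds[i] * accumulated;
        accumulated *= arg_shape[i];
    }
    return start * element_size;
}

// Example: slicing rows [1, 3) of a 4x4 f32 tensor gives start = 4,
// i.e. a byte offset of 4 * sizeof(float) = 16.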
@@ -20,6 +20,7 @@
#include "ngraph/log.hpp"
#include "ngraph/log.hpp"
#include "ngraph/op/concat.hpp"
#include "ngraph/op/slice.hpp"
#include "ngraph/pass/liveness.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/memory_layout.hpp"
@@ -44,8 +45,9 @@ bool pass::MemoryLayout::run_on_function(shared_ptr<ngraph::Function> function)
if (auto op = std::dynamic_pointer_cast<op::Op>(node))
{
// concat in_place_oi should be treated differently
if (!std::dynamic_pointer_cast<op::Concat>(node))
// concat and slice in_place_oi should be treated differently
if (!std::dynamic_pointer_cast<op::Concat>(node) &&
!std::dynamic_pointer_cast<op::Slice>(node))
{
if (auto op_annotations = op->get_op_annotations())
{
......
@@ -39,6 +39,8 @@ namespace ngraph
vector<reference_wrapper<void*>> arg_tensors;
vector<Shape> arg_shapes;
vector<size_t> arg_sizes;
auto element_size = concat->get_input_element_type(0).size();
for (auto& arg : args)
{
if (shape_size(arg.get_shape()))
@@ -46,6 +48,7 @@ namespace ngraph
arg_tensors.emplace_back(
external_function->get_tensor_data(arg.get_name()));
arg_shapes.emplace_back(arg.get_shape());
arg_sizes.emplace_back(shape_size(arg.get_shape()) * element_size);
}
}
auto nargs = args.size();
@@ -53,19 +56,18 @@ namespace ngraph
auto& out_tensor = external_function->get_tensor_data(out[0].get_name());
auto out_shape = out[0].get_shape();
auto element_size = concat->get_input_element_type(0).size();
if (auto op_annotations = concat->get_op_annotations())
{
auto in_place_oi_pairs = op_annotations->get_in_place_oi_pairs();
if (in_place_oi_pairs.size() > 0)
{
auto functor = [&, arg_tensors, nargs, out_shape, arg_shapes, element_size](
auto out_size = shape_size(out_shape) * element_size;
auto functor = [&, arg_tensors, nargs, out_size, arg_sizes](
CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
auto out_size = shape_size(out_shape) * element_size;
auto offset = 0;
for (size_t i = 0; i < nargs; i++)
{
auto arg_size = shape_size(arg_shapes[i]) * element_size;
// if the argument pointer does not fall within the concat output buffer
// (caused by propagate_in_place_output or propagate_in_place_input), we need to copy the data;
// otherwise, we can skip the copy.
@@ -75,9 +77,9 @@ namespace ngraph
{
memcpy(reinterpret_cast<char*>(out_tensor) + offset,
arg_tensors[i],
arg_size);
arg_sizes[i]);
}
offset += arg_size;
offset += arg_sizes[i];
}
};
......
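This hunk applies the PR feedback from the commit message: the shape/size arithmetic is hoisted out of the functor and the precomputed byte sizes are captured by value, so the lambda that runs per call only does pointer checks and memcpy. A rough standalone sketch of the pattern, with invented names rather than the actual CPU builder API:

#include <cstddef>
#include <cstring>
#include <functional>
#include <vector>

using ConcatFunctor = std::function<void(char* out, const std::vector<char*>& args)>;

// Build time: sizes are computed once. Run time: the functor only copies.
ConcatFunctor make_concat_functor(std::vector<std::size_t> arg_sizes, std::size_t out_size)
{
    return [arg_sizes, out_size](char* out, const std::vector<char*>& args) {
        std::size_t offset = 0;
        for (std::size_t i = 0; i < args.size(); i++)
        {
            // Skip the copy when the argument already lives inside the output
            // buffer (placed there by the in-place propagation passes).
            if (args[i] < out || args[i] >= out + out_size)
            {
                std::memcpy(out + offset, args[i], arg_sizes[i]);
            }
            offset += arg_sizes[i];
        }
    };
}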
@@ -48,6 +48,37 @@ namespace ngraph
auto lower_bounds = slice->get_lower_bounds();
auto upper_bounds = slice->get_upper_bounds();
if (auto op_annotations = slice->get_op_annotations())
{
auto in_place_oi_pairs = op_annotations->get_in_place_oi_pairs();
if (in_place_oi_pairs.size() > 0)
{
auto element_size = slice->get_input_element_type(0).size();
auto start = 0, accumulated = 1;
for (int i = arg_shape.size() - 1; i >= 0; i--)
{
start += lower_bounds[i] * accumulated;
accumulated *= arg_shape[i];
}
auto out_size = shape_size(out_shape) * element_size;
auto arg_size = shape_size(arg_shape) * element_size;
auto offset = start * element_size;
auto functor = [&, out_size, arg_size, offset](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) {
if (out_tensor < arg_tensor ||
out_tensor >= reinterpret_cast<char*>(arg_tensor) + arg_size)
{
memcpy(out_tensor,
reinterpret_cast<char*>(arg_tensor) + offset,
out_size);
}
};
functors.emplace_back(functor);
return;
}
}
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
......
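A worked example for the builder above (numbers invented for illustration): slicing the last two rows of a row-major 4x4 f32 tensor has lower_bounds {2, 0}, so start = 0*1 + 2*4 = 8, offset = 8 * 4 = 32 bytes, and out_size = 2 * 4 * 4 = 32 bytes. The functor's memcpy fires only when out_tensor does not already point inside [arg_tensor, arg_tensor + arg_size); normally process_in_place_slice has already rewritten the output's pool offset so that it does, and the copy is skipped.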
@@ -2010,6 +2010,32 @@ namespace ngraph
{
const ngraph::op::Slice* slice = static_cast<const ngraph::op::Slice*>(node);
if (auto op_annotations = slice->get_op_annotations())
{
auto in_place_oi_pairs = op_annotations->get_in_place_oi_pairs();
if (in_place_oi_pairs.size() > 0)
{
auto arg_shape = args[0].get_shape();
auto lower_bounds = slice->get_lower_bounds();
auto start = 0, accumulated = 1;
for (int i = arg_shape.size() - 1; i >= 0; i--)
{
start += lower_bounds[i] * accumulated;
accumulated *= arg_shape[i];
}
writer << "if (" << out[0].get_name() << " < " << args[0].get_name()
<< " || " << out[0].get_name() << " >= " << args[0].get_name()
<< " + " << args[0].get_size() * out[0].get_element_type().size()
<< ")\n";
writer.block_begin();
writer << "memcpy(" << out[0].get_name() << ", " << args[0].get_name()
<< " + " << start * out[0].get_element_type().size() << ", "
<< out[0].get_size() * out[0].get_element_type().size() << ");\n";
writer.block_end();
return;
}
}
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
auto out_shape = out[0].get_shape();
......
@@ -634,6 +634,9 @@ using namespace ngraph::runtime;
}
}
// In place slice optimization
process_in_place_slice(ordered_ops);
// In place concatenation optimization
process_in_place_concat(ordered_ops);
@@ -1085,7 +1088,7 @@ void runtime::cpu::CPU_ExternalFunction::propagate_in_place_input(
for (auto input : it->get_inputs())
{
auto c_op = std::dynamic_pointer_cast<ngraph::op::Op>(input->get_node());
if (!c_op || c_op->is_output())
if (!c_op || c_op->is_output() || dynamic_pointer_cast<ngraph::op::Slice>(c_op))
{
continue;
}
@@ -1179,7 +1182,7 @@ void runtime::cpu::CPU_ExternalFunction::propagate_in_place_output(
{
propagate_further = false;
auto arg = std::dynamic_pointer_cast<ngraph::op::Op>(it->get_node());
if (!arg)
if (!arg || std::dynamic_pointer_cast<ngraph::op::Slice>(it->get_node()))
{
break;
}
@@ -1316,6 +1319,46 @@ void runtime::cpu::CPU_ExternalFunction::propagate_in_place_concat(
}
}
//slice
void runtime::cpu::CPU_ExternalFunction::process_in_place_slice(
std::list<std::shared_ptr<Node>> nodes)
{
for (shared_ptr<Node>& node : nodes)
{
if (auto slice = std::dynamic_pointer_cast<ngraph::op::Slice>(node))
{
if (auto op_annotations = slice->get_op_annotations())
{
auto in_place_oi_pairs = op_annotations->get_in_place_oi_pairs();
if (in_place_oi_pairs.size() > 0)
{
auto arg = slice->get_argument(0);
auto input = slice->get_input_from(arg);
auto index = input->get_output().get_index();
auto input_node = std::dynamic_pointer_cast<ngraph::op::Op>(arg);
auto input_tensor = &input_node->get_output_tensor(index);
auto offset = input_tensor->get_pool_offset();
auto lower_bounds = slice->get_lower_bounds();
auto start = 0, accumulated = 1;
auto in_shape = slice->get_input_shape(0);
for (int i = in_shape.size() - 1; i >= 0; i--)
{
start += lower_bounds[i] * accumulated;
accumulated *= in_shape[i];
}
offset += node->get_element_type().size() * start;
auto output_tensor = &slice->get_output_tensor();
auto old_offset = output_tensor->get_pool_offset();
output_tensor->set_pool_offset(offset);
NGRAPH_DEBUG << "cpu_external_function: change offset, old offset is "
<< old_offset << ", new offset is " << offset << std::endl;
}
}
}
}
}
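To make the offset bookkeeping concrete (illustrative numbers, not from the diff): if the slice input tensor sits at pool offset 1024, the input is a 4x4 f32 tensor, and lower_bounds is {1, 0}, then start = 4 and the slice output tensor's pool offset is rewritten to 1024 + 4 * 4 = 1040, so consumers of the slice read directly out of the producer's buffer without a copy.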
void runtime::cpu::CPU_ExternalFunction::build()
{
if (m_is_built)
@@ -1378,6 +1421,9 @@ void runtime::cpu::CPU_ExternalFunction::build()
// Build executor
// In place slice optimization
process_in_place_slice(m_function->get_ordered_ops());
// In place concatenation optimization
process_in_place_concat(m_function->get_ordered_ops());
......
@@ -205,6 +205,10 @@ namespace ngraph
// For a chain of concat ops, propagate memory pool offsets
void propagate_in_place_concat(std::shared_ptr<ngraph::op::Concat> concat);
// Find in-place slice ops and set appropriate memory pool offset for its output
void process_in_place_slice(std::list<std::shared_ptr<Node>> nodes);
bool computes_result(Node* node);
void release_function() { m_function = nullptr; }
#if !defined(NGRAPH_DEX_ONLY)
......
@@ -52,6 +52,7 @@
#include "ngraph/graph_util.hpp"
#include "ngraph/op/concat.hpp"
#include "ngraph/op/constant.hpp"
#include "ngraph/op/slice.hpp"
#include "ngraph/runtime/cpu/cpu_op_annotations.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
@@ -219,5 +220,87 @@ bool runtime::cpu::pass::CPUMemoryOptimization::run_on_function(std::shared_ptr<
}
}
}
for (auto n : function->get_ordered_ops())
{
if (auto slice = std::dynamic_pointer_cast<op::Slice>(n))
{
auto in_shape = slice->get_input_shape(0);
auto out_shape = slice->get_output_shape(0);
auto strides = slice->get_strides();
auto lower_bounds = slice->get_lower_bounds();
auto upper_bounds = slice->get_upper_bounds();
auto arg = slice->get_argument(0);
if (std::dynamic_pointer_cast<op::Constant>(arg) ||
std::dynamic_pointer_cast<op::Parameter>(arg))
{
NGRAPH_DEBUG << "cpu_memory_optimization: " << arg->get_name()
<< ": constant or parameter, no in place slice";
continue;
}
if (is_strided(strides))
{
NGRAPH_DEBUG << "cpu_memory_optimization: strided slice, no in place slice";
continue;
}
auto product = 1;
int axis = in_shape.size() - 1;
for (int i = in_shape.size() - 1; i >= 0; i--)
{
if (in_shape[i] != out_shape[i])
{
axis = i;
break;
}
}
for (int i = 0; i < axis; i++)
{
product *= in_shape[i];
}
if (product != 1)
{
NGRAPH_DEBUG << "cpu_memory_optimization: The product of input shape "
"before slice axis is not 1, no in place slice";
continue;
}
// check if input and output formats are the same
auto output_md = mkldnn_utils::get_output_mkldnn_md(n.get(), 0);
auto output_format = static_cast<mkldnn::memory::format>(output_md.data.format);
auto input_md = mkldnn_utils::get_input_mkldnn_md(n.get(), 0);
auto input_format = static_cast<mkldnn::memory::format>(input_md.data.format);
if (output_format != input_format)
{
NGRAPH_DEBUG << "cpu_memory_optimization: input format is different from "
"output format, no in place slice";
continue;
}
// check if input layout is padded
AxisVector axis_list = ngraph::get_default_order(in_shape);
if (mkldnn_utils::is_mkldnn_padded_layout(input_md, axis_list))
{
NGRAPH_DEBUG << "cpu_memory_optimization: padded input layout, no in place slice";
continue;
}
auto op_annotations = slice->get_op_annotations();
if (op_annotations)
{
op_annotations->add_in_place_oi_pair({0, 0, false});
}
else
{
op_annotations = std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
op_annotations->add_in_place_oi_pair({0, 0, false});
slice->set_op_annotations(op_annotations);
}
}
}
return false;
}
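The decisive condition in this pass is that every dimension before the innermost sliced axis collapses to extent 1, which guarantees the selected elements form one contiguous row-major block. A small standalone restatement of that check (sketch only; the helper name slice_is_contiguous is invented):

#include <cstddef>
#include <vector>

// Can a slice from in_shape to out_shape alias its input buffer in place?
bool slice_is_contiguous(const std::vector<std::size_t>& in_shape,
                         const std::vector<std::size_t>& out_shape)
{
    // Innermost axis whose extent changes, scanning from the last dimension,
    // mirroring the loop in the pass.
    int axis = static_cast<int>(in_shape.size()) - 1;
    for (int i = static_cast<int>(in_shape.size()) - 1; i >= 0; i--)
    {
        if (in_shape[i] != out_shape[i])
        {
            axis = i;
            break;
        }
    }
    // Every dimension before that axis must have extent 1.
    std::size_t product = 1;
    for (int i = 0; i < axis; i++)
    {
        product *= in_shape[i];
    }
    return product == 1;
}

// e.g. {1, 4, 4} -> {1, 2, 4} is contiguous, so the pass adds the in-place pair;
//      {2, 4, 4} -> {2, 2, 4} is not, so the slice kernel keeps copying.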
@@ -1138,6 +1138,31 @@ NGRAPH_TEST(${BACKEND_NAME}, slice_vector)
EXPECT_EQ((vector<float>{2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}), read_vector<float>(result));
}
NGRAPH_TEST(${BACKEND_NAME}, slice_matrix_axis_0_overlap)
{
Shape shape_a{4, 4};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
auto B = make_shared<op::Parameter>(element::f32, shape_a);
auto C = make_shared<op::Add>(A, B);
Shape shape_r{2, 4};
auto D = make_shared<op::Slice>(C, Coordinate{0, 0}, Coordinate{2, 4});
auto E = make_shared<op::Slice>(C, Coordinate{1, 0}, Coordinate{3, 4});
auto r = make_shared<op::Add>(D, E);
auto f = make_shared<Function>(r, op::ParameterVector{A, B});
auto backend = runtime::Backend::create("${BACKEND_NAME}");
// Create some tensors for input/output
auto a = backend->create_tensor(element::f32, shape_a);
copy_data(a, vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
auto b = backend->create_tensor(element::f32, shape_a);
copy_data(b, vector<float>{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
auto result = backend->create_tensor(element::f32, shape_r);
backend->call_with_validate(f, {result}, {a, b});
EXPECT_EQ((vector<float>{12, 16, 20, 24, 28, 32, 36, 40}), read_vector<float>(result));
}
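The expected values can be checked by hand (worked out here for clarity, not part of the test): C = A + B is {2, 4, ..., 32} in row-major order, D takes rows 0-1 of C ({2, 4, 6, 8, 10, 12, 14, 16}), E takes rows 1-2 ({10, 12, 14, 16, 18, 20, 22, 24}), and D + E = {12, 16, 20, 24, 28, 32, 36, 40}. The two slices share row 1 of C, so both outputs can alias overlapping regions of the same buffer, which is presumably the scenario this new test targets.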
NGRAPH_TEST(${BACKEND_NAME}, slice_matrix_strided)
{
Shape shape_a{4, 4};
......