Commit e765956a authored by Jayaram Bobba's avatar Jayaram Bobba Committed by Scott Cyphers

IAT: Collapse dimensions around arithmetic reduction operations (#1763)

* Collapse dimensions for arithmetic reduction ops to support faster kernels

* Propagate in-place constants and allow in-place reshapes for more cases

* style fix

* Additional checks for parameter and constant to help backends that don't propagate in-place parameter and constant inputs

* Allow non-destructive pass-through only if memory sharing is disabled

* Address PR feedback

* Bug fix for collapse dimensions in case of null reduction
parent 1beec46b
...@@ -51,8 +51,10 @@ bool pass::MemoryLayout::run_on_function(shared_ptr<ngraph::Function> function) ...@@ -51,8 +51,10 @@ bool pass::MemoryLayout::run_on_function(shared_ptr<ngraph::Function> function)
auto input = &node->get_inputs().at(oi_pair.input).get_tensor(); auto input = &node->get_inputs().at(oi_pair.input).get_tensor();
auto input_node = node->get_inputs().at(oi_pair.input).get_output().get_node(); auto input_node = node->get_inputs().at(oi_pair.input).get_output().get_node();
// an input tensor can be reused if this is the last use // For destructive kernel, this should be the last use
if (node->liveness_free_list.count(input) != 0 && // Non-destructive kernels can pass through if memory sharing is disabled
if ((node->liveness_free_list.count(input) != 0 ||
(m_disable_memory_sharing && !oi_pair.destructive)) &&
node->liveness_new_list.count(output) != 0) node->liveness_new_list.count(output) != 0)
{ {
in_place_outputs.insert({output, input}); in_place_outputs.insert({output, input});
......
...@@ -551,7 +551,7 @@ using namespace ngraph::runtime; ...@@ -551,7 +551,7 @@ using namespace ngraph::runtime;
{ {
for (shared_ptr<Node> node : function_ordered_ops.at(current_function)) for (shared_ptr<Node> node : function_ordered_ops.at(current_function))
{ {
const ngraph::op::Constant* c = dynamic_cast<ngraph::op::Constant*>(node.get()); ngraph::op::Constant* c = dynamic_cast<ngraph::op::Constant*>(node.get());
if (c) if (c)
{ {
m_active_constants.push_back(node); m_active_constants.push_back(node);
...@@ -677,6 +677,15 @@ using namespace ngraph::runtime; ...@@ -677,6 +677,15 @@ using namespace ngraph::runtime;
"(*(ctx->G), [&](const tbb::flow::continue_msg &msg)\n{});\n"; "(*(ctx->G), [&](const tbb::flow::continue_msg &msg)\n{});\n";
} }
// For every Constant in the ordered op list, forward the name of its first
// output tensor to downstream in-place consumers. dex is passed as false here.
for (shared_ptr<Node> op : ordered_ops)
{
    const bool is_constant = dynamic_cast<ngraph::op::Constant*>(op.get()) != nullptr;
    if (!is_constant)
    {
        continue;
    }
    shared_ptr<descriptor::Tensor> constant_tensor = op->get_outputs()[0].get_tensor_ptr();
    propagate_in_place_constant(&op->get_outputs().at(0), constant_tensor->get_name(), false);
}
// Add inputs to the variable name map // Add inputs to the variable name map
size_t arg_index = 0; size_t arg_index = 0;
for (shared_ptr<ngraph::op::Parameter> param : current_function->get_parameters()) for (shared_ptr<ngraph::op::Parameter> param : current_function->get_parameters())
...@@ -1102,6 +1111,53 @@ void runtime::cpu::CPU_ExternalFunction::propagate_in_place_input( ...@@ -1102,6 +1111,53 @@ void runtime::cpu::CPU_ExternalFunction::propagate_in_place_input(
} }
} }
// Forwards a constant's buffer name through chains of in-place,
// non-destructive ops.
//
// Starting at `output`, every consumer of the current output is examined.
// Consumers that are not ngraph::op::Op instances, that are function outputs,
// or that carry no op annotations are ignored. For each annotated in-place
// pair that reads the consumed input non-destructively, the pair's output
// tensor is:
//   - aliased to `input_name` (in tensor_alias when `dex` is true, otherwise
//     in m_variable_name_map),
//   - marked as CPUTensorRole::CONSTANT in m_tensor_roles,
//   - queued so that its own consumers are processed in turn.
//
// output:     output whose consumers are walked first
// input_name: name that forwarded output tensors are mapped to
// dex:        selects which name map receives the alias (see above)
void runtime::cpu::CPU_ExternalFunction::propagate_in_place_constant(
    ngraph::descriptor::Output* output, std::string input_name, bool dex)
{
    // FIFO worklist of outputs whose consumers still need to be examined.
    std::deque<ngraph::descriptor::Output*> pending;
    pending.push_back(output);

    while (!pending.empty())
    {
        ngraph::descriptor::Output* current = pending.front();
        pending.pop_front();

        for (auto consumer : current->get_inputs())
        {
            auto consumer_op = std::dynamic_pointer_cast<ngraph::op::Op>(consumer->get_node());
            if (!consumer_op || consumer_op->is_output())
            {
                continue;
            }

            auto annotations = consumer_op->get_op_annotations();
            if (!annotations)
            {
                continue;
            }

            for (auto oi : annotations->get_in_place_oi_pairs())
            {
                // Only pairs fed by this very input, and only when the kernel
                // does not overwrite that input.
                if (oi.input != consumer->get_index() || oi.destructive)
                {
                    continue;
                }

                auto& forwarded_output = consumer_op->get_outputs().at(oi.output);
                auto& forwarded_tensor = forwarded_output.get_tensor();
                const auto& forwarded_name = forwarded_tensor.get_name();

                if (!dex)
                {
                    m_variable_name_map[forwarded_name] = input_name;
                }
                else
                {
                    tensor_alias[forwarded_name] = input_name;
                }
                m_tensor_roles[forwarded_name] = CPUTensorRole::CONSTANT;

                NGRAPH_DEBUG << " CPU: Forwarding " << input_name << " through "
                             << forwarded_name;

                // Keep following the chain from the output that was just aliased.
                pending.push_back(&forwarded_output);
            }
        }
    }
}
void runtime::cpu::CPU_ExternalFunction::propagate_in_place_output( void runtime::cpu::CPU_ExternalFunction::propagate_in_place_output(
ngraph::descriptor::Output* res_src_output, std::string output_name, bool dex) ngraph::descriptor::Output* res_src_output, std::string output_name, bool dex)
{ {
...@@ -1239,6 +1295,7 @@ void runtime::cpu::CPU_ExternalFunction::build() ...@@ -1239,6 +1295,7 @@ void runtime::cpu::CPU_ExternalFunction::build()
tensor_data[tv->get_name()] = tensor_data[tv->get_name()] =
const_cast<void*>(static_pointer_cast<ngraph::op::Constant>(node)->get_data_ptr()); const_cast<void*>(static_pointer_cast<ngraph::op::Constant>(node)->get_data_ptr());
m_tensor_roles[tv->get_name()] = CPUTensorRole::CONSTANT; m_tensor_roles[tv->get_name()] = CPUTensorRole::CONSTANT;
propagate_in_place_constant(&node->get_outputs().at(0), tv->get_name(), true);
} }
} }
......
...@@ -174,6 +174,11 @@ namespace ngraph ...@@ -174,6 +174,11 @@ namespace ngraph
// Register passes that are common to codegen and DEX // Register passes that are common to codegen and DEX
void register_common_passes(ngraph::pass::Manager& pass_manager); void register_common_passes(ngraph::pass::Manager& pass_manager);
// For non-destructive passthrough kernels, propagate function
// constant buffers to internal ops.
//   output     - output to start from; its consumers are walked transitively
//   input_name - name that forwarded output tensors are mapped to
//   dex        - true: record the mapping in tensor_alias;
//                false: record it in m_variable_name_map
void propagate_in_place_constant(ngraph::descriptor::Output* output,
std::string input_name,
bool dex);
// For non-destructive passthrough kernels, propagate function // For non-destructive passthrough kernels, propagate function
// input buffers to internal ops // input buffers to internal ops
void propagate_in_place_input(ngraph::descriptor::Output* output, void propagate_in_place_input(ngraph::descriptor::Output* output,
......
...@@ -478,3 +478,37 @@ TEST(cpu_test, reshape_layout_optimizations7) ...@@ -478,3 +478,37 @@ TEST(cpu_test, reshape_layout_optimizations7)
} }
EXPECT_EQ(count_ops_of_type<runtime::cpu::op::ConvertLayout>(cpu_f), 0); EXPECT_EQ(count_ops_of_type<runtime::cpu::op::ConvertLayout>(cpu_f), 0);
} }
TEST(cpu_test, collapse_dims1)
{
    // Two chained single-axis Sum reductions over a 5-D input. The CPU result
    // must match the INTERPRETER result, and the compiled CPU graph must
    // contain the expected number of Reshape ops.
    auto build_graph = []() -> std::shared_ptr<Function> {
        auto input = make_shared<op::Parameter>(element::f32, Shape{1, 4, 10, 6, 10});
        auto reduce_axis1 = make_shared<op::Sum>(input, AxisVector{1}); // Shape{1, 10, 6, 10}
        auto reduce_axis0 =
            make_shared<op::Sum>(reduce_axis1, AxisVector{0}); // Shape{10, 6, 10}
        return make_shared<Function>(NodeVector{reduce_axis0}, op::ParameterVector{input});
    };

    // NOTE(review): `backend` is not referenced below; presumably kept for
    // Backend::create's side effects -- confirm before removing.
    auto backend = runtime::Backend::create("CPU");
    auto cpu_f = build_graph();
    auto int_f = build_graph();

    // One uniformly random input buffer per function parameter.
    test::Uniform<float> rng(-100.0f, 100.0f);
    vector<vector<float>> args;
    for (shared_ptr<op::Parameter> param : cpu_f->get_parameters())
    {
        args.emplace_back(shape_size(param->get_shape()));
        rng.initialize(args.back());
    }

    // Reference run first, then the backend under test.
    const auto reference_results = execute(int_f, args, "INTERPRETER");
    const auto actual_results = execute(cpu_f, args, "CPU");
    for (size_t idx = 0; idx != actual_results.size(); ++idx)
    {
        EXPECT_TRUE(test::all_close(actual_results.at(idx), reference_results.at(idx)));
    }

    // The first Sum gets a Reshape on each side of it; the second Sum is
    // replaced by a Reshape. That gives three Reshape ops in total.
    EXPECT_EQ(count_ops_of_type<op::Reshape>(cpu_f), 3);
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment