Commit 626536ad authored by Nick Korovaiko's avatar Nick Korovaiko Committed by Robert Kimball

Simplify result copy elimination (#1345)

* simplify result copy elimination

* gpu fix

* remove include header

* circumvent gpu issue

* add a whitepace
parent 09242c31
...@@ -128,7 +128,6 @@ set (SRC ...@@ -128,7 +128,6 @@ set (SRC
pass/nop_elimination.cpp pass/nop_elimination.cpp
pass/pass.cpp pass/pass.cpp
pass/reshape_elimination.cpp pass/reshape_elimination.cpp
pass/result_copy_elimination.cpp
pass/zero_dim_tensor_elimination.cpp pass/zero_dim_tensor_elimination.cpp
pass/validate_graph.cpp pass/validate_graph.cpp
pass/visualize_tree.cpp pass/visualize_tree.cpp
......
...@@ -47,7 +47,6 @@ shared_ptr<Node> op::Result::copy_with_new_args(const NodeVector& new_args) cons ...@@ -47,7 +47,6 @@ shared_ptr<Node> op::Result::copy_with_new_args(const NodeVector& new_args) cons
auto res = make_shared<Result>(new_args.at(0)); auto res = make_shared<Result>(new_args.at(0));
if (res) if (res)
{ {
res->set_needs_copy(m_needs_copy);
res->set_needs_default_layout(m_needs_default_layout); res->set_needs_default_layout(m_needs_default_layout);
} }
return res; return res;
......
...@@ -38,8 +38,6 @@ namespace ngraph ...@@ -38,8 +38,6 @@ namespace ngraph
copy_with_new_args(const NodeVector& new_args) const override; copy_with_new_args(const NodeVector& new_args) const override;
virtual bool is_output() const override { return true; } virtual bool is_output() const override { return true; }
void set_needs_copy(bool val) { m_needs_copy = val; }
bool needs_copy() const { return m_needs_copy; }
void set_needs_default_layout(bool val) { m_needs_default_layout = val; } void set_needs_default_layout(bool val) { m_needs_default_layout = val; }
bool needs_default_layout() const { return m_needs_default_layout; } bool needs_default_layout() const { return m_needs_default_layout; }
protected: protected:
...@@ -47,7 +45,6 @@ namespace ngraph ...@@ -47,7 +45,6 @@ namespace ngraph
const NodeVector& deltas) override; const NodeVector& deltas) override;
private: private:
bool m_needs_copy{true};
bool m_needs_default_layout{false}; bool m_needs_default_layout{false};
}; };
} }
......
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "result_copy_elimination.hpp"
#include "ngraph/node.hpp"
#include "ngraph/op/parameter.hpp"
#include "ngraph/op/result.hpp"
#include "ngraph/util.hpp"
bool ngraph::pass::ResultCopyElimination::run_on_function(std::shared_ptr<ngraph::Function> f)
{
std::set<std::shared_ptr<Node>> seen;
for (auto res : f->get_results())
{
auto arg = res->get_argument(0);
// we need a copy
if (arg->is_parameter() || arg->is_constant())
{
continue;
}
// TODO: check if broadcast replace op::Result w/ a copy of broadcast node
// TODO: consider other cases where it's easier to recompute than make a copy
// we will compute the result directly into output[]
if (seen.count(arg) == 0)
{
res->set_needs_copy(false);
seen.insert(arg);
}
}
return true;
}
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include "ngraph/pass/pass.hpp"
namespace ngraph
{
namespace pass
{
class ResultCopyElimination;
}
}
class ngraph::pass::ResultCopyElimination : public ngraph::pass::FunctionPass
{
public:
ResultCopyElimination()
: FunctionPass()
{
}
virtual bool run_on_function(std::shared_ptr<ngraph::Function> f) override;
};
...@@ -4231,10 +4231,9 @@ namespace ngraph ...@@ -4231,10 +4231,9 @@ namespace ngraph
template <> template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::Result) void CPU_Emitter::EMITTER_DECL(ngraph::op::Result)
{ {
const ngraph::op::Result* result = static_cast<const ngraph::op::Result*>(node); if (args[0].get_name() == out[0].get_name())
if (!result->needs_copy())
{ {
writer << "// Skipping generation for " << node->get_name() << "\n";
return; return;
} }
......
...@@ -126,7 +126,6 @@ ...@@ -126,7 +126,6 @@
#include "ngraph/pass/manager.hpp" #include "ngraph/pass/manager.hpp"
#include "ngraph/pass/memory_layout.hpp" #include "ngraph/pass/memory_layout.hpp"
#include "ngraph/pass/nop_elimination.hpp" #include "ngraph/pass/nop_elimination.hpp"
#include "ngraph/pass/result_copy_elimination.hpp"
#include "ngraph/runtime/aligned_buffer.hpp" #include "ngraph/runtime/aligned_buffer.hpp"
#include "ngraph/runtime/cpu/cpu_backend.hpp" #include "ngraph/runtime/cpu/cpu_backend.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp" #include "ngraph/runtime/cpu/cpu_builder.hpp"
...@@ -389,7 +388,6 @@ void runtime::cpu::CPU_ExternalFunction::compile() ...@@ -389,7 +388,6 @@ void runtime::cpu::CPU_ExternalFunction::compile()
pass_manager.register_pass<runtime::cpu::pass::CPUAssignment>(this); pass_manager.register_pass<runtime::cpu::pass::CPUAssignment>(this);
pass_manager.register_pass<runtime::cpu::pass::CPULayout>(this); pass_manager.register_pass<runtime::cpu::pass::CPULayout>(this);
pass_manager.register_pass<runtime::cpu::pass::CPUPostLayoutOptimizations>(); pass_manager.register_pass<runtime::cpu::pass::CPUPostLayoutOptimizations>();
pass_manager.register_pass<ngraph::pass::ResultCopyElimination>();
pass_manager.register_pass<ngraph::pass::GetOutputElementElimination>(); pass_manager.register_pass<ngraph::pass::GetOutputElementElimination>();
unordered_map<Node*, Node*> node_function_map; unordered_map<Node*, Node*> node_function_map;
string common_function_string; string common_function_string;
...@@ -694,14 +692,15 @@ using namespace ngraph::runtime; ...@@ -694,14 +692,15 @@ using namespace ngraph::runtime;
m_variable_name_map[tv->get_tensor().get_name()] = ss.str(); m_variable_name_map[tv->get_tensor().get_name()] = ss.str();
m_tensor_roles[tv->get_tensor().get_name()] = CPUTensorRole::OUTPUT; m_tensor_roles[tv->get_tensor().get_name()] = CPUTensorRole::OUTPUT;
// it should be safe to assign both descriptors to one output* //keep assigning different outputs to a result descriptor
// since needs_copy == false makes `op::Result` an nop //op::Result emitter will check if in and out descriptors are the same
//and skip a copy
auto res = std::dynamic_pointer_cast<ngraph::op::Result>(op); auto res = std::dynamic_pointer_cast<ngraph::op::Result>(op);
if (!res->needs_copy()) auto input_node = res->get_inputs().at(0).get_output().get_node();
if (!input_node->is_constant() && !input_node->is_parameter())
{ {
shared_ptr<descriptor::TensorView> itv = shared_ptr<descriptor::TensorView> itv =
res->get_inputs().at(0).get_output().get_tensor_view(); res->get_inputs().at(0).get_output().get_tensor_view();
auto output_name = ss.str(); auto output_name = ss.str();
m_variable_name_map[itv->get_tensor().get_name()] = ss.str(); m_variable_name_map[itv->get_tensor().get_name()] = ss.str();
m_tensor_roles[itv->get_tensor().get_name()] = CPUTensorRole::OUTPUT; m_tensor_roles[itv->get_tensor().get_name()] = CPUTensorRole::OUTPUT;
...@@ -1142,7 +1141,6 @@ void runtime::cpu::CPU_ExternalFunction::build() ...@@ -1142,7 +1141,6 @@ void runtime::cpu::CPU_ExternalFunction::build()
pass_manager.register_pass<runtime::cpu::pass::CPUAssignment>(this); pass_manager.register_pass<runtime::cpu::pass::CPUAssignment>(this);
pass_manager.register_pass<runtime::cpu::pass::CPULayout>(this); pass_manager.register_pass<runtime::cpu::pass::CPULayout>(this);
pass_manager.register_pass<runtime::cpu::pass::CPUPostLayoutOptimizations>(); pass_manager.register_pass<runtime::cpu::pass::CPUPostLayoutOptimizations>();
pass_manager.register_pass<ngraph::pass::ResultCopyElimination>();
pass_manager.register_pass<ngraph::pass::GetOutputElementElimination>(); pass_manager.register_pass<ngraph::pass::GetOutputElementElimination>();
pass_manager.register_pass<ngraph::pass::Liveness>(); pass_manager.register_pass<ngraph::pass::Liveness>();
pass_manager.register_pass<ngraph::pass::MemoryLayout>(size_t(s_memory_pool_alignment), true); pass_manager.register_pass<ngraph::pass::MemoryLayout>(size_t(s_memory_pool_alignment), true);
...@@ -1238,8 +1236,12 @@ void runtime::cpu::CPU_ExternalFunction::build() ...@@ -1238,8 +1236,12 @@ void runtime::cpu::CPU_ExternalFunction::build()
function_output_index.emplace_back(tensor_data[tv->get_tensor().get_name()], i); function_output_index.emplace_back(tensor_data[tv->get_tensor().get_name()], i);
m_tensor_roles[tv->get_tensor().get_name()] = CPUTensorRole::OUTPUT; m_tensor_roles[tv->get_tensor().get_name()] = CPUTensorRole::OUTPUT;
//keep assigning different outputs to a result descriptor
//op::Result emitter will check if in and out descriptors are the same
//and skip a copy
auto res = std::dynamic_pointer_cast<ngraph::op::Result>(op); auto res = std::dynamic_pointer_cast<ngraph::op::Result>(op);
if (!res->needs_copy()) auto input_node = res->get_inputs().at(0).get_output().get_node();
if (!input_node->is_constant() && !input_node->is_parameter())
{ {
shared_ptr<descriptor::TensorView> itv = shared_ptr<descriptor::TensorView> itv =
res->get_inputs().at(0).get_output().get_tensor_view(); res->get_inputs().at(0).get_output().get_tensor_view();
...@@ -1276,6 +1278,7 @@ void runtime::cpu::CPU_ExternalFunction::build() ...@@ -1276,6 +1278,7 @@ void runtime::cpu::CPU_ExternalFunction::build()
} }
vector<TensorViewWrapper> out; vector<TensorViewWrapper> out;
vector<string> out_names; vector<string> out_names;
for (const descriptor::Output& output : node->get_outputs()) for (const descriptor::Output& output : node->get_outputs())
{ {
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view(); shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
......
...@@ -788,6 +788,12 @@ namespace ngraph ...@@ -788,6 +788,12 @@ namespace ngraph
template <> template <>
void GPU_Emitter::EMITTER_DECL(ngraph::op::Result) void GPU_Emitter::EMITTER_DECL(ngraph::op::Result)
{ {
if (args[0].get_name() == out[0].get_name())
{
writer << "// Skipping generation for " << node->get_name() << "\n";
return;
}
writer.block_begin(); writer.block_begin();
kernel::emit_memcpyDtD(writer, out[0], args[0]); kernel::emit_memcpyDtD(writer, out[0], args[0]);
writer.block_end(); writer.block_end();
......
...@@ -533,10 +533,12 @@ void runtime::gpu::GPU_ExternalFunction::emit_functions() ...@@ -533,10 +533,12 @@ void runtime::gpu::GPU_ExternalFunction::emit_functions()
ss << "((" << type << "*)(outputs[" << i << "]))"; ss << "((" << type << "*)(outputs[" << i << "]))";
m_variable_name_map[tv->get_tensor().get_name()] = ss.str(); m_variable_name_map[tv->get_tensor().get_name()] = ss.str();
// it should be safe to assign both descriptors to one output*
// since needs_copy == false makes `op::Result` an nop
auto res = dynamic_pointer_cast<ngraph::op::Result>(op); auto res = dynamic_pointer_cast<ngraph::op::Result>(op);
if (!res->needs_copy()) //keep assigning different outputs to a result descriptor
//op::Result emitter will check if in and out descriptors are the same
//and skip a copy
auto input_node = res->get_inputs().at(0).get_output().get_node();
if (!input_node->is_constant() && !input_node->is_parameter())
{ {
shared_ptr<descriptor::TensorView> itv = shared_ptr<descriptor::TensorView> itv =
res->get_inputs().at(0).get_output().get_tensor_view(); res->get_inputs().at(0).get_output().get_tensor_view();
...@@ -644,8 +646,6 @@ void runtime::gpu::GPU_ExternalFunction::compile() ...@@ -644,8 +646,6 @@ void runtime::gpu::GPU_ExternalFunction::compile()
m_shared_context->m_primitive_emitter->get_memory_allocator()); m_shared_context->m_primitive_emitter->get_memory_allocator());
m_pass_manager.register_pass<ngraph::pass::LikeReplacement>(); m_pass_manager.register_pass<ngraph::pass::LikeReplacement>();
m_pass_manager.register_pass<ngraph::pass::ResultCopyElimination>();
m_pass_manager m_pass_manager
.register_pass<ngraph::pass::AssignLayout<descriptor::layout::DenseTensorViewLayout>>(); .register_pass<ngraph::pass::AssignLayout<descriptor::layout::DenseTensorViewLayout>>();
......
...@@ -31,7 +31,6 @@ ...@@ -31,7 +31,6 @@
#include "ngraph/pass/liveness.hpp" #include "ngraph/pass/liveness.hpp"
#include "ngraph/pass/manager.hpp" #include "ngraph/pass/manager.hpp"
#include "ngraph/pass/memory_layout.hpp" #include "ngraph/pass/memory_layout.hpp"
#include "ngraph/pass/result_copy_elimination.hpp"
#include "ngraph/runtime/gpu/gpu_backend.hpp" #include "ngraph/runtime/gpu/gpu_backend.hpp"
#include "ngraph/runtime/gpu/gpu_call_frame.hpp" #include "ngraph/runtime/gpu/gpu_call_frame.hpp"
#include "ngraph/runtime/gpu/gpu_primitive_emitter.hpp" #include "ngraph/runtime/gpu/gpu_primitive_emitter.hpp"
......
...@@ -1128,7 +1128,7 @@ NGRAPH_TEST(${BACKEND_NAME}, backwards_select_nested) ...@@ -1128,7 +1128,7 @@ NGRAPH_TEST(${BACKEND_NAME}, backwards_select_nested)
auto X0 = make_shared<op::Parameter>(element::boolean, shape); auto X0 = make_shared<op::Parameter>(element::boolean, shape);
auto X1 = make_shared<op::Parameter>(element::f32, shape); auto X1 = make_shared<op::Parameter>(element::f32, shape);
auto X2 = make_shared<op::Parameter>(element::f32, shape); auto X2 = make_shared<op::Parameter>(element::f32, shape);
return make_shared<Function>(make_shared<op::Select>(X0, X1 + X2, X2 - X1), return make_shared<Function>(make_shared<op::Select>(X0, X2 + X1, X2 - X1),
std::vector<std::shared_ptr<op::Parameter>>{X0, X1, X2}); std::vector<std::shared_ptr<op::Parameter>>{X0, X1, X2});
}; };
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment