Commit 626536ad authored by Nick Korovaiko, committed by Robert Kimball

Simplify result copy elimination (#1345)

* simplify result copy elimination

* gpu fix

* remove include header

* circumvent gpu issue

* add a whitespace
parent 09242c31
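At a glance: the commit deletes the ResultCopyElimination pass (and the needs_copy flag on op::Result that the pass set) and instead lets the CPU and GPU backends decide at emit time, by comparing the generated buffer names of a Result's input and output. The hunks below trim the CMake source list, strip the flag from op::Result, delete the pass's source and header, and update both backends' emitters and external functions; one test is also tweaked to circumvent a GPU issue. A minimal standalone sketch of the new check (simplified types; names hypothetical, not nGraph's API):

    #include <iostream>
    #include <string>

    // Stand-in for nGraph's TensorViewWrapper: only the generated buffer
    // name matters for this decision.
    struct TensorStub
    {
        std::string name;
        const std::string& get_name() const { return name; }
    };

    // After this commit, a Result emits a copy only when its input and
    // output refer to different buffers, i.e. their generated names differ.
    bool result_needs_copy(const TensorStub& arg, const TensorStub& out)
    {
        return arg.get_name() != out.get_name();
    }

    int main()
    {
        TensorStub aliased{"((float*)(outputs[0]))"}; // input renamed to the output buffer
        TensorStub out{"((float*)(outputs[0]))"};
        TensorStub other{"Add_7"};                    // a distinct intermediate buffer
        std::cout << result_needs_copy(aliased, out) << "\n"; // 0: in place, skip
        std::cout << result_needs_copy(other, out) << "\n";   // 1: emit a copy
    }

The flag and the pass become redundant because buffer aliasing already encodes the decision: if the external function mapped the input tensor to the output buffer, the names match and no copy is needed.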
@@ -128,7 +128,6 @@ set (SRC
 pass/nop_elimination.cpp
 pass/pass.cpp
 pass/reshape_elimination.cpp
-pass/result_copy_elimination.cpp
 pass/zero_dim_tensor_elimination.cpp
 pass/validate_graph.cpp
 pass/visualize_tree.cpp
......
@@ -47,7 +47,6 @@ shared_ptr<Node> op::Result::copy_with_new_args(const NodeVector& new_args) const
 auto res = make_shared<Result>(new_args.at(0));
 if (res)
 {
-    res->set_needs_copy(m_needs_copy);
     res->set_needs_default_layout(m_needs_default_layout);
 }
 return res;
......
@@ -38,8 +38,6 @@ namespace ngraph
 copy_with_new_args(const NodeVector& new_args) const override;
 virtual bool is_output() const override { return true; }
-void set_needs_copy(bool val) { m_needs_copy = val; }
-bool needs_copy() const { return m_needs_copy; }
 void set_needs_default_layout(bool val) { m_needs_default_layout = val; }
 bool needs_default_layout() const { return m_needs_default_layout; }
 protected:
@@ -47,7 +45,6 @@ namespace ngraph
 const NodeVector& deltas) override;
 private:
-bool m_needs_copy{true};
 bool m_needs_default_layout{false};
 };
 }
......
deleted file: src/ngraph/pass/result_copy_elimination.cpp (path inferred from the CMake entry above)
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "result_copy_elimination.hpp"
#include "ngraph/node.hpp"
#include "ngraph/op/parameter.hpp"
#include "ngraph/op/result.hpp"
#include "ngraph/util.hpp"
bool ngraph::pass::ResultCopyElimination::run_on_function(std::shared_ptr<ngraph::Function> f)
{
std::set<std::shared_ptr<Node>> seen;
for (auto res : f->get_results())
{
auto arg = res->get_argument(0);
// we need a copy
if (arg->is_parameter() || arg->is_constant())
{
continue;
}
// TODO: check if broadcast replace op::Result w/ a copy of broadcast node
// TODO: consider other cases where it's easier to recompute than make a copy
// we will compute the result directly into output[]
if (seen.count(arg) == 0)
{
res->set_needs_copy(false);
seen.insert(arg);
}
}
return true;
}
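Note the seen set in the deleted pass above: if two Results share the same argument, only the first could be marked as needing no copy, since each Result targets its own output buffer and the producing node can write in place into at most one of them. A self-contained sketch of that rule (strings stand in for node pointers; not nGraph code):

    #include <iostream>
    #include <set>
    #include <string>
    #include <vector>

    int main()
    {
        // Arguments of a function's Results; two Results share "Add_3".
        std::vector<std::string> result_args = {"Add_3", "Add_3", "Multiply_5"};
        std::set<std::string> seen;
        for (const auto& arg : result_args)
        {
            // insert().second is false when the argument was already seen:
            // a second Result over the same node still needs its own copy.
            bool in_place = seen.insert(arg).second;
            std::cout << arg << (in_place ? ": in place\n" : ": copy\n");
        }
    }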
deleted file: src/ngraph/pass/result_copy_elimination.hpp (path inferred from the includes removed below)
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include "ngraph/pass/pass.hpp"
namespace ngraph
{
namespace pass
{
class ResultCopyElimination;
}
}
class ngraph::pass::ResultCopyElimination : public ngraph::pass::FunctionPass
{
public:
ResultCopyElimination()
: FunctionPass()
{
}
virtual bool run_on_function(std::shared_ptr<ngraph::Function> f) override;
};
@@ -4231,10 +4231,9 @@ namespace ngraph
 template <>
 void CPU_Emitter::EMITTER_DECL(ngraph::op::Result)
 {
-const ngraph::op::Result* result = static_cast<const ngraph::op::Result*>(node);
-if (!result->needs_copy())
+if (args[0].get_name() == out[0].get_name())
 {
 writer << "// Skipping generation for " << node->get_name() << "\n";
 return;
 }
......
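The hunk above is truncated; after the early return, the emitter presumably falls through to the usual copy, as the GPU emitter further down shows explicitly. A hedged sketch of the resulting control flow (CodeWriter modeled as std::ostream; names hypothetical):

    #include <iostream>
    #include <string>

    // If the producing op already wrote into the output buffer (same
    // generated name), emit nothing; otherwise emit a copy of the input.
    void emit_result(std::ostream& writer, const std::string& node_name,
                     const std::string& arg, const std::string& out,
                     const std::string& byte_count)
    {
        if (arg == out)
        {
            writer << "// Skipping generation for " << node_name << "\n";
            return;
        }
        writer << "memcpy(" << out << ", " << arg << ", " << byte_count << ");\n";
    }

    int main()
    {
        emit_result(std::cout, "Result_9", "outputs_0", "outputs_0", "64"); // skipped
        emit_result(std::cout, "Result_9", "Add_7", "outputs_0", "64");     // copied
    }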
@@ -126,7 +126,6 @@
 #include "ngraph/pass/manager.hpp"
 #include "ngraph/pass/memory_layout.hpp"
 #include "ngraph/pass/nop_elimination.hpp"
-#include "ngraph/pass/result_copy_elimination.hpp"
 #include "ngraph/runtime/aligned_buffer.hpp"
 #include "ngraph/runtime/cpu/cpu_backend.hpp"
 #include "ngraph/runtime/cpu/cpu_builder.hpp"
@@ -389,7 +388,6 @@ void runtime::cpu::CPU_ExternalFunction::compile()
 pass_manager.register_pass<runtime::cpu::pass::CPUAssignment>(this);
 pass_manager.register_pass<runtime::cpu::pass::CPULayout>(this);
 pass_manager.register_pass<runtime::cpu::pass::CPUPostLayoutOptimizations>();
-pass_manager.register_pass<ngraph::pass::ResultCopyElimination>();
 pass_manager.register_pass<ngraph::pass::GetOutputElementElimination>();
 unordered_map<Node*, Node*> node_function_map;
 string common_function_string;
@@ -694,14 +692,15 @@ using namespace ngraph::runtime;
 m_variable_name_map[tv->get_tensor().get_name()] = ss.str();
 m_tensor_roles[tv->get_tensor().get_name()] = CPUTensorRole::OUTPUT;
-// it should be safe to assign both descriptors to one output*
-// since needs_copy == false makes `op::Result` an nop
+//keep assigning different outputs to a result descriptor
+//op::Result emitter will check if in and out descriptors are the same
+//and skip a copy
 auto res = std::dynamic_pointer_cast<ngraph::op::Result>(op);
-if (!res->needs_copy())
+auto input_node = res->get_inputs().at(0).get_output().get_node();
+if (!input_node->is_constant() && !input_node->is_parameter())
 {
 shared_ptr<descriptor::TensorView> itv =
 res->get_inputs().at(0).get_output().get_tensor_view();
 auto output_name = ss.str();
 m_variable_name_map[itv->get_tensor().get_name()] = ss.str();
 m_tensor_roles[itv->get_tensor().get_name()] = CPUTensorRole::OUTPUT;
@@ -1142,7 +1141,6 @@ void runtime::cpu::CPU_ExternalFunction::build()
 pass_manager.register_pass<runtime::cpu::pass::CPUAssignment>(this);
 pass_manager.register_pass<runtime::cpu::pass::CPULayout>(this);
 pass_manager.register_pass<runtime::cpu::pass::CPUPostLayoutOptimizations>();
-pass_manager.register_pass<ngraph::pass::ResultCopyElimination>();
 pass_manager.register_pass<ngraph::pass::GetOutputElementElimination>();
 pass_manager.register_pass<ngraph::pass::Liveness>();
 pass_manager.register_pass<ngraph::pass::MemoryLayout>(size_t(s_memory_pool_alignment), true);
@@ -1238,8 +1236,12 @@ void runtime::cpu::CPU_ExternalFunction::build()
 function_output_index.emplace_back(tensor_data[tv->get_tensor().get_name()], i);
 m_tensor_roles[tv->get_tensor().get_name()] = CPUTensorRole::OUTPUT;
+//keep assigning different outputs to a result descriptor
+//op::Result emitter will check if in and out descriptors are the same
+//and skip a copy
 auto res = std::dynamic_pointer_cast<ngraph::op::Result>(op);
-if (!res->needs_copy())
+auto input_node = res->get_inputs().at(0).get_output().get_node();
+if (!input_node->is_constant() && !input_node->is_parameter())
 {
 shared_ptr<descriptor::TensorView> itv =
 res->get_inputs().at(0).get_output().get_tensor_view();
@@ -1276,6 +1278,7 @@ void runtime::cpu::CPU_ExternalFunction::build()
 }
 vector<TensorViewWrapper> out;
+vector<string> out_names;
 for (const descriptor::Output& output : node->get_outputs())
 {
 shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
......
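The external-function hunks above are where the name equality is actually arranged: when a Result's input is neither a constant nor a parameter (whose buffers must not be clobbered), the input tensor's name is mapped to the same outputs[i] expression as the Result's own tensor, so the producer writes straight into the output buffer and the emitter's check fires. A simplified sketch of that renaming (a plain map stands in for m_variable_name_map; tensor names hypothetical):

    #include <iostream>
    #include <map>
    #include <string>

    int main()
    {
        std::map<std::string, std::string> variable_name_map;
        std::string output_expr = "((float*)(outputs[0]))";

        // The Result's own tensor always targets the output buffer.
        variable_name_map["Result_9"] = output_expr;

        // The Result's input (e.g. an Add node's tensor) is aliased to the
        // same buffer, unless it belongs to a parameter or constant.
        bool input_is_param_or_const = false;
        if (!input_is_param_or_const)
        {
            variable_name_map["Add_7"] = output_expr;
        }

        // The Result emitter later compares these names and skips the copy.
        std::cout << (variable_name_map["Add_7"] ==
                      variable_name_map["Result_9"]) << "\n"; // 1
    }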
@@ -788,6 +788,12 @@ namespace ngraph
 template <>
 void GPU_Emitter::EMITTER_DECL(ngraph::op::Result)
 {
+if (args[0].get_name() == out[0].get_name())
+{
+writer << "// Skipping generation for " << node->get_name() << "\n";
+return;
+}
 writer.block_begin();
 kernel::emit_memcpyDtD(writer, out[0], args[0]);
 writer.block_end();
......
@@ -533,10 +533,12 @@ void runtime::gpu::GPU_ExternalFunction::emit_functions()
 ss << "((" << type << "*)(outputs[" << i << "]))";
 m_variable_name_map[tv->get_tensor().get_name()] = ss.str();
-// it should be safe to assign both descriptors to one output*
-// since needs_copy == false makes `op::Result` an nop
 auto res = dynamic_pointer_cast<ngraph::op::Result>(op);
-if (!res->needs_copy())
+//keep assigning different outputs to a result descriptor
+//op::Result emitter will check if in and out descriptors are the same
+//and skip a copy
+auto input_node = res->get_inputs().at(0).get_output().get_node();
+if (!input_node->is_constant() && !input_node->is_parameter())
 {
 shared_ptr<descriptor::TensorView> itv =
 res->get_inputs().at(0).get_output().get_tensor_view();
@@ -644,8 +646,6 @@ void runtime::gpu::GPU_ExternalFunction::compile()
 m_shared_context->m_primitive_emitter->get_memory_allocator());
 m_pass_manager.register_pass<ngraph::pass::LikeReplacement>();
-m_pass_manager.register_pass<ngraph::pass::ResultCopyElimination>();
 m_pass_manager
 .register_pass<ngraph::pass::AssignLayout<descriptor::layout::DenseTensorViewLayout>>();
......
@@ -31,7 +31,6 @@
 #include "ngraph/pass/liveness.hpp"
 #include "ngraph/pass/manager.hpp"
 #include "ngraph/pass/memory_layout.hpp"
-#include "ngraph/pass/result_copy_elimination.hpp"
 #include "ngraph/runtime/gpu/gpu_backend.hpp"
 #include "ngraph/runtime/gpu/gpu_call_frame.hpp"
 #include "ngraph/runtime/gpu/gpu_primitive_emitter.hpp"
......
@@ -1128,7 +1128,7 @@ NGRAPH_TEST(${BACKEND_NAME}, backwards_select_nested)
 auto X0 = make_shared<op::Parameter>(element::boolean, shape);
 auto X1 = make_shared<op::Parameter>(element::f32, shape);
 auto X2 = make_shared<op::Parameter>(element::f32, shape);
-return make_shared<Function>(make_shared<op::Select>(X0, X1 + X2, X2 - X1),
+return make_shared<Function>(make_shared<op::Select>(X0, X2 + X1, X2 - X1),
 std::vector<std::shared_ptr<op::Parameter>>{X0, X1, X2});
 };
......