Commit 5d8687d0 authored by Chris Sullivan, committed by Robert Kimball

GPU logical reshape pass-through (#1534)

* Add a Reshape pass-through for reshapes that are logical only (no transpose).

* Move the address conditional to runtime for the case where the references differ but both pointer values are the same.

* Add an exclusion to the common function collection pass so that the tensor name check is applied during emission for ops that pass inputs through.

* Add propagate_in_place_input/propagate_in_place_output to propagate function inputs and outputs through the non-destructive ops in the graph (a sketch of the annotation mechanism these rely on follows the commit metadata below).

* Remove references to direct execution.

* Remove checks for in-place transpose.

* Formatting.
parent 9a924c17
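
The changes below all hinge on ngraph op annotations carrying non-destructive "output/input pairs". The following is a minimal sketch of that mechanism; the struct and method names are inferred from their usage in the hunks below, and the authoritative definitions live in ngraph (op_annotations.hpp / gpu_op_annotations.hpp), not here:

    // Minimal sketch of the annotation types this commit relies on (inferred,
    // not quoted from ngraph).
    #include <cstddef>
    #include <vector>

    struct oi_pair
    {
        size_t output;    // output index that may alias an input buffer
        size_t input;     // input index whose buffer can be reused
        bool destructive; // true if the kernel overwrites the input in place
    };

    class OpAnnotations
    {
    public:
        void add_in_place_oi_pair(const oi_pair& p) { m_in_place_oi_pairs.push_back(p); }
        const std::vector<oi_pair>& get_in_place_oi_pairs() const { return m_in_place_oi_pairs; }

    private:
        std::vector<oi_pair> m_in_place_oi_pairs;
    };

    // A logical reshape forwards input 0 to output 0 without touching the data,
    // which is exactly the {0, 0, false} pair added by the layout pass below.
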
@@ -46,15 +46,24 @@ bool pass::CommonFunctionCollection::run_on_module(vector<shared_ptr<Function>>&
     const string function_name = "__f__";
     for (const shared_ptr<Function>& current_function : functions)
     {
-        list<shared_ptr<Node>> op_list = current_function->get_ordered_ops();
-        for (const shared_ptr<Node>& op : op_list)
+        for (const shared_ptr<Node>& n : current_function->get_ordered_ops())
         {
-            if (op->is_constant() || op->is_parameter())
+            if (n->is_constant() || n->is_parameter())
             {
                 continue;
             }
+            if (auto op = std::dynamic_pointer_cast<op::Op>(n))
+            {
+                auto annotations = op->get_op_annotations();
+                // If an op is passed through, do not add it to the common function
+                // collection so that the emitter can decide to eliminate it if desired
+                if (annotations && annotations->get_in_place_oi_pairs().size() > 0)
+                {
+                    continue;
+                }
+            }
-            Node& node = *op;
+            Node& node = *n;
             // First emit the op as a function, something like this:
             // static void __f__(float* _arg0, float *_out1)
...
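
The exclusion above exists because the common function collection pass deduplicates op emissions after masking tensor names, while a pass-through op's emission depends on the concrete names (it emits nothing when input and output alias). A hypothetical sketch of the conflict, with mask_names and the map invented for illustration rather than taken from the pass:

    #include <regex>
    #include <string>
    #include <unordered_map>

    std::string mask_names(const std::string& body)
    {
        // "_arg0" / "_out3" style tensor names collapse to a placeholder so
        // that textually identical kernels can share one __f__ function.
        return std::regex_replace(body, std::regex("_(arg|out)\\d+"), "_t");
    }

    // masked emitted body -> shared common-function name
    std::unordered_map<std::string, std::string> common_bodies;

    // A pass-through reshape emits either nothing (buffers alias) or a memcpy
    // (buffers differ). Masking the names erases exactly that distinction, so
    // annotated ops skip the collection and are emitted per instance.
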
@@ -526,17 +526,23 @@ namespace ngraph
                return;
            }
            auto reshape = static_cast<const op::Reshape*>(node);
+           if (out[0].get_name() == args[0].get_name())
+           {
+               writer << "// Logical reshape eliminated\n";
+               return;
+           }
            writer.block_begin();
            auto arg_shape = args[0].get_shape();
            auto arg_rank = arg_shape.size();
            auto result_shape = out[0].get_shape();
            auto input_order = reshape->get_input_order();
            bool same_layout = is_sorted(input_order.begin(), input_order.end());
            size_t result_shape_product = shape_size(result_shape);
            // If there is no layout change or we are just going from 1^n to 1^m or a
            // zero-size tensor, we can just copy.
-           if (same_layout || result_shape_product < 2)
+           if (!reshape->get_is_transpose() || result_shape_product < 2)
            {
                kernel::emit_memcpyDtD(writer, out[0], args[0]);
            }
...
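
For reference, get_is_transpose() distinguishes reshapes that permute data from those that merely reinterpret the shape. A small sketch of the two cases, assuming ngraph's classic Reshape(arg, input_order, output_shape) constructor and that era's header paths:

    #include "ngraph/op/parameter.hpp"
    #include "ngraph/op/reshape.hpp"

    using namespace ngraph;

    void reshape_cases()
    {
        auto A = std::make_shared<op::Parameter>(element::f32, Shape{2, 3});

        // Logical reshape: axis order {0, 1} is unchanged, only the shape
        // differs. get_is_transpose() is false, so the buffer can be forwarded.
        auto flat = std::make_shared<op::Reshape>(A, AxisVector{0, 1}, Shape{6});

        // Transpose: axis order {1, 0} moves elements, so a kernel (or a copy
        // with permutation) is still required.
        auto perm = std::make_shared<op::Reshape>(A, AxisVector{1, 0}, Shape{3, 2});
    }
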
@@ -517,6 +517,7 @@ void runtime::gpu::GPU_ExternalFunction::emit_functions()
            stringstream ss;
            ss << "((" << type << "*)(inputs[" << arg_index << "]))";
            m_variable_name_map[tv->get_name()] = ss.str();
+           propagate_in_place_input(&param->get_outputs().at(i), ss.str());
            arg_index++;
        }
    }
...
@@ -540,7 +541,9 @@ void runtime::gpu::GPU_ExternalFunction::emit_functions()
        {
            shared_ptr<descriptor::Tensor> itv =
                res->get_inputs().at(0).get_output().get_tensor_ptr();
-           m_variable_name_map[itv->get_name()] = ss.str();
+           auto output_name = ss.str();
+           m_variable_name_map[itv->get_name()] = output_name;
+           propagate_in_place_output(&(res->get_inputs().at(0).get_output()), output_name);
        }
    }
...
@@ -850,3 +853,85 @@ string runtime::gpu::GPU_ExternalFunction::strip_comments(const string& s) const
    }
    return out.str();
}
+
+void runtime::gpu::GPU_ExternalFunction::propagate_in_place_input(
+    ngraph::descriptor::Output* output, std::string input_name)
+{
+    std::deque<ngraph::descriptor::Output*> stack;
+    stack.push_front(output);
+
+    while (stack.size() > 0)
+    {
+        ngraph::descriptor::Output* it = stack.front();
+        stack.pop_front();
+        for (auto input : it->get_inputs())
+        {
+            auto c_op = std::dynamic_pointer_cast<ngraph::op::Op>(input->get_node());
+            if (!c_op || c_op->is_output())
+            {
+                continue;
+            }
+            if (auto op_annotations = c_op->get_op_annotations())
+            {
+                for (auto oi_pair : op_annotations->get_in_place_oi_pairs())
+                {
+                    if (oi_pair.input == input->get_index() && !oi_pair.destructive)
+                    {
+                        size_t output_index = oi_pair.output;
+                        auto& output_tensor = c_op->get_outputs().at(output_index).get_tensor();
+                        m_variable_name_map[output_tensor.get_name()] = input_name;
+
+                        NGRAPH_DEBUG << "GPU codegen: Forwarding " << input_name << " through "
+                                     << output_tensor.get_name();
+                        stack.push_back(&c_op->get_outputs().at(output_index));
+                    }
+                }
+            }
+        }
+    }
+}
+
+void runtime::gpu::GPU_ExternalFunction::propagate_in_place_output(
+    ngraph::descriptor::Output* res_src_output, std::string output_name)
+{
+    // we start with a particular output
+    // which is an argument to a given op::Result
+    size_t offset = res_src_output->get_tensor().get_pool_offset();
+    auto it = res_src_output;
+
+    bool propagate_further = false;
+    do
+    {
+        propagate_further = false;
+        auto arg = std::dynamic_pointer_cast<ngraph::op::Op>(it->get_node());
+        if (!arg)
+        {
+            break;
+        }
+        if (auto op_annotations = arg->get_op_annotations())
+        {
+            for (auto oi_pair : op_annotations->get_in_place_oi_pairs())
+            {
+                if (oi_pair.output == it->get_index())
+                {
+                    size_t input_index = oi_pair.input;
+                    auto& input_tensor = arg->get_inputs().at(input_index).get_tensor();
+                    auto tmp_node = arg->get_inputs().at(input_index).get_output().get_node();
+                    if (input_tensor.get_pool_offset() == offset && !tmp_node->is_parameter() &&
+                        !tmp_node->is_constant())
+                    {
+                        NGRAPH_DEBUG << "Reusing " << output_name << " for "
+                                     << input_tensor.get_name();
+                        m_variable_name_map[input_tensor.get_name()] = output_name;
+                        it = &arg->get_inputs().at(input_index).get_output();
+                        propagate_further = true;
+                    }
+                }
+            }
+        }
+    } while (propagate_further);
+}
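
propagate_in_place_input walks forward from a function parameter with a worklist, aliasing every non-destructive pass-through output to the caller's buffer; propagate_in_place_output walks backward from a result, additionally requiring matching pool offsets and stopping at parameters and constants so an input or constant tensor is never silently renamed to an output buffer. A self-contained model of the forward walk, with deliberately simplified stand-in types (single output per node; ngraph's descriptor::Output/Input carry much more than this):

    #include <cstddef>
    #include <deque>
    #include <map>
    #include <string>
    #include <vector>

    struct OiPair { std::size_t output, input; bool destructive; };

    struct Use { std::size_t node, input_index; };   // a downstream consumer

    struct ModelNode
    {
        std::string output_tensor_name;
        std::vector<OiPair> oi_pairs;                // in-place annotations
        std::vector<Use> users;                      // who reads our output
    };

    // Alias the caller-provided buffer through every non-destructive
    // pass-through op reachable from `start`.
    void propagate_in_place_input_model(std::vector<ModelNode>& graph, std::size_t start,
                                        const std::string& buffer_name,
                                        std::map<std::string, std::string>& name_map)
    {
        std::deque<std::size_t> stack{start};
        while (!stack.empty())
        {
            std::size_t cur = stack.front();
            stack.pop_front();
            for (const Use& use : graph[cur].users)
            {
                for (const OiPair& p : graph[use.node].oi_pairs)
                {
                    if (p.input == use.input_index && !p.destructive)
                    {
                        // The consumer's output tensor now resolves to the same
                        // buffer as the function input, and the walk continues.
                        name_map[graph[use.node].output_tensor_name] = buffer_name;
                        stack.push_back(use.node);
                    }
                }
            }
        }
    }
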
@@ -81,6 +81,14 @@ namespace ngraph
            EntryPoint m_compiled_function;

        private:
+           // For non-destructive passthrough kernels, propagate function
+           // input buffers to internal ops
+           void propagate_in_place_input(ngraph::descriptor::Output* output,
+                                         std::string input_name);
+           // For in-place kernels, propagate function output buffers to
+           // internal ops
+           void propagate_in_place_output(ngraph::descriptor::Output* res_src_output,
+                                          std::string output_name);
            void emit_header();
            void emit_timer_functions();
            void emit_constant_declarations();
...
@@ -22,6 +22,7 @@
#include "gpu_layout.hpp"
#include "ngraph/op/replace_slice.hpp"
#include "ngraph/op/reshape.hpp"
+#include "ngraph/runtime/gpu/gpu_op_annotations.hpp"

using namespace std;
@@ -54,6 +55,30 @@ namespace ngraph
                rep_slice->set_op_annotations(op_annotations);
            }
        }
+
+       template <>
+       void GPULayout::LAYOUT_DECL(ngraph::op::Reshape)
+       {
+           auto reshape = static_cast<ngraph::op::Reshape*>(node.get());
+           if (reshape->get_is_transpose())
+           {
+               return;
+           }
+           // Shape change only, tensor in native layout can be
+           // forwarded to output
+           auto op_annotations = reshape->get_op_annotations();
+           if (op_annotations)
+           {
+               // pass-through
+               op_annotations->add_in_place_oi_pair({0, 0, false});
+           }
+           else
+           {
+               op_annotations = std::make_shared<ngraph::runtime::gpu::GPUOpAnnotations>();
+               // pass-through
+               op_annotations->add_in_place_oi_pair({0, 0, false});
+               reshape->set_op_annotations(op_annotations);
+           }
+       }
    }
}
}
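
Taken together: the layout pass tags a non-transposing Reshape with {0, 0, false}, buffer-name propagation maps the reshape's output tensor to its input's buffer, and the emitter's name check then turns the op into a no-op. A hypothetical, hand-written analogue of the generated code; buffer size and stream handling are invented, and the exact text produced by kernel::emit_memcpyDtD is not shown in this diff:

    #include <cuda_runtime.h>

    // Before this commit: a logical reshape still cost a device-to-device copy.
    void reshape_before(float* out, const float* in, size_t n, cudaStream_t stream)
    {
        cudaMemcpyAsync(out, in, n * sizeof(float), cudaMemcpyDeviceToDevice, stream);
    }

    // After this commit: the output tensor name resolves to the same buffer
    // expression as the input, so the emitter writes only a marker and no code
    // runs at all:
    //     // Logical reshape eliminated
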
@@ -64,6 +89,7 @@ namespace ngraph
static const runtime::gpu::pass::LayoutOpMap s_dispatcher{
    {TI(ngraph::op::ReplaceSlice),
     &runtime::gpu::pass::GPULayout::layout<ngraph::op::ReplaceSlice>},
+   {TI(ngraph::op::Reshape), &runtime::gpu::pass::GPULayout::layout<ngraph::op::Reshape>},
};

bool runtime::gpu::pass::GPULayout::run_on_call_graph(const std::list<std::shared_ptr<Node>>& nodes)
...