Commit 5d8687d0 authored by Chris Sullivan, committed by Robert Kimball

GPU logical reshape pass-through (#1534)

* Add a Reshape pass-through for reshapes that are logical only (no transpose).

* Move the address conditional to runtime for the case where the references differ but both pointer values are the same.

* Add an exclusion to the common function collection pass so that the tensor name check is applied during emission for ops that pass inputs through.

* Add propagate_in_place_input/propagate_in_place_output to propagate function inputs and outputs through the non-destructive ops in the graph (a sketch of the annotation mechanism these rely on follows the commit metadata below).

* Remove references to direct execution.

* Remove checks for in-place transpose.

* Formatting.
parent 9a924c17
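
The changes below all hinge on ngraph op annotations carrying non-destructive "output/input pairs". The following is a minimal sketch of that mechanism; the struct and method names are inferred from their usage in the hunks below, and the authoritative definitions live in ngraph (op_annotations.hpp / gpu_op_annotations.hpp), not here:

    // Minimal sketch of the annotation types this commit relies on (inferred,
    // not quoted from ngraph).
    #include <cstddef>
    #include <vector>

    struct oi_pair
    {
        size_t output;    // output index that may alias an input buffer
        size_t input;     // input index whose buffer can be reused
        bool destructive; // true if the kernel overwrites the input in place
    };

    class OpAnnotations
    {
    public:
        void add_in_place_oi_pair(const oi_pair& p) { m_in_place_oi_pairs.push_back(p); }
        const std::vector<oi_pair>& get_in_place_oi_pairs() const { return m_in_place_oi_pairs; }

    private:
        std::vector<oi_pair> m_in_place_oi_pairs;
    };

    // A logical reshape forwards input 0 to output 0 without touching the data,
    // which is exactly the {0, 0, false} pair added by the layout pass below.
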
@@ -46,15 +46,24 @@ bool pass::CommonFunctionCollection::run_on_module(vector<shared_ptr<Function>>&
     const string function_name = "__f__";
     for (const shared_ptr<Function>& current_function : functions)
     {
-        list<shared_ptr<Node>> op_list = current_function->get_ordered_ops();
-        for (const shared_ptr<Node>& op : op_list)
+        for (const shared_ptr<Node>& n : current_function->get_ordered_ops())
         {
-            if (op->is_constant() || op->is_parameter())
+            if (n->is_constant() || n->is_parameter())
             {
                 continue;
             }
+            if (auto op = std::dynamic_pointer_cast<op::Op>(n))
+            {
+                auto annotations = op->get_op_annotations();
+                // If an op is passed through, do not add it to the common function
+                // collection so that the emitter can decide to eliminate it if desired
+                if (annotations && annotations->get_in_place_oi_pairs().size() > 0)
+                {
+                    continue;
+                }
+            }
-            Node& node = *op;
+            Node& node = *n;
             // First emit the op as a function, something like this:
             // static void __f__(float* _arg0, float *_out1)
...
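
The exclusion above exists because the common function collection pass deduplicates op emissions after masking tensor names, while a pass-through op's emission depends on the concrete names (it emits nothing when input and output alias). A hypothetical sketch of the conflict, with mask_names and the map invented for illustration rather than taken from the pass:

    #include <regex>
    #include <string>
    #include <unordered_map>

    std::string mask_names(const std::string& body)
    {
        // "_arg0" / "_out3" style tensor names collapse to a placeholder so
        // that textually identical kernels can share one __f__ function.
        return std::regex_replace(body, std::regex("_(arg|out)\\d+"), "_t");
    }

    // masked emitted body -> shared common-function name
    std::unordered_map<std::string, std::string> common_bodies;

    // A pass-through reshape emits either nothing (buffers alias) or a memcpy
    // (buffers differ). Masking the names erases exactly that distinction, so
    // annotated ops skip the collection and are emitted per instance.
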
@@ -526,17 +526,23 @@ namespace ngraph
                return;
            }
            auto reshape = static_cast<const op::Reshape*>(node);
+           if (out[0].get_name() == args[0].get_name())
+           {
+               writer << "// Logical reshape eliminated\n";
+               return;
+           }
            writer.block_begin();
            auto arg_shape = args[0].get_shape();
            auto arg_rank = arg_shape.size();
            auto result_shape = out[0].get_shape();
            auto input_order = reshape->get_input_order();
            bool same_layout = is_sorted(input_order.begin(), input_order.end());
            size_t result_shape_product = shape_size(result_shape);
            // If there is no layout change or we are just going from 1^n to 1^m or a
            // zero-size tensor, we can just copy.
-           if (same_layout || result_shape_product < 2)
+           if (!reshape->get_is_transpose() || result_shape_product < 2)
            {
                kernel::emit_memcpyDtD(writer, out[0], args[0]);
            }
...
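
For reference, get_is_transpose() distinguishes reshapes that permute data from those that merely reinterpret the shape. A small sketch of the two cases, assuming ngraph's classic Reshape(arg, input_order, output_shape) constructor and that era's header paths:

    #include "ngraph/op/parameter.hpp"
    #include "ngraph/op/reshape.hpp"

    using namespace ngraph;

    void reshape_cases()
    {
        auto A = std::make_shared<op::Parameter>(element::f32, Shape{2, 3});

        // Logical reshape: axis order {0, 1} is unchanged, only the shape
        // differs. get_is_transpose() is false, so the buffer can be forwarded.
        auto flat = std::make_shared<op::Reshape>(A, AxisVector{0, 1}, Shape{6});

        // Transpose: axis order {1, 0} moves elements, so a kernel (or a copy
        // with permutation) is still required.
        auto perm = std::make_shared<op::Reshape>(A, AxisVector{1, 0}, Shape{3, 2});
    }
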
@@ -517,6 +517,7 @@ void runtime::gpu::GPU_ExternalFunction::emit_functions()
            stringstream ss;
            ss << "((" << type << "*)(inputs[" << arg_index << "]))";
            m_variable_name_map[tv->get_name()] = ss.str();
+           propagate_in_place_input(&param->get_outputs().at(i), ss.str());
            arg_index++;
        }
    }
...
@@ -540,7 +541,9 @@ void runtime::gpu::GPU_ExternalFunction::emit_functions()
        {
            shared_ptr<descriptor::Tensor> itv =
                res->get_inputs().at(0).get_output().get_tensor_ptr();
-           m_variable_name_map[itv->get_name()] = ss.str();
+           auto output_name = ss.str();
+           m_variable_name_map[itv->get_name()] = output_name;
+           propagate_in_place_output(&(res->get_inputs().at(0).get_output()), output_name);
        }
    }
...
@@ -850,3 +853,85 @@ string runtime::gpu::GPU_ExternalFunction::strip_comments(const string& s) const
    }
    return out.str();
}
+
+void runtime::gpu::GPU_ExternalFunction::propagate_in_place_input(
+    ngraph::descriptor::Output* output, std::string input_name)
+{
+    std::deque<ngraph::descriptor::Output*> stack;
+    stack.push_front(output);
+
+    while (stack.size() > 0)
+    {
+        ngraph::descriptor::Output* it = stack.front();
+        stack.pop_front();
+        for (auto input : it->get_inputs())
+        {
+            auto c_op = std::dynamic_pointer_cast<ngraph::op::Op>(input->get_node());
+            if (!c_op || c_op->is_output())
+            {
+                continue;
+            }
+            if (auto op_annotations = c_op->get_op_annotations())
+            {
+                for (auto oi_pair : op_annotations->get_in_place_oi_pairs())
+                {
+                    if (oi_pair.input == input->get_index() && !oi_pair.destructive)
+                    {
+                        size_t output_index = oi_pair.output;
+                        auto& output_tensor = c_op->get_outputs().at(output_index).get_tensor();
+                        m_variable_name_map[output_tensor.get_name()] = input_name;
+
+                        NGRAPH_DEBUG << "GPU codegen: Forwarding " << input_name << " through "
+                                     << output_tensor.get_name();
+                        stack.push_back(&c_op->get_outputs().at(output_index));
+                    }
+                }
+            }
+        }
+    }
+}
+
+void runtime::gpu::GPU_ExternalFunction::propagate_in_place_output(
+    ngraph::descriptor::Output* res_src_output, std::string output_name)
+{
+    // we start with a particular output
+    // which is an argument to a given op::Result
+    size_t offset = res_src_output->get_tensor().get_pool_offset();
+    auto it = res_src_output;
+
+    bool propagate_further = false;
+    do
+    {
+        propagate_further = false;
+        auto arg = std::dynamic_pointer_cast<ngraph::op::Op>(it->get_node());
+        if (!arg)
+        {
+            break;
+        }
+        if (auto op_annotations = arg->get_op_annotations())
+        {
+            for (auto oi_pair : op_annotations->get_in_place_oi_pairs())
+            {
+                if (oi_pair.output == it->get_index())
+                {
+                    size_t input_index = oi_pair.input;
+                    auto& input_tensor = arg->get_inputs().at(input_index).get_tensor();
+                    auto tmp_node = arg->get_inputs().at(input_index).get_output().get_node();
+                    if (input_tensor.get_pool_offset() == offset && !tmp_node->is_parameter() &&
+                        !tmp_node->is_constant())
+                    {
+                        NGRAPH_DEBUG << "Reusing " << output_name << " for "
+                                     << input_tensor.get_name();
+                        m_variable_name_map[input_tensor.get_name()] = output_name;
+                        it = &arg->get_inputs().at(input_index).get_output();
+                        propagate_further = true;
+                    }
+                }
+            }
+        }
+    } while (propagate_further);
+}
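
propagate_in_place_input walks forward from a function parameter with a worklist, aliasing every non-destructive pass-through output to the caller's buffer; propagate_in_place_output walks backward from a result, additionally requiring matching pool offsets and stopping at parameters and constants so an input or constant tensor is never silently renamed to an output buffer. A self-contained model of the forward walk, with deliberately simplified stand-in types (single output per node; ngraph's descriptor::Output/Input carry much more than this):

    #include <cstddef>
    #include <deque>
    #include <map>
    #include <string>
    #include <vector>

    struct OiPair { std::size_t output, input; bool destructive; };

    struct Use { std::size_t node, input_index; };   // a downstream consumer

    struct ModelNode
    {
        std::string output_tensor_name;
        std::vector<OiPair> oi_pairs;                // in-place annotations
        std::vector<Use> users;                      // who reads our output
    };

    // Alias the caller-provided buffer through every non-destructive
    // pass-through op reachable from `start`.
    void propagate_in_place_input_model(std::vector<ModelNode>& graph, std::size_t start,
                                        const std::string& buffer_name,
                                        std::map<std::string, std::string>& name_map)
    {
        std::deque<std::size_t> stack{start};
        while (!stack.empty())
        {
            std::size_t cur = stack.front();
            stack.pop_front();
            for (const Use& use : graph[cur].users)
            {
                for (const OiPair& p : graph[use.node].oi_pairs)
                {
                    if (p.input == use.input_index && !p.destructive)
                    {
                        // The consumer's output tensor now resolves to the same
                        // buffer as the function input, and the walk continues.
                        name_map[graph[use.node].output_tensor_name] = buffer_name;
                        stack.push_back(use.node);
                    }
                }
            }
        }
    }
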
@@ -81,6 +81,14 @@ namespace ngraph
            EntryPoint m_compiled_function;

        private:
+           // For non-destructive passthrough kernels, propagate function
+           // input buffers to internal ops
+           void propagate_in_place_input(ngraph::descriptor::Output* output,
+                                         std::string input_name);
+           // For in-place kernels, propagate function output buffers to
+           // internal ops
+           void propagate_in_place_output(ngraph::descriptor::Output* res_src_output,
+                                          std::string output_name);
            void emit_header();
            void emit_timer_functions();
            void emit_constant_declarations();
...
@@ -22,6 +22,7 @@
#include "gpu_layout.hpp"
#include "ngraph/op/replace_slice.hpp"
#include "ngraph/op/reshape.hpp"
+#include "ngraph/runtime/gpu/gpu_op_annotations.hpp"

using namespace std;
@@ -54,6 +55,30 @@ namespace ngraph
                rep_slice->set_op_annotations(op_annotations);
            }
        }
+
+       template <>
+       void GPULayout::LAYOUT_DECL(ngraph::op::Reshape)
+       {
+           auto reshape = static_cast<ngraph::op::Reshape*>(node.get());
+           if (reshape->get_is_transpose())
+           {
+               return;
+           }
+           // Shape change only, tensor in native layout can be
+           // forwarded to output
+           auto op_annotations = reshape->get_op_annotations();
+           if (op_annotations)
+           {
+               // pass-through
+               op_annotations->add_in_place_oi_pair({0, 0, false});
+           }
+           else
+           {
+               op_annotations = std::make_shared<ngraph::runtime::gpu::GPUOpAnnotations>();
+               // pass-through
+               op_annotations->add_in_place_oi_pair({0, 0, false});
+               reshape->set_op_annotations(op_annotations);
+           }
+       }
    }
}
}
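
Taken together: the layout pass tags a non-transposing Reshape with {0, 0, false}, buffer-name propagation maps the reshape's output tensor to its input's buffer, and the emitter's name check then turns the op into a no-op. A hypothetical, hand-written analogue of the generated code; buffer size and stream handling are invented, and the exact text produced by kernel::emit_memcpyDtD is not shown in this diff:

    #include <cuda_runtime.h>

    // Before this commit: a logical reshape still cost a device-to-device copy.
    void reshape_before(float* out, const float* in, size_t n, cudaStream_t stream)
    {
        cudaMemcpyAsync(out, in, n * sizeof(float), cudaMemcpyDeviceToDevice, stream);
    }

    // After this commit: the output tensor name resolves to the same buffer
    // expression as the input, so the emitter writes only a marker and no code
    // runs at all:
    //     // Logical reshape eliminated
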
@@ -64,6 +89,7 @@ namespace ngraph
static const runtime::gpu::pass::LayoutOpMap s_dispatcher{
    {TI(ngraph::op::ReplaceSlice),
     &runtime::gpu::pass::GPULayout::layout<ngraph::op::ReplaceSlice>},
+   {TI(ngraph::op::Reshape), &runtime::gpu::pass::GPULayout::layout<ngraph::op::Reshape>},
};

bool runtime::gpu::pass::GPULayout::run_on_call_graph(const std::list<std::shared_ptr<Node>>& nodes)
...