Commit 260cb90d authored by Fenglei, committed by Scott Cyphers

gpu_external_function and gpu constant memory refactor (#1189)

* refactor external function

* working version

* fix bug

* add emit_functions, emit_declare_constants, emit_declare_functions

* add std::

* add functions declaration

* fix bugs

* fix bugs

* separate temp memory allocation and release

* add invoke_constant_ptr function, clean up outputs for function

* fix bugs, compiled ok

* add ctx to emit_declare_constant

* cleanup code, code style

* remove using std, code style

* revert std changes

* change function names based on Chris's comments

* add ResultCopyElimination to pass_manager

* clang format
parent 2c345798
...@@ -20,16 +20,9 @@ ...@@ -20,16 +20,9 @@
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <cudnn.h> #include <cudnn.h>
#include <fstream> #include <fstream>
#include <memory>
#include <string> #include <string>
#include <tuple> #include <tuple>
#include <typeindex>
#include <typeinfo>
#include <unordered_map>
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/codegen/compiler.hpp"
#include "ngraph/codegen/execution_engine.hpp"
#include "ngraph/descriptor/input.hpp" #include "ngraph/descriptor/input.hpp"
#include "ngraph/descriptor/layout/dense_tensor_view_layout.hpp" #include "ngraph/descriptor/layout/dense_tensor_view_layout.hpp"
#include "ngraph/descriptor/output.hpp" #include "ngraph/descriptor/output.hpp"
...@@ -104,13 +97,7 @@ ...@@ -104,13 +97,7 @@
#include "ngraph/op/sum.hpp" #include "ngraph/op/sum.hpp"
#include "ngraph/op/tan.hpp" #include "ngraph/op/tan.hpp"
#include "ngraph/op/tanh.hpp" #include "ngraph/op/tanh.hpp"
#include "ngraph/pass/assign_layout.hpp"
#include "ngraph/pass/dump_sorted.hpp"
#include "ngraph/pass/liveness.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/memory_layout.hpp"
#include "ngraph/runtime/gpu/gpu_backend.hpp" #include "ngraph/runtime/gpu/gpu_backend.hpp"
#include "ngraph/runtime/gpu/gpu_call_frame.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp" #include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp"
#include "ngraph/runtime/gpu/gpu_emitter.hpp" #include "ngraph/runtime/gpu/gpu_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_external_function.hpp" #include "ngraph/runtime/gpu/gpu_external_function.hpp"
...@@ -252,11 +239,12 @@ static const runtime::gpu::OpMap dispatcher{ ...@@ -252,11 +239,12 @@ static const runtime::gpu::OpMap dispatcher{
runtime::gpu::GPU_ExternalFunction::GPU_ExternalFunction( runtime::gpu::GPU_ExternalFunction::GPU_ExternalFunction(
const shared_ptr<ngraph::Function>& function, bool release_function) const shared_ptr<ngraph::Function>& function, bool release_function)
: m_compiled_function(nullptr) : m_compiled_function(nullptr)
, m_emit_timing(false) , m_ctx(new GPURuntimeContext)
, m_function(function) , m_function(function)
, m_release_function(release_function) , m_emit_timing(false)
, m_is_compiled(false) , m_is_compiled(false)
, m_ctx(new GPURuntimeContext) , m_release_function(release_function)
, m_temporaries_used(false)
{ {
// Create context use driver API and make it current, the runtime call will pickup the context // Create context use driver API and make it current, the runtime call will pickup the context
// http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html
...@@ -289,37 +277,10 @@ runtime::gpu::GPU_ExternalFunction::~GPU_ExternalFunction() ...@@ -289,37 +277,10 @@ runtime::gpu::GPU_ExternalFunction::~GPU_ExternalFunction()
delete m_ctx->compiled_kernel_pool; delete m_ctx->compiled_kernel_pool;
} }
void runtime::gpu::GPU_ExternalFunction::compile() void runtime::gpu::GPU_ExternalFunction::emit_header()
{ {
if (m_is_compiled) m_writer += R"(
{ // Generated by the nGraph GPU backend
return;
}
m_primitive_emitter.reset(new GPUPrimitiveEmitter());
string function_name = m_function->get_name();
string dump_filename = file_util::path_join(s_output_dir, function_name + "_ops.txt");
pass::Manager pass_manager;
// pass_manager.register_pass<pass::TopologicalSort>();
// For now, just make everyone row-major.
pass_manager.register_pass<pass::AssignLayout<descriptor::layout::DenseTensorViewLayout>>();
pass_manager.register_pass<pass::Liveness>();
pass_manager.register_pass<pass::MemoryLayout>(64);
pass_manager.register_pass<pass::DumpSorted>(dump_filename);
pass_manager.run_passes(m_function);
unordered_map<shared_ptr<Function>, list<shared_ptr<Node>>> function_ordered_ops;
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
{
function_ordered_ops.insert({current_function, current_function->get_ordered_ops()});
}
codegen::CodeWriter writer;
writer +=
R"(// Generated by the nGraph GPU backend
#include <cublas_v2.h> #include <cublas_v2.h>
#include <cuda.h> #include <cuda.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
...@@ -348,9 +309,9 @@ void runtime::gpu::GPU_ExternalFunction::compile() ...@@ -348,9 +309,9 @@ void runtime::gpu::GPU_ExternalFunction::compile()
#include "ngraph/util.hpp" #include "ngraph/util.hpp"
)"; )";
string pch_header_source = writer.get_code(); m_pch_header_source = m_writer.get_code();
writer += R"( m_writer += R"(
using namespace ngraph; using namespace ngraph;
using namespace ngraph::runtime; using namespace ngraph::runtime;
using namespace std; using namespace std;
...@@ -360,15 +321,19 @@ using namespace std; ...@@ -360,15 +321,19 @@ using namespace std;
// which is enabled because the JIT uses it as the default mechanism // which is enabled because the JIT uses it as the default mechanism
// to register cleanup handlers. We use it, and not atexit(), because // to register cleanup handlers. We use it, and not atexit(), because
// atexit() happens too late, when the JIT is no longer alive // atexit() happens too late, when the JIT is no longer alive
writer << "void *__dso_handle = 0;\n\n"; m_writer << "void *__dso_handle = 0;\n\n";
}
void runtime::gpu::GPU_ExternalFunction::emit_timer_functions()
{
if (m_emit_timing) if (m_emit_timing)
{ {
writer << "// Declare debug timers\n"; m_writer << "// Declare debug timers\n";
vector<string> names; vector<string> names;
size_t index = 0; size_t index = 0;
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions()) for (shared_ptr<Function> current_function : m_pass_manager.get_state().get_functions())
{ {
for (shared_ptr<Node> node : function_ordered_ops.at(current_function)) for (shared_ptr<Node> node : m_function_ordered_ops.at(current_function))
{ {
if (!node->is_parameter() && !node->is_constant()) if (!node->is_parameter() && !node->is_constant())
{ {
...@@ -377,77 +342,114 @@ using namespace std; ...@@ -377,77 +342,114 @@ using namespace std;
} }
} }
} }
writer << "ngraph::stopwatch timers[" << names.size() << "];\n"; m_writer << "ngraph::stopwatch timers[" << names.size() << "];\n";
writer << "extern \"C\" size_t get_debug_timer_count() { return " << names.size() m_writer << "extern \"C\" size_t get_debug_timer_count() { return " << names.size()
<< "; }\n"; << "; }\n";
writer << "extern \"C\" const char* get_debug_timer_name(size_t index)\n"; m_writer << "extern \"C\" const char* get_debug_timer_name(size_t index)\n";
writer.block_begin(); m_writer.block_begin();
writer << "static const char* timer_names[" << names.size() << "] =\n"; m_writer << "static const char* timer_names[" << names.size() << "] =\n";
writer.block_begin(); m_writer.block_begin();
vector<string> quoted_names; vector<string> quoted_names;
for (const string& name : names) for (const string& name : names)
{ {
quoted_names.push_back("\"" + name + "\""); quoted_names.push_back("\"" + name + "\"");
} }
writer << emit_string_array(quoted_names, 100 - (4 * 2 + 1)); m_writer << emit_string_array(quoted_names, 100 - (4 * 2 + 1));
writer.indent--; m_writer.indent--;
writer << "\n};\n"; m_writer << "\n};\n";
writer << "return timer_names[index];\n"; m_writer << "return timer_names[index];\n";
writer.block_end(); m_writer.block_end();
writer << "extern \"C\" const size_t get_debug_timer_microseconds(size_t index)\n"; m_writer << "extern \"C\" const size_t get_debug_timer_microseconds(size_t index)\n";
writer.block_begin(); m_writer.block_begin();
writer << "return (index < " << names.size() m_writer << "return (index < " << names.size()
<< " ? timers[index].get_total_microseconds() : 0);\n"; << " ? timers[index].get_total_microseconds() : 0);\n";
writer.block_end(); m_writer.block_end();
writer << "extern \"C\" const size_t get_debug_timer_call_count(size_t index)\n"; m_writer << "extern \"C\" const size_t get_debug_timer_call_count(size_t index)\n";
writer.block_begin(); m_writer.block_begin();
writer << "return (index < " << names.size() << " ? timers[index].get_call_count() : 0);\n"; m_writer << "return (index < " << names.size()
writer.block_end(); << " ? timers[index].get_call_count() : 0);\n";
writer << "\n"; m_writer.block_end();
m_writer << "\n";
} }
writer << "// Declare all constants\n"; }
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
void runtime::gpu::GPU_ExternalFunction::emit_constant_declarations()
{
m_writer << "// Declare all constants\n";
for (shared_ptr<Function> current_function : m_pass_manager.get_state().get_functions())
{ {
for (shared_ptr<Node> node : current_function->get_ordered_ops()) for (shared_ptr<Node> node : m_function_ordered_ops.at(current_function))
{ {
const op::Constant* c = dynamic_cast<ngraph::op::Constant*>(node.get()); const op::Constant* c = dynamic_cast<ngraph::op::Constant*>(node.get());
if (c) if (c)
{ {
shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view(); shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
auto c_value_strings = c->get_value_strings(); // get an allocator for transient per kernel gpu memory
writer << "static " << tv->get_tensor().get_element_type().c_type_string() << " *" GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
<< tv->get_tensor().get_name() << ";\n"; size_t idx = allocator.reserve_argspace(
writer << "static " << tv->get_tensor().get_element_type().c_type_string() << " " c->get_data_ptr(),
<< tv->get_tensor().get_name() << "_cpu[" << c_value_strings.size() tv->get_tensor().size() * tv->get_tensor().get_element_type().size());
<< "] =\n"; m_writer << "static size_t " << tv->get_tensor().get_name() << "_idx = " << idx
writer << "{\n"; << ";\n";
writer.indent++; m_writer << "static " << tv->get_tensor().get_element_type().c_type_string() << "* "
writer << emit_string_array(c_value_strings, 100 - writer.indent * 4); << tv->get_tensor().get_name() << " = nullptr;\n";
writer.indent--;
writer << "\n};\n\n";
m_variable_name_map[tv->get_tensor().get_name()] = tv->get_tensor().get_name(); m_variable_name_map[tv->get_tensor().get_name()] = tv->get_tensor().get_name();
} }
} }
} }
writer << "// Declare all functions\n"; m_writer << "\nstatic bool is_constant_mem_ptr_null = true;\n\n";
for (shared_ptr<Function> f : pass_manager.get_state().get_functions()) m_writer << "static void invoke_constant_mem_ptr(gpu::GPURuntimeContext* ctx)\n";
m_writer.block_begin();
{ {
writer << "extern \"C\" void " << f->get_name() << "(void** inputs, void** outputs, " m_writer << "if(is_constant_mem_ptr_null)\n";
<< "gpu::GPURuntimeContext* ctx);\n"; m_writer.block_begin();
{
for (shared_ptr<Function> current_function : m_pass_manager.get_state().get_functions())
{
for (shared_ptr<Node> node : m_function_ordered_ops.at(current_function))
{
const op::Constant* c = dynamic_cast<ngraph::op::Constant*>(node.get());
if (c)
{
shared_ptr<descriptor::TensorView> tv =
node->get_outputs()[0].get_tensor_view();
m_writer << tv->get_tensor().get_name() << " = reinterpret_cast<"
<< tv->get_tensor().get_element_type().c_type_string()
<< "*>(runtime::gpu::invoke_memory_primitive(ctx, "
<< tv->get_tensor().get_name() << "_idx));\n";
}
}
}
m_writer << "is_constant_mem_ptr_null = false;\n";
}
m_writer.block_end();
} }
writer << "\n"; m_writer.block_end();
}
void runtime::gpu::GPU_ExternalFunction::emit_function_declarations()
{
m_writer << "// Declare all functions\n";
for (shared_ptr<Function> f : m_pass_manager.get_state().get_functions())
{
m_writer << "extern \"C\" void " << f->get_name() << "(void** inputs, void** outputs, "
<< "gpu::GPURuntimeContext* ctx);\n";
}
m_writer << "\n";
}
void runtime::gpu::GPU_ExternalFunction::collect_unique_functions()
{
// This for loop creates a collection of functions that are called more than once // This for loop creates a collection of functions that are called more than once
// and emitting them as globally callable functions. // and emitting them as globally callable functions.
// ops implement the is_functionally_identical method // ops implement the is_functionally_identical method
unordered_map<string, string> match_function_map; unordered_map<string, string> match_function_map;
unordered_map<const Node*, string> node_function_map; for (shared_ptr<Function> current_function : m_pass_manager.get_state().get_functions())
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
{ {
list<shared_ptr<Node>> tmp = current_function->get_ordered_ops(); list<shared_ptr<Node>> tmp = m_function_ordered_ops.at(current_function);
if (tmp.size() < 2) if (tmp.size() < 2)
{ {
// Since we are comparing ops there must be at least two ops to proceed. // Since we are comparing ops there must be at least two ops to proceed.
...@@ -481,13 +483,62 @@ using namespace std; ...@@ -481,13 +483,62 @@ using namespace std;
match_function_name = "func_" + node.get_name(); match_function_name = "func_" + node.get_name();
emitted_function.replace(offset, 5, match_function_name); emitted_function.replace(offset, 5, match_function_name);
match_function_map.insert({match_function, match_function_name}); match_function_map.insert({match_function, match_function_name});
writer << emitted_function << "\n"; m_writer << emitted_function << "\n";
} }
node_function_map.insert({&node, match_function_name}); m_node_function_map.insert({&node, match_function_name});
} }
} }
}
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions()) void runtime::gpu::GPU_ExternalFunction::emit_temp_mem_pool_allocation(
shared_ptr<Function> current_function)
{
m_temporaries_used = false;
size_t worst_case_tmp_size = 0;
for (shared_ptr<Node> node : m_function_ordered_ops.at(current_function))
{
if (node->liveness_new_list.size() > 0)
{
m_temporaries_used = true;
for (descriptor::Tensor* tensor : node->liveness_new_list)
{
worst_case_tmp_size += tensor->size();
}
}
}
if (m_temporaries_used)
{
size_t temp_pool_size = current_function->get_temporary_pool_size();
m_writer << "// Allocate the memory pool\n";
// TODO memory pool malloc.
m_writer << "void* pool_base_ptr = ngraph::runtime::gpu::create_gpu_buffer("
<< temp_pool_size << ");\n";
// Add temporaries to the variable name map
for (shared_ptr<Node> node : m_function_ordered_ops.at(current_function))
{
for (descriptor::Tensor* tensor : node->liveness_new_list)
{
stringstream ss;
ss << "((" << tensor->get_element_type().c_type_string()
<< "*)((char *)pool_base_ptr + " << tensor->get_pool_offset() << "))";
m_variable_name_map[tensor->get_name()] = ss.str();
}
}
}
}
void runtime::gpu::GPU_ExternalFunction::emit_temp_mem_pool_release()
{
if (m_temporaries_used)
{
m_writer << "ngraph::runtime::gpu::free_gpu_buffer(pool_base_ptr);\n";
}
}
void runtime::gpu::GPU_ExternalFunction::emit_functions()
{
for (shared_ptr<Function> current_function : m_pass_manager.get_state().get_functions())
{ {
set<string> output_names; set<string> output_names;
for (shared_ptr<Node> op : current_function->get_results()) for (shared_ptr<Node> op : current_function->get_results())
...@@ -496,7 +547,7 @@ using namespace std; ...@@ -496,7 +547,7 @@ using namespace std;
output_names.insert(tv->get_tensor().get_name()); output_names.insert(tv->get_tensor().get_name());
} }
set<descriptor::TensorView*> constants; set<descriptor::TensorView*> constants;
for (shared_ptr<Node> node : current_function->get_ordered_ops()) for (shared_ptr<Node> node : m_function_ordered_ops.at(current_function))
{ {
if (dynamic_cast<ngraph::op::Constant*>(node.get())) if (dynamic_cast<ngraph::op::Constant*>(node.get()))
{ {
...@@ -505,258 +556,192 @@ using namespace std; ...@@ -505,258 +556,192 @@ using namespace std;
} }
} }
writer << "extern \"C\" void " << current_function->get_name(); m_writer << "extern \"C\" void " << current_function->get_name();
writer << "(void** inputs, void** outputs, " m_writer << "(void** inputs, void** outputs, "
<< "gpu::GPURuntimeContext* ctx)\n"; << "gpu::GPURuntimeContext* ctx)\n";
writer << "{\n"; m_writer.block_begin();
writer.indent++;
for (shared_ptr<Node> node : current_function->get_ordered_ops())
{ {
const op::Constant* c = dynamic_cast<op::Constant*>(node.get()); //set constant pointers during the first run
if (c) m_writer << "invoke_constant_mem_ptr(ctx);\n";
{
shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
writer << "if(" << tv->get_tensor().get_name() << " == NULL)\n";
writer << "{\n";
writer.indent++;
writer << tv->get_tensor().get_name() << " = ("
<< tv->get_tensor().get_element_type().c_type_string()
<< " *) runtime::gpu::create_gpu_buffer(" << tv->get_tensor().size()
<< ");\n";
writer << "runtime::gpu::cuda_memcpyHtD(" << tv->get_tensor().get_name() << ", "
<< tv->get_tensor().get_name() << "_cpu, " << tv->get_tensor().size()
<< ");\n";
writer.indent--;
writer << "}\n";
}
}
bool temporaries_used = false; //alocate temp memory pool
size_t worst_case_tmp_size = 0; emit_temp_mem_pool_allocation(current_function);
for (shared_ptr<Node> node : current_function->get_ordered_ops())
{ // Add inputs to the variable name map
if (node->liveness_new_list.size() > 0) size_t arg_index = 0;
{ for (shared_ptr<ngraph::op::Parameter> param : current_function->get_parameters())
temporaries_used = true;
for (descriptor::Tensor* tensor : node->liveness_new_list)
{
worst_case_tmp_size += tensor->size();
}
}
}
if (temporaries_used)
{
size_t temp_pool_size = current_function->get_temporary_pool_size();
writer << "// Allocate the memory pool\n";
// TODO memory pool malloc.
writer << "void* pool_base_ptr = ngraph::runtime::gpu::create_gpu_buffer("
<< temp_pool_size << ");\n";
// Add temporaries to the variable name map
for (shared_ptr<Node> node : current_function->get_ordered_ops())
{ {
for (descriptor::Tensor* tensor : node->liveness_new_list) for (size_t i = 0; i < param->get_output_size(); ++i)
{ {
shared_ptr<descriptor::TensorView> tv = param->get_output_tensor_view(i);
const element::Type& et = tv->get_tensor_view_type()->get_element_type();
string type = et.c_type_string();
stringstream ss; stringstream ss;
ss << "((" << tensor->get_element_type().c_type_string() ss << "((" << type << "*)(inputs[" << arg_index << "]))";
<< "*)((char *)pool_base_ptr + " << tensor->get_pool_offset() << "))"; m_variable_name_map[tv->get_tensor().get_name()] = ss.str();
m_variable_name_map[tensor->get_name()] = ss.str(); arg_index++;
} }
} }
}
// Add inputs to the variable name map // Add outputs to the variable name map
size_t arg_index = 0; for (size_t i = 0; i < current_function->get_output_size(); ++i)
for (shared_ptr<ngraph::op::Parameter> param : current_function->get_parameters())
{
for (size_t i = 0; i < param->get_output_size(); ++i)
{ {
shared_ptr<descriptor::TensorView> tv = param->get_output_tensor_view(i); shared_ptr<Node> op = current_function->get_output_op(i);
const element::Type& et = tv->get_tensor_view_type()->get_element_type(); shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
string type = et.c_type_string(); string type = tv->get_tensor_view_type()->get_element_type().c_type_string();
stringstream ss; stringstream ss;
ss << "((" << type << "*)(inputs[" << arg_index << "]))"; ss << "((" << type << "*)(outputs[" << i << "]))";
m_variable_name_map[tv->get_tensor().get_name()] = ss.str(); m_variable_name_map[tv->get_tensor().get_name()] = ss.str();
arg_index++;
}
}
// create output alias map //it should be safe to assign both descriptors to one output*
size_t output_index = 0; //since needs_copy == false makes `op::Result` an nop
unordered_map<descriptor::TensorView*, vector<size_t>> output_alias_map; auto res = dynamic_pointer_cast<ngraph::op::Result>(op);
vector<size_t> aliases; if (!res->needs_copy())
for (size_t i = 0; i < current_function->get_output_size(); ++i) {
{ shared_ptr<descriptor::TensorView> itv =
shared_ptr<Node> op = current_function->get_output_op(i); res->get_inputs().at(0).get_output().get_tensor_view();
shared_ptr<descriptor::TensorView> otv = op->get_output_tensor_view(); m_variable_name_map[itv->get_tensor().get_name()] = ss.str();
vector<size_t>& al = output_alias_map[otv.get()]; }
al.push_back(output_index);
if (al.size() > 1)
{
aliases.push_back(output_index);
} }
output_index++;
}
// Add outputs to the variable name map for (shared_ptr<Node> node : m_function_ordered_ops.at(current_function))
output_index = 0;
for (size_t i = 0; i < current_function->get_output_size(); ++i)
{
shared_ptr<Node> op = current_function->get_output_op(i);
shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
const element::Type& et = tv->get_tensor_view_type()->get_element_type();
bool parameter_as_output = false;
for (shared_ptr<ngraph::op::Parameter> param : current_function->get_parameters())
{ {
for (const descriptor::Output& pout : param->get_outputs()) auto& n =
*node; // Work around a compiler warning (*node inside typeid may have effects
// with shared pointers, which is fine here but clang doesn't like it.)
auto handler = dispatcher.find(type_index(typeid(n)));
if (handler == dispatcher.end())
{ {
shared_ptr<descriptor::TensorView> ptv = pout.get_tensor_view(); throw ngraph_error("Unhandled op during code generation : " +
if (tv == ptv) node->description());
{
parameter_as_output = true;
writer << "ngraph::runtime::gpu::cuda_memcpyDtD(reinterpret_cast<"
<< et.c_type_string() << "*>(outputs[" << output_index << "]), "
<< m_variable_name_map[ptv->get_tensor().get_name()] << ", "
<< ptv->get_tensor().size() << ");\n";
break;
}
} }
} vector<GPU_TensorViewWrapper> in;
if (!parameter_as_output && !contains(aliases, output_index)) vector<string> node_input_names;
{ vector<string> node_output_names;
if (contains(constants, tv.get())) for (const descriptor::Input& input : node->get_inputs())
{ {
writer << "ngraph::runtime::gpu::cuda_memcpyHtD(outputs[" << output_index const descriptor::Output& output = input.get_output();
<< "], " << tv->get_tensor().get_name() << ", " shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
<< tv->get_tensor().size() << ");\n"; in.push_back(GPU_TensorViewWrapper(
tv, m_variable_name_map[tv->get_tensor().get_name()]));
node_input_names.emplace_back(tv->get_tensor().get_name());
} }
else vector<GPU_TensorViewWrapper> out;
for (const descriptor::Output& output : node->get_outputs())
{ {
string type = et.c_type_string(); shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
stringstream ss; out.push_back(GPU_TensorViewWrapper(
ss << "((" << type << "*)(outputs[" << output_index << "]))"; tv, m_variable_name_map[tv->get_tensor().get_name()]));
m_variable_name_map[tv->get_tensor().get_name()] = ss.str(); node_output_names.emplace_back(tv->get_tensor().get_name());
} }
}
output_index++;
}
for (shared_ptr<Node> node : current_function->get_ordered_ops())
{
auto& n = *node; // Work around a compiler warning (*node inside typeid may have effects
// with shared pointers, which is fine here but clang doesn't like it.)
auto handler = dispatcher.find(type_index(typeid(n)));
if (handler == dispatcher.end())
{
throw ngraph_error("Unhandled op during code generation : " + node->description());
}
vector<GPU_TensorViewWrapper> in;
vector<string> node_input_names;
vector<string> node_output_names;
for (const descriptor::Input& input : node->get_inputs())
{
const descriptor::Output& output = input.get_output();
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
in.push_back(
GPU_TensorViewWrapper(tv, m_variable_name_map[tv->get_tensor().get_name()]));
node_input_names.emplace_back(tv->get_tensor().get_name());
}
vector<GPU_TensorViewWrapper> out;
for (const descriptor::Output& output : node->get_outputs())
{
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
out.push_back(
GPU_TensorViewWrapper(tv, m_variable_name_map[tv->get_tensor().get_name()]));
node_output_names.emplace_back(tv->get_tensor().get_name());
}
// Emit function description comment
if (!node->is_parameter() && !node->is_constant())
{
writer << "\n// " << node->get_name() << "(";
vector<string> parameter_nodes = node_input_names;
parameter_nodes.insert(
parameter_nodes.end(), node_output_names.begin(), node_output_names.end());
writer << join(parameter_nodes);
writer << ")\n";
}
// Emit operation prologue // Emit function description comment
if (!node->is_parameter() && !node->is_constant()) if (!node->is_parameter() && !node->is_constant())
{
if (m_emit_timing)
{ {
emit_debug_function_entry(writer, node.get(), in, out); m_writer << "\n// " << node->get_name() << "(";
vector<string> parameter_nodes = node_input_names;
parameter_nodes.insert(
parameter_nodes.end(), node_output_names.begin(), node_output_names.end());
m_writer << join(parameter_nodes);
m_writer << ")\n";
emit_debug_function_entry(node.get());
} }
}
// Emit operation body // Emit operation body
string func_name; string func_name;
func_name = node_function_map[node.get()]; func_name = m_node_function_map[node.get()];
if (func_name.empty()) if (func_name.empty())
{
//throw runtime_error("No matching function found for '" + node->get_name() + "'");
handler->second(this, writer, node.get(), in, out);
}
else
{
vector<string> names;
for (const GPU_TensorViewWrapper& tv : in)
{ {
names.push_back(tv.get_name()); //throw runtime_error("No matching function found for '" + node->get_name() + "'");
handler->second(this, m_writer, node.get(), in, out);
} }
for (const GPU_TensorViewWrapper& tv : out) else
{ {
names.push_back(tv.get_name()); vector<string> names;
for (const GPU_TensorViewWrapper& tv : in)
{
names.push_back(tv.get_name());
}
for (const GPU_TensorViewWrapper& tv : out)
{
names.push_back(tv.get_name());
}
names.push_back("ctx");
m_writer << func_name << "(" << join(names) << ");\n";
} }
names.push_back("ctx");
writer << func_name << "(" << join(names) << ");\n";
}
// Emit operation epilogue // Emit operation epilogue
if (!node->is_parameter() && !node->is_constant()) if (!node->is_parameter() && !node->is_constant())
{
if (m_emit_timing)
{ {
emit_debug_function_exit(writer, node.get(), in, out); emit_debug_function_exit(node.get());
} }
} }
emit_temp_mem_pool_release();
} }
if (temporaries_used) m_writer.block_end(); // End generated function
{
writer << "ngraph::runtime::gpu::free_gpu_buffer(pool_base_ptr);\n";
}
writer.indent--;
// End generated function
writer += "}\n\n";
} }
}
// allocate device buffers for primitive arguments and workspace void runtime::gpu::GPU_ExternalFunction::store_emitted_functions(const string& code)
m_primitive_emitter->allocate_primitive_memory(); {
// TODO: Cleanup and make this a utility function // TODO: Cleanup and make this a utility function
string filename = file_util::path_join(s_output_dir, function_name + "_codegen.cpp"); string filename = file_util::path_join(s_output_dir, m_function_name + "_codegen.cpp");
ofstream out(filename); ofstream out(filename);
string code = writer.get_code();
out << code; out << code;
out.close(); out.close();
}
void runtime::gpu::GPU_ExternalFunction::compile()
{
if (m_is_compiled)
{
return;
}
m_primitive_emitter.reset(new GPUPrimitiveEmitter());
m_function_name = m_function->get_name();
string dump_filename = file_util::path_join(s_output_dir, m_function_name + "_ops.txt");
// For now, just make everyone row-major.
m_pass_manager.register_pass<pass::ResultCopyElimination>();
m_pass_manager.register_pass<pass::AssignLayout<descriptor::layout::DenseTensorViewLayout>>();
m_pass_manager.register_pass<pass::Liveness>();
m_pass_manager.register_pass<pass::MemoryLayout>(64);
m_pass_manager.register_pass<pass::DumpSorted>(dump_filename);
m_pass_manager.run_passes(m_function);
for (shared_ptr<Function> current_function : m_pass_manager.get_state().get_functions())
{
m_function_ordered_ops.insert({current_function, current_function->get_ordered_ops()});
}
emit_header();
emit_timer_functions();
emit_constant_declarations();
emit_function_declarations();
collect_unique_functions();
emit_functions();
// allocate device buffers for primitive arguments and workspace
m_primitive_emitter->allocate_primitive_memory();
string code = m_writer.get_code();
store_emitted_functions(code);
m_compiler.reset(new codegen::Compiler()); m_compiler.reset(new codegen::Compiler());
m_execution_engine.reset(new codegen::ExecutionEngine()); m_execution_engine.reset(new codegen::ExecutionEngine());
m_compiler->set_precompiled_header_source(m_pch_header_source);
m_compiler->set_precompiled_header_source(pch_header_source);
auto codegen_module = m_compiler->compile(code); auto codegen_module = m_compiler->compile(code);
if (codegen_module == nullptr) if (codegen_module == nullptr)
{ {
throw runtime_error("Function failed to compile to bitcode"); throw runtime_error("Function failed to compile to bitcode");
} }
m_execution_engine->add_module(codegen_module); m_execution_engine->add_module(codegen_module);
m_execution_engine->finalize(); m_execution_engine->finalize();
m_compiled_function = m_execution_engine->find_function<EntryPoint_t>(function_name);
m_compiled_function = m_execution_engine->find_function<EntryPoint_t>(m_function_name);
if (!m_compiled_function) if (!m_compiled_function)
{ {
throw runtime_error("Function failed to compile"); throw runtime_error("Function failed to compile");
...@@ -769,36 +754,6 @@ using namespace std; ...@@ -769,36 +754,6 @@ using namespace std;
} }
} }
void runtime::gpu::GPU_ExternalFunction::handle_output_alias(
codegen::CodeWriter& writer,
const Node& node,
const unordered_map<descriptor::TensorView*, vector<size_t>>& output_alias_map)
{
for (const descriptor::Output& output : node.get_outputs())
{
shared_ptr<descriptor::TensorView> otv = output.get_tensor_view();
auto it = output_alias_map.find(otv.get());
if (it != output_alias_map.end())
{
const vector<size_t>& outputs = it->second;
if (outputs.size() > 1)
{
writer << "{ // handle output alias for previous op\n";
writer.indent++;
for (size_t i = 1; i < outputs.size(); i++)
{
writer << "ngraph::runtime::gpu::cuda_memcpyDtD(static_cast<void*>("
"outputs["
<< outputs[i] << "]), static_cast<void*>(outputs[" << outputs[0]
<< "]), " << otv->get_tensor().size() << ");\n";
}
writer.indent--;
writer << "}\n";
}
}
}
}
shared_ptr<ngraph::runtime::gpu::GPU_CallFrame> shared_ptr<ngraph::runtime::gpu::GPU_CallFrame>
runtime::gpu::GPU_ExternalFunction::make_call_frame() runtime::gpu::GPU_ExternalFunction::make_call_frame()
{ {
...@@ -810,35 +765,27 @@ shared_ptr<ngraph::runtime::gpu::GPU_CallFrame> ...@@ -810,35 +765,27 @@ shared_ptr<ngraph::runtime::gpu::GPU_CallFrame>
return make_shared<GPU_CallFrame>(shared_from_this(), m_compiled_function); return make_shared<GPU_CallFrame>(shared_from_this(), m_compiled_function);
} }
void runtime::gpu::GPU_ExternalFunction::emit_debug_function_entry( void runtime::gpu::GPU_ExternalFunction::emit_debug_function_entry(Node* node)
codegen::CodeWriter& writer,
Node* node,
const std::vector<GPU_TensorViewWrapper>& in,
const std::vector<GPU_TensorViewWrapper>& out)
{ {
writer << "timers[" << m_name_index_map[node->get_name()] << "].start();\n"; if (m_emit_timing)
{
m_writer << "timers[" << m_name_index_map[node->get_name()] << "].start();\n";
}
} }
// Emits a `timers[<index>].stop();` statement for `node`, where <index> is
// read from m_name_index_map using the node's name as key.
// NOTE(review): this span is a side-by-side diff rendering, so each line
// carries the pre-commit text followed by the post-commit text. The
// pre-commit form takes (writer, node, in, out) and always writes to `writer`;
// the post-commit form takes only `node`, writes to m_writer, and emits only
// when m_emit_timing is set.
void runtime::gpu::GPU_ExternalFunction::emit_debug_function_exit( void runtime::gpu::GPU_ExternalFunction::emit_debug_function_exit(Node* node)
codegen::CodeWriter& writer,
Node* node,
const std::vector<GPU_TensorViewWrapper>& in,
const std::vector<GPU_TensorViewWrapper>& out)
{ {
writer << "timers[" << m_name_index_map[node->get_name()] << "].stop();\n"; if (m_emit_timing)
{
m_writer << "timers[" << m_name_index_map[node->get_name()] << "].stop();\n";
}
} }
// Returns a non-const reference to the m_ctx member itself (the unique_ptr,
// not a copy of it), so ownership of the GPURuntimeContext stays with this
// object.
// NOTE(review): side-by-side diff rendering -- each line shows the pre-commit
// text followed by the post-commit text; the two differ only in the `std::`
// qualifier on the return type.
std::unique_ptr<runtime::gpu::GPURuntimeContext>& runtime::gpu::GPU_ExternalFunction::ctx() unique_ptr<runtime::gpu::GPURuntimeContext>& runtime::gpu::GPU_ExternalFunction::ctx()
{ {
return m_ctx; return m_ctx;
} }
bool runtime::gpu::GPU_ExternalFunction::is_functionally_identical(
const Node& n1, const Node& n2, const unordered_map<const Node*, string>& node_cache) const
{
return node_cache.at(&n1) == node_cache.at(&n2);
}
string runtime::gpu::GPU_ExternalFunction::emit_op_as_function(const Node& node, string runtime::gpu::GPU_ExternalFunction::emit_op_as_function(const Node& node,
const string& function_name) const string& function_name)
{ {
......
...@@ -26,6 +26,12 @@ ...@@ -26,6 +26,12 @@
#include "ngraph/codegen/compiler.hpp" #include "ngraph/codegen/compiler.hpp"
#include "ngraph/codegen/execution_engine.hpp" #include "ngraph/codegen/execution_engine.hpp"
#include "ngraph/function.hpp" #include "ngraph/function.hpp"
#include "ngraph/pass/assign_layout.hpp"
#include "ngraph/pass/dump_sorted.hpp"
#include "ngraph/pass/liveness.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/memory_layout.hpp"
#include "ngraph/pass/result_copy_elimination.hpp"
#include "ngraph/runtime/gpu/gpu_call_frame.hpp" #include "ngraph/runtime/gpu/gpu_call_frame.hpp"
#include "ngraph/runtime/gpu/gpu_primitive_emitter.hpp" #include "ngraph/runtime/gpu/gpu_primitive_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_tensor_view_wrapper.hpp" #include "ngraph/runtime/gpu/gpu_tensor_view_wrapper.hpp"
...@@ -58,6 +64,7 @@ namespace ngraph ...@@ -58,6 +64,7 @@ namespace ngraph
GPU_ExternalFunction(const std::shared_ptr<ngraph::Function>& function, GPU_ExternalFunction(const std::shared_ptr<ngraph::Function>& function,
bool release_function = true); bool release_function = true);
~GPU_ExternalFunction(); ~GPU_ExternalFunction();
std::shared_ptr<ngraph::runtime::gpu::GPU_CallFrame> make_call_frame(); std::shared_ptr<ngraph::runtime::gpu::GPU_CallFrame> make_call_frame();
std::unique_ptr<runtime::gpu::GPURuntimeContext>& ctx(); std::unique_ptr<runtime::gpu::GPURuntimeContext>& ctx();
const std::unique_ptr<GPUPrimitiveEmitter>& get_primitive_emitter() const const std::unique_ptr<GPUPrimitiveEmitter>& get_primitive_emitter() const
...@@ -71,39 +78,46 @@ namespace ngraph ...@@ -71,39 +78,46 @@ namespace ngraph
EntryPoint m_compiled_function; EntryPoint m_compiled_function;
private: private:
void emit_debug_function_entry(codegen::CodeWriter& writer, void collect_unique_functions();
Node* node, void emit_header();
const std::vector<GPU_TensorViewWrapper>& in, void emit_timer_functions();
const std::vector<GPU_TensorViewWrapper>& out); void emit_constant_declarations();
void emit_debug_function_exit(codegen::CodeWriter& writer, void emit_function_declarations();
Node* node, void emit_functions();
const std::vector<GPU_TensorViewWrapper>& in, void emit_debug_function_entry(Node* node);
const std::vector<GPU_TensorViewWrapper>& out); void emit_debug_function_exit(Node* node);
void handle_output_alias( void emit_temp_mem_pool_allocation(std::shared_ptr<Function> current_function);
codegen::CodeWriter& writer, void emit_temp_mem_pool_release();
const Node&,
const std::unordered_map<descriptor::TensorView*, std::vector<size_t>>&);
// Resets the m_function member to nullptr, dropping this object's reference
// to the function. NOTE(review): side-by-side diff rendering -- the line holds
// the unchanged pre- and post-commit text back to back.
void release_function() { m_function = nullptr; } void release_function() { m_function = nullptr; }
void store_emitted_functions(const std::string& code);
std::string emit_op_as_function(const Node& node, const std::string& function_name); std::string emit_op_as_function(const Node& node, const std::string& function_name);
std::string strip_comments(const std::string& s) const; std::string strip_comments(const std::string& s) const;
bool is_functionally_identical(
const Node& n1, codegen::CodeWriter m_writer;
const Node& n2, pass::Manager m_pass_manager;
const std::unordered_map<const Node*, std::string>& node_cache) const;
std::unique_ptr<codegen::Compiler> m_compiler; std::unique_ptr<codegen::Compiler> m_compiler;
std::unique_ptr<codegen::ExecutionEngine> m_execution_engine; std::unique_ptr<codegen::ExecutionEngine> m_execution_engine;
bool m_emit_timing; std::unique_ptr<GPUPrimitiveEmitter> m_primitive_emitter;
std::unordered_map<std::string, std::string> m_variable_name_map; std::unique_ptr<GPURuntimeContext> m_ctx;
std::map<std::string, size_t> m_name_index_map;
std::shared_ptr<ngraph::Function> m_function; std::shared_ptr<ngraph::Function> m_function;
bool m_release_function;
std::map<std::string, size_t> m_name_index_map;
std::unordered_map<std::string, std::string> m_variable_name_map;
std::unordered_map<const Node*, std::string> m_node_function_map;
std::unordered_map<std::shared_ptr<Function>, std::list<std::shared_ptr<Node>>>
m_function_ordered_ops;
bool m_emit_timing;
bool m_is_compiled; bool m_is_compiled;
bool m_release_function;
bool m_temporaries_used;
std::string m_function_name;
std::string m_pch_header_source;
cublasHandle_t m_cublas_handle; cublasHandle_t m_cublas_handle;
cudnnHandle_t m_cudnn_handle; cudnnHandle_t m_cudnn_handle;
std::unique_ptr<GPUPrimitiveEmitter> m_primitive_emitter;
std::unique_ptr<GPURuntimeContext> m_ctx;
}; };
} }
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment