Commit 260cb90d authored by Fenglei, committed by Scott Cyphers

gpu_external_function and gpu constant memory refactor (#1189)

* refactor external function

* working version

* fix bug

* add emit_functions, emit_declare_constants, emit_declare_functions

* add std::

* add functions declaration

* fix bugs

* fix bugs

* separate temp memory allocation and release

* add invoke_constant_ptr function, clean up outputs for function (see the sketch after the commit header)

* fix bugs, compiled ok

* add ctx to emit_declare_constant

* cleanup code, code style

* remove using std, code style

* revert std changes

* change function names based on Chris's comments

* add ResultCopyElimination to pass_manager

* clang format
parent 2c345798
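
The core of the constant-memory refactor is that the generated code no longer embeds host-side value arrays and copies them on demand inside each function; instead, emit_constant_declarations reserves argspace for each op::Constant up front, and the generated invoke_constant_mem_ptr resolves the device pointers once, on the first call into a compiled function. Below is a minimal standalone sketch of that lazy-initialization pattern, not the nGraph sources: the names constant_12/constant_12_cpu are hypothetical, and std::malloc/std::memcpy stand in for runtime::gpu::invoke_memory_primitive and the host-to-device copy used in the real generated code.

#include <cstdlib>
#include <cstring>
#include <iostream>

// Host copy of a (hypothetical) constant tensor, and the pointer that the
// generated code would aim at device memory on first use.
static const float constant_12_cpu[4] = {1.f, 2.f, 3.f, 4.f};
static float* constant_12 = nullptr;
static bool is_constant_mem_ptr_null = true;

static void invoke_constant_mem_ptr()
{
    if (is_constant_mem_ptr_null)
    {
        // Stand-in for fetching the reserved argspace slot and filling it with
        // the host values (invoke_memory_primitive + HtD copy in the real code).
        constant_12 = static_cast<float*>(std::malloc(sizeof(constant_12_cpu)));
        std::memcpy(constant_12, constant_12_cpu, sizeof(constant_12_cpu));
        is_constant_mem_ptr_null = false;
    }
}

int main()
{
    invoke_constant_mem_ptr(); // first call resolves and fills the pointer
    invoke_constant_mem_ptr(); // subsequent calls are no-ops
    std::cout << constant_12[2] << "\n"; // prints 3
    std::free(constant_12);
    return 0;
}

In the diff below, the same guard appears in the emitted code as is_constant_mem_ptr_null and invoke_constant_mem_ptr(ctx), called at the top of every generated entry-point function.
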
@@ -20,16 +20,9 @@
#include <cuda_runtime.h>
#include <cudnn.h>
#include <fstream>
#include <memory>
#include <string>
#include <tuple>
#include <typeindex>
#include <typeinfo>
#include <unordered_map>
#include "ngraph/codegen/code_writer.hpp"
#include "ngraph/codegen/compiler.hpp"
#include "ngraph/codegen/execution_engine.hpp"
#include "ngraph/descriptor/input.hpp"
#include "ngraph/descriptor/layout/dense_tensor_view_layout.hpp"
#include "ngraph/descriptor/output.hpp"
@@ -104,13 +97,7 @@
#include "ngraph/op/sum.hpp"
#include "ngraph/op/tan.hpp"
#include "ngraph/op/tanh.hpp"
#include "ngraph/pass/assign_layout.hpp"
#include "ngraph/pass/dump_sorted.hpp"
#include "ngraph/pass/liveness.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/memory_layout.hpp"
#include "ngraph/runtime/gpu/gpu_backend.hpp"
#include "ngraph/runtime/gpu/gpu_call_frame.hpp"
#include "ngraph/runtime/gpu/gpu_cuda_kernel_emitters.hpp"
#include "ngraph/runtime/gpu/gpu_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_external_function.hpp"
@@ -252,11 +239,12 @@ static const runtime::gpu::OpMap dispatcher{
runtime::gpu::GPU_ExternalFunction::GPU_ExternalFunction(
const shared_ptr<ngraph::Function>& function, bool release_function)
: m_compiled_function(nullptr)
, m_emit_timing(false)
, m_ctx(new GPURuntimeContext)
, m_function(function)
, m_release_function(release_function)
, m_emit_timing(false)
, m_is_compiled(false)
, m_ctx(new GPURuntimeContext)
, m_release_function(release_function)
, m_temporaries_used(false)
{
// Create a context using the driver API and make it current; the runtime calls will pick up the context
// http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html
@@ -289,37 +277,10 @@ runtime::gpu::GPU_ExternalFunction::~GPU_ExternalFunction()
delete m_ctx->compiled_kernel_pool;
}
void runtime::gpu::GPU_ExternalFunction::compile()
void runtime::gpu::GPU_ExternalFunction::emit_header()
{
if (m_is_compiled)
{
return;
}
m_primitive_emitter.reset(new GPUPrimitiveEmitter());
string function_name = m_function->get_name();
string dump_filename = file_util::path_join(s_output_dir, function_name + "_ops.txt");
pass::Manager pass_manager;
// pass_manager.register_pass<pass::TopologicalSort>();
// For now, just make everyone row-major.
pass_manager.register_pass<pass::AssignLayout<descriptor::layout::DenseTensorViewLayout>>();
pass_manager.register_pass<pass::Liveness>();
pass_manager.register_pass<pass::MemoryLayout>(64);
pass_manager.register_pass<pass::DumpSorted>(dump_filename);
pass_manager.run_passes(m_function);
unordered_map<shared_ptr<Function>, list<shared_ptr<Node>>> function_ordered_ops;
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
{
function_ordered_ops.insert({current_function, current_function->get_ordered_ops()});
}
codegen::CodeWriter writer;
writer +=
R"(// Generated by the nGraph GPU backend
m_writer += R"(
// Generated by the nGraph GPU backend
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_runtime.h>
@@ -348,9 +309,9 @@ void runtime::gpu::GPU_ExternalFunction::compile()
#include "ngraph/util.hpp"
)";
string pch_header_source = writer.get_code();
m_pch_header_source = m_writer.get_code();
writer += R"(
m_writer += R"(
using namespace ngraph;
using namespace ngraph::runtime;
using namespace std;
@@ -360,15 +321,19 @@ using namespace std;
// which is enabled because the JIT uses it as the default mechanism
// to register cleanup handlers. We use it, and not atexit(), because
// atexit() happens too late, when the JIT is no longer alive
writer << "void *__dso_handle = 0;\n\n";
m_writer << "void *__dso_handle = 0;\n\n";
}
void runtime::gpu::GPU_ExternalFunction::emit_timer_functions()
{
if (m_emit_timing)
{
writer << "// Declare debug timers\n";
m_writer << "// Declare debug timers\n";
vector<string> names;
size_t index = 0;
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
for (shared_ptr<Function> current_function : m_pass_manager.get_state().get_functions())
{
for (shared_ptr<Node> node : function_ordered_ops.at(current_function))
for (shared_ptr<Node> node : m_function_ordered_ops.at(current_function))
{
if (!node->is_parameter() && !node->is_constant())
{
@@ -377,77 +342,114 @@ using namespace std;
}
}
}
writer << "ngraph::stopwatch timers[" << names.size() << "];\n";
writer << "extern \"C\" size_t get_debug_timer_count() { return " << names.size()
m_writer << "ngraph::stopwatch timers[" << names.size() << "];\n";
m_writer << "extern \"C\" size_t get_debug_timer_count() { return " << names.size()
<< "; }\n";
writer << "extern \"C\" const char* get_debug_timer_name(size_t index)\n";
writer.block_begin();
writer << "static const char* timer_names[" << names.size() << "] =\n";
writer.block_begin();
m_writer << "extern \"C\" const char* get_debug_timer_name(size_t index)\n";
m_writer.block_begin();
m_writer << "static const char* timer_names[" << names.size() << "] =\n";
m_writer.block_begin();
vector<string> quoted_names;
for (const string& name : names)
{
quoted_names.push_back("\"" + name + "\"");
}
writer << emit_string_array(quoted_names, 100 - (4 * 2 + 1));
writer.indent--;
writer << "\n};\n";
writer << "return timer_names[index];\n";
writer.block_end();
m_writer << emit_string_array(quoted_names, 100 - (4 * 2 + 1));
m_writer.indent--;
m_writer << "\n};\n";
m_writer << "return timer_names[index];\n";
m_writer.block_end();
writer << "extern \"C\" const size_t get_debug_timer_microseconds(size_t index)\n";
writer.block_begin();
writer << "return (index < " << names.size()
m_writer << "extern \"C\" const size_t get_debug_timer_microseconds(size_t index)\n";
m_writer.block_begin();
m_writer << "return (index < " << names.size()
<< " ? timers[index].get_total_microseconds() : 0);\n";
writer.block_end();
m_writer.block_end();
writer << "extern \"C\" const size_t get_debug_timer_call_count(size_t index)\n";
writer.block_begin();
writer << "return (index < " << names.size() << " ? timers[index].get_call_count() : 0);\n";
writer.block_end();
writer << "\n";
m_writer << "extern \"C\" const size_t get_debug_timer_call_count(size_t index)\n";
m_writer.block_begin();
m_writer << "return (index < " << names.size()
<< " ? timers[index].get_call_count() : 0);\n";
m_writer.block_end();
m_writer << "\n";
}
writer << "// Declare all constants\n";
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
}
void runtime::gpu::GPU_ExternalFunction::emit_constant_declarations()
{
m_writer << "// Declare all constants\n";
for (shared_ptr<Function> current_function : m_pass_manager.get_state().get_functions())
{
for (shared_ptr<Node> node : current_function->get_ordered_ops())
for (shared_ptr<Node> node : m_function_ordered_ops.at(current_function))
{
const op::Constant* c = dynamic_cast<ngraph::op::Constant*>(node.get());
if (c)
{
shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
auto c_value_strings = c->get_value_strings();
writer << "static " << tv->get_tensor().get_element_type().c_type_string() << " *"
<< tv->get_tensor().get_name() << ";\n";
writer << "static " << tv->get_tensor().get_element_type().c_type_string() << " "
<< tv->get_tensor().get_name() << "_cpu[" << c_value_strings.size()
<< "] =\n";
writer << "{\n";
writer.indent++;
writer << emit_string_array(c_value_strings, 100 - writer.indent * 4);
writer.indent--;
writer << "\n};\n\n";
// get an allocator for transient per-kernel GPU memory
GPUAllocator allocator = this->m_primitive_emitter->get_memory_allocator();
size_t idx = allocator.reserve_argspace(
c->get_data_ptr(),
tv->get_tensor().size() * tv->get_tensor().get_element_type().size());
m_writer << "static size_t " << tv->get_tensor().get_name() << "_idx = " << idx
<< ";\n";
m_writer << "static " << tv->get_tensor().get_element_type().c_type_string() << "* "
<< tv->get_tensor().get_name() << " = nullptr;\n";
m_variable_name_map[tv->get_tensor().get_name()] = tv->get_tensor().get_name();
}
}
}
writer << "// Declare all functions\n";
for (shared_ptr<Function> f : pass_manager.get_state().get_functions())
m_writer << "\nstatic bool is_constant_mem_ptr_null = true;\n\n";
m_writer << "static void invoke_constant_mem_ptr(gpu::GPURuntimeContext* ctx)\n";
m_writer.block_begin();
{
m_writer << "if(is_constant_mem_ptr_null)\n";
m_writer.block_begin();
{
for (shared_ptr<Function> current_function : m_pass_manager.get_state().get_functions())
{
writer << "extern \"C\" void " << f->get_name() << "(void** inputs, void** outputs, "
for (shared_ptr<Node> node : m_function_ordered_ops.at(current_function))
{
const op::Constant* c = dynamic_cast<ngraph::op::Constant*>(node.get());
if (c)
{
shared_ptr<descriptor::TensorView> tv =
node->get_outputs()[0].get_tensor_view();
m_writer << tv->get_tensor().get_name() << " = reinterpret_cast<"
<< tv->get_tensor().get_element_type().c_type_string()
<< "*>(runtime::gpu::invoke_memory_primitive(ctx, "
<< tv->get_tensor().get_name() << "_idx));\n";
}
}
}
m_writer << "is_constant_mem_ptr_null = false;\n";
}
m_writer.block_end();
}
m_writer.block_end();
}
void runtime::gpu::GPU_ExternalFunction::emit_function_declarations()
{
m_writer << "// Declare all functions\n";
for (shared_ptr<Function> f : m_pass_manager.get_state().get_functions())
{
m_writer << "extern \"C\" void " << f->get_name() << "(void** inputs, void** outputs, "
<< "gpu::GPURuntimeContext* ctx);\n";
}
writer << "\n";
m_writer << "\n";
}
void runtime::gpu::GPU_ExternalFunction::collect_unique_functions()
{
// This loop creates a collection of functions that are called more than once
// and emits them as globally callable functions.
// Ops implement the is_functionally_identical method
unordered_map<string, string> match_function_map;
unordered_map<const Node*, string> node_function_map;
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
for (shared_ptr<Function> current_function : m_pass_manager.get_state().get_functions())
{
list<shared_ptr<Node>> tmp = current_function->get_ordered_ops();
list<shared_ptr<Node>> tmp = m_function_ordered_ops.at(current_function);
if (tmp.size() < 2)
{
// Since we are comparing ops there must be at least two ops to proceed.
@@ -481,80 +483,39 @@ using namespace std;
match_function_name = "func_" + node.get_name();
emitted_function.replace(offset, 5, match_function_name);
match_function_map.insert({match_function, match_function_name});
writer << emitted_function << "\n";
}
node_function_map.insert({&node, match_function_name});
}
m_writer << emitted_function << "\n";
}
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
{
set<string> output_names;
for (shared_ptr<Node> op : current_function->get_results())
{
shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
output_names.insert(tv->get_tensor().get_name());
}
set<descriptor::TensorView*> constants;
for (shared_ptr<Node> node : current_function->get_ordered_ops())
{
if (dynamic_cast<ngraph::op::Constant*>(node.get()))
{
shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
constants.insert(tv.get());
}
}
writer << "extern \"C\" void " << current_function->get_name();
writer << "(void** inputs, void** outputs, "
<< "gpu::GPURuntimeContext* ctx)\n";
writer << "{\n";
writer.indent++;
for (shared_ptr<Node> node : current_function->get_ordered_ops())
{
const op::Constant* c = dynamic_cast<op::Constant*>(node.get());
if (c)
{
shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
writer << "if(" << tv->get_tensor().get_name() << " == NULL)\n";
writer << "{\n";
writer.indent++;
writer << tv->get_tensor().get_name() << " = ("
<< tv->get_tensor().get_element_type().c_type_string()
<< " *) runtime::gpu::create_gpu_buffer(" << tv->get_tensor().size()
<< ");\n";
writer << "runtime::gpu::cuda_memcpyHtD(" << tv->get_tensor().get_name() << ", "
<< tv->get_tensor().get_name() << "_cpu, " << tv->get_tensor().size()
<< ");\n";
writer.indent--;
writer << "}\n";
m_node_function_map.insert({&node, match_function_name});
}
}
}
bool temporaries_used = false;
void runtime::gpu::GPU_ExternalFunction::emit_temp_mem_pool_allocation(
shared_ptr<Function> current_function)
{
m_temporaries_used = false;
size_t worst_case_tmp_size = 0;
for (shared_ptr<Node> node : current_function->get_ordered_ops())
for (shared_ptr<Node> node : m_function_ordered_ops.at(current_function))
{
if (node->liveness_new_list.size() > 0)
{
temporaries_used = true;
m_temporaries_used = true;
for (descriptor::Tensor* tensor : node->liveness_new_list)
{
worst_case_tmp_size += tensor->size();
}
}
}
if (temporaries_used)
if (m_temporaries_used)
{
size_t temp_pool_size = current_function->get_temporary_pool_size();
writer << "// Allocate the memory pool\n";
m_writer << "// Allocate the memory pool\n";
// TODO memory pool malloc.
writer << "void* pool_base_ptr = ngraph::runtime::gpu::create_gpu_buffer("
m_writer << "void* pool_base_ptr = ngraph::runtime::gpu::create_gpu_buffer("
<< temp_pool_size << ");\n";
// Add temporaries to the variable name map
for (shared_ptr<Node> node : current_function->get_ordered_ops())
for (shared_ptr<Node> node : m_function_ordered_ops.at(current_function))
{
for (descriptor::Tensor* tensor : node->liveness_new_list)
{
@@ -565,6 +526,46 @@ using namespace std;
}
}
}
}
void runtime::gpu::GPU_ExternalFunction::emit_temp_mem_pool_release()
{
if (m_temporaries_used)
{
m_writer << "ngraph::runtime::gpu::free_gpu_buffer(pool_base_ptr);\n";
}
}
void runtime::gpu::GPU_ExternalFunction::emit_functions()
{
for (shared_ptr<Function> current_function : m_pass_manager.get_state().get_functions())
{
set<string> output_names;
for (shared_ptr<Node> op : current_function->get_results())
{
shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
output_names.insert(tv->get_tensor().get_name());
}
set<descriptor::TensorView*> constants;
for (shared_ptr<Node> node : m_function_ordered_ops.at(current_function))
{
if (dynamic_cast<ngraph::op::Constant*>(node.get()))
{
shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
constants.insert(tv.get());
}
}
m_writer << "extern \"C\" void " << current_function->get_name();
m_writer << "(void** inputs, void** outputs, "
<< "gpu::GPURuntimeContext* ctx)\n";
m_writer.block_begin();
{
// set constant pointers during the first run
m_writer << "invoke_constant_mem_ptr(ctx);\n";
// allocate the temp memory pool
emit_temp_mem_pool_allocation(current_function);
// Add inputs to the variable name map
size_t arg_index = 0;
@@ -582,74 +583,37 @@ using namespace std;
}
}
// create output alias map
size_t output_index = 0;
unordered_map<descriptor::TensorView*, vector<size_t>> output_alias_map;
vector<size_t> aliases;
for (size_t i = 0; i < current_function->get_output_size(); ++i)
{
shared_ptr<Node> op = current_function->get_output_op(i);
shared_ptr<descriptor::TensorView> otv = op->get_output_tensor_view();
vector<size_t>& al = output_alias_map[otv.get()];
al.push_back(output_index);
if (al.size() > 1)
{
aliases.push_back(output_index);
}
output_index++;
}
// Add outputs to the variable name map
output_index = 0;
for (size_t i = 0; i < current_function->get_output_size(); ++i)
{
shared_ptr<Node> op = current_function->get_output_op(i);
shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
const element::Type& et = tv->get_tensor_view_type()->get_element_type();
bool parameter_as_output = false;
for (shared_ptr<ngraph::op::Parameter> param : current_function->get_parameters())
{
for (const descriptor::Output& pout : param->get_outputs())
{
shared_ptr<descriptor::TensorView> ptv = pout.get_tensor_view();
if (tv == ptv)
{
parameter_as_output = true;
writer << "ngraph::runtime::gpu::cuda_memcpyDtD(reinterpret_cast<"
<< et.c_type_string() << "*>(outputs[" << output_index << "]), "
<< m_variable_name_map[ptv->get_tensor().get_name()] << ", "
<< ptv->get_tensor().size() << ");\n";
break;
}
}
}
if (!parameter_as_output && !contains(aliases, output_index))
{
if (contains(constants, tv.get()))
{
writer << "ngraph::runtime::gpu::cuda_memcpyHtD(outputs[" << output_index
<< "], " << tv->get_tensor().get_name() << ", "
<< tv->get_tensor().size() << ");\n";
}
else
{
string type = et.c_type_string();
string type = tv->get_tensor_view_type()->get_element_type().c_type_string();
stringstream ss;
ss << "((" << type << "*)(outputs[" << output_index << "]))";
ss << "((" << type << "*)(outputs[" << i << "]))";
m_variable_name_map[tv->get_tensor().get_name()] = ss.str();
// it should be safe to assign both descriptors to one output
// since needs_copy == false makes `op::Result` a no-op
auto res = dynamic_pointer_cast<ngraph::op::Result>(op);
if (!res->needs_copy())
{
shared_ptr<descriptor::TensorView> itv =
res->get_inputs().at(0).get_output().get_tensor_view();
m_variable_name_map[itv->get_tensor().get_name()] = ss.str();
}
}
output_index++;
}
for (shared_ptr<Node> node : current_function->get_ordered_ops())
for (shared_ptr<Node> node : m_function_ordered_ops.at(current_function))
{
auto& n = *node; // Work around a compiler warning (*node inside typeid may have effects
auto& n =
*node; // Work around a compiler warning (*node inside typeid may have effects
// with shared pointers, which is fine here but clang doesn't like it.)
auto handler = dispatcher.find(type_index(typeid(n)));
if (handler == dispatcher.end())
{
throw ngraph_error("Unhandled op during code generation : " + node->description());
throw ngraph_error("Unhandled op during code generation : " +
node->description());
}
vector<GPU_TensorViewWrapper> in;
vector<string> node_input_names;
@@ -658,46 +622,38 @@ using namespace std;
{
const descriptor::Output& output = input.get_output();
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
in.push_back(
GPU_TensorViewWrapper(tv, m_variable_name_map[tv->get_tensor().get_name()]));
in.push_back(GPU_TensorViewWrapper(
tv, m_variable_name_map[tv->get_tensor().get_name()]));
node_input_names.emplace_back(tv->get_tensor().get_name());
}
vector<GPU_TensorViewWrapper> out;
for (const descriptor::Output& output : node->get_outputs())
{
shared_ptr<descriptor::TensorView> tv = output.get_tensor_view();
out.push_back(
GPU_TensorViewWrapper(tv, m_variable_name_map[tv->get_tensor().get_name()]));
out.push_back(GPU_TensorViewWrapper(
tv, m_variable_name_map[tv->get_tensor().get_name()]));
node_output_names.emplace_back(tv->get_tensor().get_name());
}
// Emit function description comment
if (!node->is_parameter() && !node->is_constant())
{
writer << "\n// " << node->get_name() << "(";
m_writer << "\n// " << node->get_name() << "(";
vector<string> parameter_nodes = node_input_names;
parameter_nodes.insert(
parameter_nodes.end(), node_output_names.begin(), node_output_names.end());
writer << join(parameter_nodes);
writer << ")\n";
}
// Emit operation prologue
if (!node->is_parameter() && !node->is_constant())
{
if (m_emit_timing)
{
emit_debug_function_entry(writer, node.get(), in, out);
}
m_writer << join(parameter_nodes);
m_writer << ")\n";
emit_debug_function_entry(node.get());
}
// Emit operation body
string func_name;
func_name = node_function_map[node.get()];
func_name = m_node_function_map[node.get()];
if (func_name.empty())
{
//throw runtime_error("No matching function found for '" + node->get_name() + "'");
handler->second(this, writer, node.get(), in, out);
handler->second(this, m_writer, node.get(), in, out);
}
else
{
@@ -711,52 +667,81 @@ using namespace std;
names.push_back(tv.get_name());
}
names.push_back("ctx");
writer << func_name << "(" << join(names) << ");\n";
m_writer << func_name << "(" << join(names) << ");\n";
}
// Emit operation epilogue
if (!node->is_parameter() && !node->is_constant())
{
if (m_emit_timing)
{
emit_debug_function_exit(writer, node.get(), in, out);
emit_debug_function_exit(node.get());
}
}
emit_temp_mem_pool_release();
}
m_writer.block_end(); // End generated function
}
if (temporaries_used)
}
void runtime::gpu::GPU_ExternalFunction::store_emitted_functions(const string& code)
{
// TODO: Cleanup and make this a utility function
string filename = file_util::path_join(s_output_dir, m_function_name + "_codegen.cpp");
ofstream out(filename);
out << code;
out.close();
}
void runtime::gpu::GPU_ExternalFunction::compile()
{
if (m_is_compiled)
{
writer << "ngraph::runtime::gpu::free_gpu_buffer(pool_base_ptr);\n";
return;
}
writer.indent--;
// End generated function
writer += "}\n\n";
m_primitive_emitter.reset(new GPUPrimitiveEmitter());
m_function_name = m_function->get_name();
string dump_filename = file_util::path_join(s_output_dir, m_function_name + "_ops.txt");
// For now, just make everyone row-major.
m_pass_manager.register_pass<pass::ResultCopyElimination>();
m_pass_manager.register_pass<pass::AssignLayout<descriptor::layout::DenseTensorViewLayout>>();
m_pass_manager.register_pass<pass::Liveness>();
m_pass_manager.register_pass<pass::MemoryLayout>(64);
m_pass_manager.register_pass<pass::DumpSorted>(dump_filename);
m_pass_manager.run_passes(m_function);
for (shared_ptr<Function> current_function : m_pass_manager.get_state().get_functions())
{
m_function_ordered_ops.insert({current_function, current_function->get_ordered_ops()});
}
emit_header();
emit_timer_functions();
emit_constant_declarations();
emit_function_declarations();
collect_unique_functions();
emit_functions();
// allocate device buffers for primitive arguments and workspace
m_primitive_emitter->allocate_primitive_memory();
// TODO: Cleanup and make this a utility function
string filename = file_util::path_join(s_output_dir, function_name + "_codegen.cpp");
ofstream out(filename);
string code = writer.get_code();
out << code;
out.close();
string code = m_writer.get_code();
store_emitted_functions(code);
m_compiler.reset(new codegen::Compiler());
m_execution_engine.reset(new codegen::ExecutionEngine());
m_compiler->set_precompiled_header_source(pch_header_source);
m_compiler->set_precompiled_header_source(m_pch_header_source);
auto codegen_module = m_compiler->compile(code);
if (codegen_module == nullptr)
{
throw runtime_error("Function failed to compile to bitcode");
}
m_execution_engine->add_module(codegen_module);
m_execution_engine->finalize();
m_compiled_function = m_execution_engine->find_function<EntryPoint_t>(function_name);
m_compiled_function = m_execution_engine->find_function<EntryPoint_t>(m_function_name);
if (!m_compiled_function)
{
throw runtime_error("Function failed to compile");
@@ -769,36 +754,6 @@ using namespace std;
}
}
void runtime::gpu::GPU_ExternalFunction::handle_output_alias(
codegen::CodeWriter& writer,
const Node& node,
const unordered_map<descriptor::TensorView*, vector<size_t>>& output_alias_map)
{
for (const descriptor::Output& output : node.get_outputs())
{
shared_ptr<descriptor::TensorView> otv = output.get_tensor_view();
auto it = output_alias_map.find(otv.get());
if (it != output_alias_map.end())
{
const vector<size_t>& outputs = it->second;
if (outputs.size() > 1)
{
writer << "{ // handle output alias for previous op\n";
writer.indent++;
for (size_t i = 1; i < outputs.size(); i++)
{
writer << "ngraph::runtime::gpu::cuda_memcpyDtD(static_cast<void*>("
"outputs["
<< outputs[i] << "]), static_cast<void*>(outputs[" << outputs[0]
<< "]), " << otv->get_tensor().size() << ");\n";
}
writer.indent--;
writer << "}\n";
}
}
}
}
shared_ptr<ngraph::runtime::gpu::GPU_CallFrame>
runtime::gpu::GPU_ExternalFunction::make_call_frame()
{
@@ -810,35 +765,27 @@ shared_ptr<ngraph::runtime::gpu::GPU_CallFrame>
return make_shared<GPU_CallFrame>(shared_from_this(), m_compiled_function);
}
void runtime::gpu::GPU_ExternalFunction::emit_debug_function_entry(
codegen::CodeWriter& writer,
Node* node,
const std::vector<GPU_TensorViewWrapper>& in,
const std::vector<GPU_TensorViewWrapper>& out)
void runtime::gpu::GPU_ExternalFunction::emit_debug_function_entry(Node* node)
{
writer << "timers[" << m_name_index_map[node->get_name()] << "].start();\n";
if (m_emit_timing)
{
m_writer << "timers[" << m_name_index_map[node->get_name()] << "].start();\n";
}
}
void runtime::gpu::GPU_ExternalFunction::emit_debug_function_exit(
codegen::CodeWriter& writer,
Node* node,
const std::vector<GPU_TensorViewWrapper>& in,
const std::vector<GPU_TensorViewWrapper>& out)
void runtime::gpu::GPU_ExternalFunction::emit_debug_function_exit(Node* node)
{
writer << "timers[" << m_name_index_map[node->get_name()] << "].stop();\n";
if (m_emit_timing)
{
m_writer << "timers[" << m_name_index_map[node->get_name()] << "].stop();\n";
}
}
std::unique_ptr<runtime::gpu::GPURuntimeContext>& runtime::gpu::GPU_ExternalFunction::ctx()
unique_ptr<runtime::gpu::GPURuntimeContext>& runtime::gpu::GPU_ExternalFunction::ctx()
{
return m_ctx;
}
bool runtime::gpu::GPU_ExternalFunction::is_functionally_identical(
const Node& n1, const Node& n2, const unordered_map<const Node*, string>& node_cache) const
{
return node_cache.at(&n1) == node_cache.at(&n2);
}
string runtime::gpu::GPU_ExternalFunction::emit_op_as_function(const Node& node,
const string& function_name)
{
@@ -26,6 +26,12 @@
#include "ngraph/codegen/compiler.hpp"
#include "ngraph/codegen/execution_engine.hpp"
#include "ngraph/function.hpp"
#include "ngraph/pass/assign_layout.hpp"
#include "ngraph/pass/dump_sorted.hpp"
#include "ngraph/pass/liveness.hpp"
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/memory_layout.hpp"
#include "ngraph/pass/result_copy_elimination.hpp"
#include "ngraph/runtime/gpu/gpu_call_frame.hpp"
#include "ngraph/runtime/gpu/gpu_primitive_emitter.hpp"
#include "ngraph/runtime/gpu/gpu_tensor_view_wrapper.hpp"
@@ -58,6 +64,7 @@ namespace ngraph
GPU_ExternalFunction(const std::shared_ptr<ngraph::Function>& function,
bool release_function = true);
~GPU_ExternalFunction();
std::shared_ptr<ngraph::runtime::gpu::GPU_CallFrame> make_call_frame();
std::unique_ptr<runtime::gpu::GPURuntimeContext>& ctx();
const std::unique_ptr<GPUPrimitiveEmitter>& get_primitive_emitter() const
@@ -71,39 +78,46 @@ namespace ngraph
EntryPoint m_compiled_function;
private:
void emit_debug_function_entry(codegen::CodeWriter& writer,
Node* node,
const std::vector<GPU_TensorViewWrapper>& in,
const std::vector<GPU_TensorViewWrapper>& out);
void emit_debug_function_exit(codegen::CodeWriter& writer,
Node* node,
const std::vector<GPU_TensorViewWrapper>& in,
const std::vector<GPU_TensorViewWrapper>& out);
void handle_output_alias(
codegen::CodeWriter& writer,
const Node&,
const std::unordered_map<descriptor::TensorView*, std::vector<size_t>>&);
void collect_unique_functions();
void emit_header();
void emit_timer_functions();
void emit_constant_declarations();
void emit_function_declarations();
void emit_functions();
void emit_debug_function_entry(Node* node);
void emit_debug_function_exit(Node* node);
void emit_temp_mem_pool_allocation(std::shared_ptr<Function> current_function);
void emit_temp_mem_pool_release();
void release_function() { m_function = nullptr; }
void store_emitted_functions(const std::string& code);
std::string emit_op_as_function(const Node& node, const std::string& function_name);
std::string strip_comments(const std::string& s) const;
bool is_functionally_identical(
const Node& n1,
const Node& n2,
const std::unordered_map<const Node*, std::string>& node_cache) const;
codegen::CodeWriter m_writer;
pass::Manager m_pass_manager;
std::unique_ptr<codegen::Compiler> m_compiler;
std::unique_ptr<codegen::ExecutionEngine> m_execution_engine;
bool m_emit_timing;
std::unordered_map<std::string, std::string> m_variable_name_map;
std::map<std::string, size_t> m_name_index_map;
std::unique_ptr<GPUPrimitiveEmitter> m_primitive_emitter;
std::unique_ptr<GPURuntimeContext> m_ctx;
std::shared_ptr<ngraph::Function> m_function;
bool m_release_function;
std::map<std::string, size_t> m_name_index_map;
std::unordered_map<std::string, std::string> m_variable_name_map;
std::unordered_map<const Node*, std::string> m_node_function_map;
std::unordered_map<std::shared_ptr<Function>, std::list<std::shared_ptr<Node>>>
m_function_ordered_ops;
bool m_emit_timing;
bool m_is_compiled;
bool m_release_function;
bool m_temporaries_used;
std::string m_function_name;
std::string m_pch_header_source;
cublasHandle_t m_cublas_handle;
cudnnHandle_t m_cudnn_handle;
std::unique_ptr<GPUPrimitiveEmitter> m_primitive_emitter;
std::unique_ptr<GPURuntimeContext> m_ctx;
};
}
}