Commit b69f0734 authored by Fenglei, committed by Nick Korovaiko

add gpu timer (#1143)

* add gpu_timer to external function

* compiled version

* working version

* using block_begin and block_end

* add the missing ';'
parent 4db318a3
...@@ -48,13 +48,13 @@ shared_ptr<runtime::TensorView> runtime::gpu::GPU_Backend::create_tensor( ...@@ -48,13 +48,13 @@ shared_ptr<runtime::TensorView> runtime::gpu::GPU_Backend::create_tensor(
bool runtime::gpu::GPU_Backend::compile(shared_ptr<Function> func) bool runtime::gpu::GPU_Backend::compile(shared_ptr<Function> func)
{ {
if (!contains_key(m_function_map, func)) FunctionInstance& instance = m_function_map[func];
if (instance.m_external_function == nullptr)
{ {
FunctionInstance instance;
instance.m_external_function = make_shared<GPU_ExternalFunction>(func); instance.m_external_function = make_shared<GPU_ExternalFunction>(func);
instance.m_external_function->m_emit_timing = instance.m_performance_counters_enabled;
auto cf = instance.m_external_function->make_call_frame(); auto cf = instance.m_external_function->make_call_frame();
instance.m_call_frame = dynamic_pointer_cast<GPU_CallFrame>(cf); instance.m_call_frame = dynamic_pointer_cast<GPU_CallFrame>(cf);
m_function_map.insert({func, instance});
} }
return true; return true;
} }
...@@ -67,20 +67,62 @@ bool runtime::gpu::GPU_Backend::call(shared_ptr<Function> func, ...@@ -67,20 +67,62 @@ bool runtime::gpu::GPU_Backend::call(shared_ptr<Function> func,
validate_call(func, outputs, inputs); validate_call(func, outputs, inputs);
auto it = m_function_map.find(func); FunctionInstance& instance = m_function_map[func];
if (it == m_function_map.end()) if (instance.m_external_function == nullptr)
{ {
compile(func); rc = compile(func);
it = m_function_map.find(func);
} }
if (it == m_function_map.end()) instance.m_call_frame->call(outputs, inputs);
return rc;
}
// Removes the entry registered for `func` from m_function_map.
// When no entry exists for `func`, the map is left untouched.
void runtime::gpu::GPU_Backend::remove_compiled_function(shared_ptr<Function> func)
{
    const auto entry = m_function_map.find(func);
    if (entry != m_function_map.end())
    {
        m_function_map.erase(entry);
    }
}
// Records whether performance counters are requested for `func`.
//
// func:   the function whose FunctionInstance entry is updated.
// enable: value stored in the entry's m_performance_counters_enabled flag.
//
// Throws runtime_error when the entry already has a non-null
// m_external_function; otherwise the flag is simply overwritten.
void runtime::gpu::GPU_Backend::enable_performance_data(shared_ptr<Function> func, bool enable)
{
// std::map::operator[] inserts a default-constructed FunctionInstance when
// `func` has no entry yet, so the flag can be stored before anything else is.
FunctionInstance& instance = m_function_map[func];
if (instance.m_external_function != nullptr)
// NOTE(review): the doubled braces and the "Error constructing backend."
// throw below look like left-column residue from a side-by-side diff
// rendering rather than part of this function - confirm against the real file.
{ {
throw runtime_error("Error constructing backend."); throw runtime_error("Performance data collection must be enabled prior to compiling.");
} }
instance.m_performance_counters_enabled = enable;
}
FunctionInstance& instance = it->second; vector<runtime::PerformanceCounter>
instance.m_call_frame->call(outputs, inputs); runtime::gpu::GPU_Backend::get_performance_data(shared_ptr<Function> func) const
{
std::vector<runtime::PerformanceCounter> rc;
auto it = m_function_map.find(func);
if (it != m_function_map.end())
{
const FunctionInstance& instance = it->second;
if (instance.m_external_function != nullptr)
{
auto* engine = instance.m_external_function->m_execution_engine.get();
if (engine)
{
auto get_count = engine->find_function<size_t()>("get_debug_timer_count");
auto get_name = engine->find_function<const char*(size_t)>("get_debug_timer_name");
auto get_microseconds =
engine->find_function<size_t(size_t)>("get_debug_timer_microseconds");
auto get_call_count =
engine->find_function<size_t(size_t)>("get_debug_timer_call_count");
if (get_count && get_name && get_microseconds && get_call_count)
{
size_t count = get_count();
for (size_t i = 0; i < count; i++)
{
rc.push_back({get_name(i), get_microseconds(i), get_call_count(i)});
}
}
}
}
}
return rc; return rc;
} }
...@@ -54,12 +54,18 @@ namespace ngraph ...@@ -54,12 +54,18 @@ namespace ngraph
const std::vector<std::shared_ptr<runtime::TensorView>>& outputs, const std::vector<std::shared_ptr<runtime::TensorView>>& outputs,
const std::vector<std::shared_ptr<runtime::TensorView>>& inputs) override; const std::vector<std::shared_ptr<runtime::TensorView>>& inputs) override;
void remove_compiled_function(std::shared_ptr<Function> func) override;
void enable_performance_data(std::shared_ptr<Function> func, bool enable) override;
std::vector<PerformanceCounter>
get_performance_data(std::shared_ptr<Function> func) const override;
private: private:
class FunctionInstance class FunctionInstance
{ {
public: public:
std::shared_ptr<GPU_ExternalFunction> m_external_function; std::shared_ptr<GPU_ExternalFunction> m_external_function;
std::shared_ptr<GPU_CallFrame> m_call_frame; std::shared_ptr<GPU_CallFrame> m_call_frame;
bool m_performance_counters_enabled = false;
}; };
std::map<std::shared_ptr<Function>, FunctionInstance> m_function_map; std::map<std::shared_ptr<Function>, FunctionInstance> m_function_map;
......
...@@ -252,11 +252,10 @@ static const runtime::gpu::OpMap dispatcher{ ...@@ -252,11 +252,10 @@ static const runtime::gpu::OpMap dispatcher{
runtime::gpu::GPU_ExternalFunction::GPU_ExternalFunction( runtime::gpu::GPU_ExternalFunction::GPU_ExternalFunction(
const shared_ptr<ngraph::Function>& function, bool release_function) const shared_ptr<ngraph::Function>& function, bool release_function)
: m_compiled_function(nullptr) : m_compiled_function(nullptr)
, m_emit_timing(std::getenv("NGRAPH_GPU_EMIT_TIMING") != nullptr) , m_emit_timing(false)
, m_function(function) , m_function(function)
, m_release_function(release_function) , m_release_function(release_function)
, m_is_compiled(false) , m_is_compiled(false)
, m_timing(false)
, m_ctx(new GPURuntimeContext) , m_ctx(new GPURuntimeContext)
{ {
// Create context use driver API and make it current, the runtime call will pickup the context // Create context use driver API and make it current, the runtime call will pickup the context
...@@ -311,6 +310,12 @@ void runtime::gpu::GPU_ExternalFunction::compile() ...@@ -311,6 +310,12 @@ void runtime::gpu::GPU_ExternalFunction::compile()
pass_manager.register_pass<pass::DumpSorted>(dump_filename); pass_manager.register_pass<pass::DumpSorted>(dump_filename);
pass_manager.run_passes(m_function); pass_manager.run_passes(m_function);
unordered_map<shared_ptr<Function>, list<shared_ptr<Node>>> function_ordered_ops;
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
{
function_ordered_ops.insert({current_function, current_function->get_ordered_ops()});
}
codegen::CodeWriter writer; codegen::CodeWriter writer;
writer += writer +=
...@@ -356,6 +361,52 @@ using namespace std; ...@@ -356,6 +361,52 @@ using namespace std;
// to register cleanup handlers. We use it, and not atexit(), because // to register cleanup handlers. We use it, and not atexit(), because
// atexit() happens too late, when the JIT is no longer alive // atexit() happens too late, when the JIT is no longer alive
writer << "void *__dso_handle = 0;\n\n"; writer << "void *__dso_handle = 0;\n\n";
if (m_emit_timing)
{
writer << "// Declare debug timers\n";
vector<string> names;
size_t index = 0;
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
{
for (shared_ptr<Node> node : function_ordered_ops.at(current_function))
{
if (!node->is_parameter() && !node->is_constant())
{
names.push_back(node->get_name());
m_name_index_map.insert({node->get_name(), index++});
}
}
}
writer << "ngraph::stopwatch timers[" << names.size() << "];\n";
writer << "extern \"C\" size_t get_debug_timer_count() { return " << names.size()
<< "; }\n";
writer << "extern \"C\" const char* get_debug_timer_name(size_t index)\n";
writer.block_begin();
writer << "static const char* timer_names[" << names.size() << "] =\n";
writer.block_begin();
vector<string> quoted_names;
for (const string& name : names)
{
quoted_names.push_back("\"" + name + "\"");
}
writer << emit_string_array(quoted_names, 100 - (4 * 2 + 1));
writer.indent--;
writer << "\n};\n";
writer << "return timer_names[index];\n";
writer.block_end();
writer << "extern \"C\" const size_t get_debug_timer_microseconds(size_t index)\n";
writer.block_begin();
writer << "return (index < " << names.size()
<< " ? timers[index].get_total_microseconds() : 0);\n";
writer.block_end();
writer << "extern \"C\" const size_t get_debug_timer_call_count(size_t index)\n";
writer.block_begin();
writer << "return (index < " << names.size() << " ? timers[index].get_call_count() : 0);\n";
writer.block_end();
writer << "\n";
}
writer << "// Declare all constants\n"; writer << "// Declare all constants\n";
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions()) for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
{ {
...@@ -769,7 +820,7 @@ void runtime::gpu::GPU_ExternalFunction::emit_debug_function_entry( ...@@ -769,7 +820,7 @@ void runtime::gpu::GPU_ExternalFunction::emit_debug_function_entry(
const std::vector<GPU_TensorViewWrapper>& in, const std::vector<GPU_TensorViewWrapper>& in,
const std::vector<GPU_TensorViewWrapper>& out) const std::vector<GPU_TensorViewWrapper>& out)
{ {
writer << "timer_" << node->get_name() << ".start();\n"; writer << "timers[" << m_name_index_map[node->get_name()] << "].start();\n";
} }
void runtime::gpu::GPU_ExternalFunction::emit_debug_function_exit( void runtime::gpu::GPU_ExternalFunction::emit_debug_function_exit(
...@@ -778,7 +829,7 @@ void runtime::gpu::GPU_ExternalFunction::emit_debug_function_exit( ...@@ -778,7 +829,7 @@ void runtime::gpu::GPU_ExternalFunction::emit_debug_function_exit(
const std::vector<GPU_TensorViewWrapper>& in, const std::vector<GPU_TensorViewWrapper>& in,
const std::vector<GPU_TensorViewWrapper>& out) const std::vector<GPU_TensorViewWrapper>& out)
{ {
writer << "timer_" << node->get_name() << ".stop();\n"; writer << "timers[" << m_name_index_map[node->get_name()] << "].stop();\n";
} }
std::unique_ptr<runtime::gpu::GPURuntimeContext>& runtime::gpu::GPU_ExternalFunction::ctx() std::unique_ptr<runtime::gpu::GPURuntimeContext>& runtime::gpu::GPU_ExternalFunction::ctx()
......
...@@ -52,6 +52,7 @@ namespace ngraph ...@@ -52,6 +52,7 @@ namespace ngraph
class GPU_ExternalFunction : public std::enable_shared_from_this<GPU_ExternalFunction> class GPU_ExternalFunction : public std::enable_shared_from_this<GPU_ExternalFunction>
{ {
friend class GPU_CallFrame; friend class GPU_CallFrame;
friend class GPU_Backend;
public: public:
GPU_ExternalFunction(const std::shared_ptr<ngraph::Function>& function, GPU_ExternalFunction(const std::shared_ptr<ngraph::Function>& function,
...@@ -94,10 +95,10 @@ namespace ngraph ...@@ -94,10 +95,10 @@ namespace ngraph
std::unique_ptr<codegen::ExecutionEngine> m_execution_engine; std::unique_ptr<codegen::ExecutionEngine> m_execution_engine;
bool m_emit_timing; bool m_emit_timing;
std::unordered_map<std::string, std::string> m_variable_name_map; std::unordered_map<std::string, std::string> m_variable_name_map;
std::map<std::string, size_t> m_name_index_map;
std::shared_ptr<ngraph::Function> m_function; std::shared_ptr<ngraph::Function> m_function;
bool m_release_function; bool m_release_function;
bool m_is_compiled; bool m_is_compiled;
bool m_timing;
cublasHandle_t m_cublas_handle; cublasHandle_t m_cublas_handle;
cudnnHandle_t m_cudnn_handle; cudnnHandle_t m_cudnn_handle;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment