Commit b69f0734 authored by Fenglei, committed by Nick Korovaiko

add gpu timer (#1143)

* add gpu_timer to external function

* compiled version

* working version

* using block_begin and block_end

* add the missing ';'
parent 4db318a3
@@ -48,13 +48,13 @@ shared_ptr<runtime::TensorView> runtime::gpu::GPU_Backend::create_tensor(
 bool runtime::gpu::GPU_Backend::compile(shared_ptr<Function> func)
 {
-    if (!contains_key(m_function_map, func))
+    FunctionInstance& instance = m_function_map[func];
+    if (instance.m_external_function == nullptr)
     {
-        FunctionInstance instance;
         instance.m_external_function = make_shared<GPU_ExternalFunction>(func);
+        instance.m_external_function->m_emit_timing = instance.m_performance_counters_enabled;
         auto cf = instance.m_external_function->make_call_frame();
         instance.m_call_frame = dynamic_pointer_cast<GPU_CallFrame>(cf);
-        m_function_map.insert({func, instance});
     }
     return true;
 }
@@ -67,20 +67,62 @@ bool runtime::gpu::GPU_Backend::call(shared_ptr<Function> func,
     validate_call(func, outputs, inputs);

-    auto it = m_function_map.find(func);
-    if (it == m_function_map.end())
+    FunctionInstance& instance = m_function_map[func];
+    if (instance.m_external_function == nullptr)
     {
-        compile(func);
-        it = m_function_map.find(func);
+        rc = compile(func);
     }

-    if (it == m_function_map.end())
-    {
-        throw runtime_error("Error constructing backend.");
-    }
-
-    FunctionInstance& instance = it->second;
-    instance.m_call_frame->call(outputs, inputs);
-
+    instance.m_call_frame->call(outputs, inputs);
     return rc;
 }

 void runtime::gpu::GPU_Backend::remove_compiled_function(shared_ptr<Function> func)
 {
     m_function_map.erase(func);
 }
+
+void runtime::gpu::GPU_Backend::enable_performance_data(shared_ptr<Function> func, bool enable)
+{
+    FunctionInstance& instance = m_function_map[func];
+    if (instance.m_external_function != nullptr)
+    {
+        throw runtime_error("Performance data collection must be enabled prior to compiling.");
+    }
+    instance.m_performance_counters_enabled = enable;
+}
+
+vector<runtime::PerformanceCounter>
+    runtime::gpu::GPU_Backend::get_performance_data(shared_ptr<Function> func) const
+{
+    std::vector<runtime::PerformanceCounter> rc;
+    auto it = m_function_map.find(func);
+    if (it != m_function_map.end())
+    {
+        const FunctionInstance& instance = it->second;
+        if (instance.m_external_function != nullptr)
+        {
+            auto* engine = instance.m_external_function->m_execution_engine.get();
+            if (engine)
+            {
+                auto get_count = engine->find_function<size_t()>("get_debug_timer_count");
+                auto get_name = engine->find_function<const char*(size_t)>("get_debug_timer_name");
+                auto get_microseconds =
+                    engine->find_function<size_t(size_t)>("get_debug_timer_microseconds");
+                auto get_call_count =
+                    engine->find_function<size_t(size_t)>("get_debug_timer_call_count");
+                if (get_count && get_name && get_microseconds && get_call_count)
+                {
+                    size_t count = get_count();
+                    for (size_t i = 0; i < count; i++)
+                    {
+                        rc.push_back({get_name(i), get_microseconds(i), get_call_count(i)});
+                    }
+                }
+            }
+        }
+    }
+    return rc;
+}
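For context, a minimal caller-side sketch of how the new counters are meant to be used (this code is not part of the commit; the backend factory call, header path, and PerformanceCounter accessor names are assumptions). Note that enable_performance_data() must precede compile(), which is why compile() now copies m_performance_counters_enabled into the external function's m_emit_timing.

// Hedged usage sketch, not part of the commit. Assumes `f`, `inputs`, and `outputs`
// are prepared elsewhere, that the backend is obtained via runtime::Backend::create("GPU"),
// and that PerformanceCounter exposes name(), total_microseconds(), and call_count().
#include <iostream>
#include <memory>
#include <vector>

#include "ngraph/runtime/backend.hpp"

void profile(const std::shared_ptr<ngraph::Function>& f,
             const std::vector<std::shared_ptr<ngraph::runtime::TensorView>>& outputs,
             const std::vector<std::shared_ptr<ngraph::runtime::TensorView>>& inputs)
{
    auto backend = ngraph::runtime::Backend::create("GPU");

    // Must be called before compile(); enable_performance_data() throws once an
    // external function has already been built for `f`.
    backend->enable_performance_data(f, true);
    backend->compile(f);
    backend->call(f, outputs, inputs);

    // Each counter corresponds to one emitted timer, i.e. one non-parameter,
    // non-constant node in the compiled function.
    for (const auto& pc : backend->get_performance_data(f))
    {
        std::cout << pc.name() << ": " << pc.total_microseconds() << " us over "
                  << pc.call_count() << " call(s)\n";
    }
}

Routing the flag through FunctionInstance replaces the old NGRAPH_GPU_EMIT_TIMING environment variable, so timing is now opted into per function rather than process-wide.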
@@ -54,12 +54,18 @@ namespace ngraph
                           const std::vector<std::shared_ptr<runtime::TensorView>>& outputs,
                           const std::vector<std::shared_ptr<runtime::TensorView>>& inputs) override;

                 void remove_compiled_function(std::shared_ptr<Function> func) override;
+                void enable_performance_data(std::shared_ptr<Function> func, bool enable) override;
+                std::vector<PerformanceCounter>
+                    get_performance_data(std::shared_ptr<Function> func) const override;

             private:
                 class FunctionInstance
                 {
                 public:
                     std::shared_ptr<GPU_ExternalFunction> m_external_function;
                     std::shared_ptr<GPU_CallFrame> m_call_frame;
+                    bool m_performance_counters_enabled = false;
                 };

                 std::map<std::shared_ptr<Function>, FunctionInstance> m_function_map;
@@ -252,11 +252,10 @@ static const runtime::gpu::OpMap dispatcher{
 runtime::gpu::GPU_ExternalFunction::GPU_ExternalFunction(
     const shared_ptr<ngraph::Function>& function, bool release_function)
     : m_compiled_function(nullptr)
-    , m_emit_timing(std::getenv("NGRAPH_GPU_EMIT_TIMING") != nullptr)
+    , m_emit_timing(false)
     , m_function(function)
     , m_release_function(release_function)
     , m_is_compiled(false)
-    , m_timing(false)
     , m_ctx(new GPURuntimeContext)
 {
     // Create context use driver API and make it current, the runtime call will pickup the context
@@ -311,6 +310,12 @@ void runtime::gpu::GPU_ExternalFunction::compile()
     pass_manager.register_pass<pass::DumpSorted>(dump_filename);
     pass_manager.run_passes(m_function);

+    unordered_map<shared_ptr<Function>, list<shared_ptr<Node>>> function_ordered_ops;
+    for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
+    {
+        function_ordered_ops.insert({current_function, current_function->get_ordered_ops()});
+    }
+
     codegen::CodeWriter writer;

     writer +=
@@ -356,6 +361,52 @@ using namespace std;
     // to register cleanup handlers. We use it, and not atexit(), because
     // atexit() happens too late, when the JIT is no longer alive
     writer << "void *__dso_handle = 0;\n\n";

+    if (m_emit_timing)
+    {
+        writer << "// Declare debug timers\n";
+        vector<string> names;
+        size_t index = 0;
+        for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
+        {
+            for (shared_ptr<Node> node : function_ordered_ops.at(current_function))
+            {
+                if (!node->is_parameter() && !node->is_constant())
+                {
+                    names.push_back(node->get_name());
+                    m_name_index_map.insert({node->get_name(), index++});
+                }
+            }
+        }
+        writer << "ngraph::stopwatch timers[" << names.size() << "];\n";
+        writer << "extern \"C\" size_t get_debug_timer_count() { return " << names.size()
+               << "; }\n";
+        writer << "extern \"C\" const char* get_debug_timer_name(size_t index)\n";
+        writer.block_begin();
+        writer << "static const char* timer_names[" << names.size() << "] =\n";
+        writer.block_begin();
+        vector<string> quoted_names;
+        for (const string& name : names)
+        {
+            quoted_names.push_back("\"" + name + "\"");
+        }
+        writer << emit_string_array(quoted_names, 100 - (4 * 2 + 1));
+        writer.indent--;
+        writer << "\n};\n";
+        writer << "return timer_names[index];\n";
+        writer.block_end();
+        writer << "extern \"C\" const size_t get_debug_timer_microseconds(size_t index)\n";
+        writer.block_begin();
+        writer << "return (index < " << names.size()
+               << " ? timers[index].get_total_microseconds() : 0);\n";
+        writer.block_end();
+        writer << "extern \"C\" const size_t get_debug_timer_call_count(size_t index)\n";
+        writer.block_begin();
+        writer << "return (index < " << names.size() << " ? timers[index].get_call_count() : 0);\n";
+        writer.block_end();
+        writer << "\n";
+    }
     writer << "// Declare all constants\n";
     for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
     {
@@ -769,7 +820,7 @@ void runtime::gpu::GPU_ExternalFunction::emit_debug_function_entry(
     const std::vector<GPU_TensorViewWrapper>& in,
     const std::vector<GPU_TensorViewWrapper>& out)
 {
-    writer << "timer_" << node->get_name() << ".start();\n";
+    writer << "timers[" << m_name_index_map[node->get_name()] << "].start();\n";
 }

 void runtime::gpu::GPU_ExternalFunction::emit_debug_function_exit(
@@ -778,7 +829,7 @@ void runtime::gpu::GPU_ExternalFunction::emit_debug_function_exit(
     const std::vector<GPU_TensorViewWrapper>& in,
     const std::vector<GPU_TensorViewWrapper>& out)
 {
-    writer << "timer_" << node->get_name() << ".stop();\n";
+    writer << "timers[" << m_name_index_map[node->get_name()] << "].stop();\n";
 }

 std::unique_ptr<runtime::gpu::GPURuntimeContext>& runtime::gpu::GPU_ExternalFunction::ctx()
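To make the effect of the emitter changes concrete, the preamble generated into the JIT-compiled source when m_emit_timing is set would look roughly like the sketch below. The node names are hypothetical and only two of the four extern "C" accessors are shown; the structure follows directly from the writer calls above.

// Roughly what the generated source contains for two timed nodes
// ("Add_3" and "Multiply_7" are hypothetical node names).
ngraph::stopwatch timers[2];
extern "C" size_t get_debug_timer_count() { return 2; }
extern "C" const char* get_debug_timer_name(size_t index)
{
    static const char* timer_names[2] =
    {
        "Add_3", "Multiply_7"
    };
    return timer_names[index];
}
extern "C" const size_t get_debug_timer_microseconds(size_t index)
{
    return (index < 2 ? timers[index].get_total_microseconds() : 0);
}
// get_debug_timer_call_count(size_t) follows the same bounds-checked pattern.

// Inside the compiled function, emit_debug_function_entry/exit bracket each
// non-parameter, non-constant op as:
//     timers[0].start();
//     /* emitted kernel launch for Add_3 */
//     timers[0].stop();

Because all timers now live in one statically sized array indexed through m_name_index_map (instead of the per-node timer_<name> stopwatches used previously), get_performance_data() can enumerate them through the extern "C" getters without knowing node names at query time.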
@@ -52,6 +52,7 @@ namespace ngraph
             class GPU_ExternalFunction : public std::enable_shared_from_this<GPU_ExternalFunction>
             {
                 friend class GPU_CallFrame;
+                friend class GPU_Backend;

             public:
                 GPU_ExternalFunction(const std::shared_ptr<ngraph::Function>& function,
@@ -94,10 +95,10 @@ namespace ngraph
                 std::unique_ptr<codegen::ExecutionEngine> m_execution_engine;
                 bool m_emit_timing;
                 std::unordered_map<std::string, std::string> m_variable_name_map;
+                std::map<std::string, size_t> m_name_index_map;
                 std::shared_ptr<ngraph::Function> m_function;
                 bool m_release_function;
                 bool m_is_compiled;
-                bool m_timing;
                 cublasHandle_t m_cublas_handle;
                 cudnnHandle_t m_cudnn_handle;