Commit 4680678d authored by Sergey Shalnov's avatar Sergey Shalnov Committed by Robert Kimball

IntelGPU backend: minor fixes in statistics (#2300)

parent f79b40a7
......@@ -390,7 +390,14 @@ runtime::intelgpu::IntelGPUBackend::IntelGPUBackend()
m_function_cache_disabled = true;
}
cldnn::engine_configuration cldnn_configuration(profiling);
cldnn::engine_configuration cldnn_configuration(profiling,
false,
m_cldnn_dump_enable,
string(),
string(),
true,
string(),
m_cldnn_dump_dir);
ocl_engine = make_shared<cldnn::engine>(cldnn_configuration);
}
......@@ -419,6 +426,14 @@ runtime::Handle runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function>
set<cldnn::primitive_id> func_output_names;
cldnn::topology topology;
stopwatch timer_compile;
double mem_before_compile = 0.0;
if (m_profile_enable)
{
mem_before_compile = get_max_memory_rss();
timer_compile.start();
}
if (m_dump_graph_enable)
{
......@@ -1808,6 +1823,13 @@ runtime::Handle runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function>
instance.ocl_network =
make_shared<cldnn::network>(*ocl_engine, topology, network_build_options);
if (m_profile_enable)
{
timer_compile.stop();
instance.m_compilation_time = timer_compile.get_milliseconds();
instance.m_consumed_memory = get_max_memory_rss() - mem_before_compile;
}
return func;
}
......@@ -1815,17 +1837,8 @@ bool runtime::intelgpu::IntelGPUBackend::call(shared_ptr<Function> func,
const vector<shared_ptr<runtime::Tensor>>& outputs,
const vector<shared_ptr<runtime::Tensor>>& inputs)
{
double mem_before_call = 0.0f;
double mem_after_compilation = 0.0f;
double mem_after_call = 0.0f;
double mem_call_consumed = 0.0f;
stopwatch timer_call;
stopwatch timer_compile;
if (m_profile_enable)
{
mem_before_call = get_max_memory_rss();
timer_compile.start();
}
FunctionInstance& instance = ocl_networks[func];
if (instance.ocl_network == nullptr)
......@@ -1835,8 +1848,7 @@ bool runtime::intelgpu::IntelGPUBackend::call(shared_ptr<Function> func,
if (m_profile_enable)
{
timer_compile.stop();
mem_after_compilation = get_max_memory_rss();
mem_call_consumed = get_max_memory_rss();
timer_call.start();
}
......@@ -1884,15 +1896,18 @@ bool runtime::intelgpu::IntelGPUBackend::call(shared_ptr<Function> func,
if (m_profile_enable)
{
timer_call.stop();
mem_after_call = get_max_memory_rss();
mem_call_consumed = get_max_memory_rss() - mem_call_consumed;
print_call_performance(network,
func,
timer_compile.get_milliseconds(),
instance.m_compilation_time,
timer_call.get_milliseconds(),
mem_before_call,
mem_after_compilation,
mem_after_call);
instance.m_consumed_memory,
mem_call_consumed,
get_max_memory_rss());
// Output compile time only once
instance.m_compilation_time = 0.0;
}
if (m_function_cache_disabled)
......@@ -1995,11 +2010,11 @@ static Node* get_node_by_name(const shared_ptr<Function> func, const string& nam
void runtime::intelgpu::IntelGPUBackend::print_call_performance(
const shared_ptr<cldnn::network> network,
const shared_ptr<Function> func,
size_t time_compile,
size_t time_call,
double mem_before_call,
double mem_after_compilation,
double mem_after_call) const
double time_compile,
double time_call,
double mem_compilation_consumed,
double mem_call_consumed,
double mem_current) const
{
struct data_item
{
......@@ -2110,10 +2125,10 @@ void runtime::intelgpu::IntelGPUBackend::print_call_performance(
}
// Print time and memory consumed in ::call function
cout << func_name << delim << " Backend compilation(ms)" << delim << time_compile << " call(ms)"
<< delim << time_call << delim << "memory before call(B)" << delim << mem_before_call
<< delim << "after compilation(B)" << delim << mem_after_compilation << delim
<< "after call(B)" << delim << mem_after_call << endl;
cout << func_name << delim << " Backend compilation(ms)" << delim << time_compile << delim
<< "call(ms)" << delim << time_call << delim << "memory consumption compile(B)" << delim
<< mem_compilation_consumed << delim << "call(B)" << delim << mem_call_consumed << delim
<< "RSS(B)" << delim << mem_current << endl;
cout.flags(saved_stream_flags); // Restore stream configuration to leave it in original state
}
......@@ -64,6 +64,8 @@ private:
public:
std::shared_ptr<cldnn::network> ocl_network = nullptr;
bool m_performance_counters_enabled = false;
double m_compilation_time = 0.0;
double m_consumed_memory = 0.0;
};
std::map<std::shared_ptr<Function>, FunctionInstance> ocl_networks;
......@@ -74,11 +76,11 @@ private:
// Statistic related things
void print_call_performance(const std::shared_ptr<cldnn::network> network,
const std::shared_ptr<Function> func,
size_t time_compile,
size_t time_call,
double mem_before_call,
double mem_after_compilation,
double mem_after_call) const;
double time_compile,
double time_call,
double mem_compilation_consumed,
double mem_call_consumed,
double mem_current) const;
bool m_profile_enable = false;
long m_profile_lines_limit_count = 10;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment