submodule / ngraph / Commits / 013c2381

Commit 013c2381
Authored 6 years ago by Sergey Shalnov; committed 6 years ago by Robert Kimball

IntelGPU backend: Separate backend and executable classes (#2447)

Parent: 65141c5f

Showing 7 changed files with 428 additions and 364 deletions (+428 / -364):
src/ngraph/runtime/intelgpu/CMakeLists.txt                    +1    -0
src/ngraph/runtime/intelgpu/intelgpu_backend.cpp              +3    -326
src/ngraph/runtime/intelgpu/intelgpu_backend.hpp              +0    -38
src/ngraph/runtime/intelgpu/intelgpu_executable.cpp           +335  -0
src/ngraph/runtime/intelgpu/intelgpu_executable.hpp           +68   -0
src/ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.cpp    +19   -0
src/ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp    +2    -0
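The substance of the change: IntelGPUBackend now does compilation only, while the new IntelGPUExecutable class owns execution and profiling, completing ngraph's compile()/call() split for this backend. A minimal usage sketch of the resulting API, with assumptions flagged (the "INTELGPU" factory string follows ngraph's backend-registration convention but is not shown in this diff; building the Function and the tensors is elided):

#include <memory>
#include <vector>

#include "ngraph/runtime/backend.hpp"

// Sketch only: how a caller exercises the split this commit introduces.
void run_once(const std::shared_ptr<ngraph::Function>& func,
              const std::vector<std::shared_ptr<ngraph::runtime::Tensor>>& outputs,
              const std::vector<std::shared_ptr<ngraph::runtime::Tensor>>& inputs)
{
    // "INTELGPU" is an assumed registration name, not confirmed by this diff.
    auto backend = ngraph::runtime::Backend::create("INTELGPU");
    auto exec = backend->compile(func); // IntelGPUBackend: build the cldnn::network
    exec->call(outputs, inputs);        // IntelGPUExecutable: run it and profile
}

Note from the diff that compile() measures its own time and memory and passes both into the IntelGPUExecutable constructor, so the first call() can report compile-time statistics alongside its own.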
src/ngraph/runtime/intelgpu/CMakeLists.txt

@@ -16,6 +16,7 @@
 set(SRC
     intelgpu_backend.cpp
+    intelgpu_executable.cpp
     intelgpu_tensor_view.cpp
     intelgpu_layout.cpp
     intelgpu_op_batchnorm.cpp
src/ngraph/runtime/intelgpu/intelgpu_backend.cpp

@@ -15,8 +15,6 @@
 //*****************************************************************************
 
 #include <iomanip>
-#include <sys/resource.h>
-#include <sys/time.h>
 
 #include <CPP/activation.hpp>
 #include <CPP/activation_grad.hpp>

@@ -37,9 +35,7 @@
 #include <CPP/mutable_data.hpp>
 #include <CPP/permute.hpp>
 #include <CPP/pooling.hpp>
-#include <CPP/reorder.hpp>
 #include <CPP/reshape.hpp>
-#include <CPP/scale.hpp>
 #include <CPP/select.hpp>
 #include <CPP/softmax.hpp>
 #include <CPP/topology.hpp>

@@ -51,6 +47,7 @@
 #include "ngraph/pass/nop_elimination.hpp"
 #include "ngraph/pass/reshape_elimination.hpp"
 #include "ngraph/runtime/intelgpu/intelgpu_backend.hpp"
+#include "ngraph/runtime/intelgpu/intelgpu_executable.hpp"
 #include "ngraph/runtime/intelgpu/intelgpu_layout.hpp"
 #include "ngraph/runtime/intelgpu/intelgpu_op_batchnorm.hpp"
 #include "ngraph/runtime/intelgpu/intelgpu_op_broadcast.hpp"

@@ -141,21 +138,6 @@ static void arguments_check(const shared_ptr<Node>& op, size_t input, size_t out
     }
 }
-
-static void memory_size_check(
-    size_t memory_size, const shared_ptr<Node>& node, const string& function_name)
-{
-    const size_t tensor_size = shape_size(node->get_shape()) * node->get_element_type().size();
-
-    if (memory_size != tensor_size)
-    {
-        ostringstream os;
-        os << "IntelGPU backend failed memory check. In \"" << function_name << "\" with Node \""
-           << node->get_name() << "\" and " << node->get_shape() << " mismatched memory sizes "
-           << tensor_size << " and " << memory_size;
-        throw invalid_argument(os.str());
-    }
-}
-
 static const string& get_input_name(const shared_ptr<Node>& op, size_t num = 0)
 {
     return op->get_inputs().at(num).get_tensor().get_name();

@@ -312,22 +294,6 @@ extern "C" void delete_backend(runtime::Backend* backend)
     delete backend;
 }
-
-static size_t get_max_memory_rss()
-{
-    size_t result = 0;
-    struct rusage usage;
-
-    if (getrusage(RUSAGE_SELF, &usage) == 0)
-    {
-        result = usage.ru_maxrss;
-        // the value is in kilobytes,
-        // scale the result to return bytes
-        result *= 1000;
-    }
-
-    return result;
-}
-
 runtime::intelgpu::IntelGPUBackend::IntelGPUBackend()
 {
     bool profiling = false;

@@ -433,7 +399,7 @@ shared_ptr<runtime::Executable>
     if (m_profile_enable)
     {
-        consumed_memory = get_max_memory_rss();
+        consumed_memory = runtime::intelgpu::get_max_memory_rss();
         timer_compile.start();
     }

@@ -1831,7 +1797,7 @@ shared_ptr<runtime::Executable>
     {
         timer_compile.stop();
         compilation_time = timer_compile.get_milliseconds();
-        consumed_memory = get_max_memory_rss() - consumed_memory;
+        consumed_memory = runtime::intelgpu::get_max_memory_rss() - consumed_memory;
     }
 
     rc = make_shared<IntelGPUExecutable>(func,

@@ -1849,102 +1815,6 @@ shared_ptr<runtime::Executable>
     return rc;
 }
 [-96 lines: the IntelGPUExecutable constructor and IntelGPUExecutable::call() are
  deleted here; they reappear in the new intelgpu_executable.cpp below, unchanged
  except that get_max_memory_rss() is called as runtime::intelgpu::get_max_memory_rss()]
 void runtime::intelgpu::IntelGPUBackend::remove_compiled_function(shared_ptr<Executable> exec)
 {
     for (auto it = cldnn_networks.begin(); it != cldnn_networks.end(); ++it)

@@ -1957,199 +1827,6 @@ void runtime::intelgpu::IntelGPUBackend::remove_compiled_function(shared_ptr<Exe
     }
 }
 [-193 lines: convert_cldnn_names(), IntelGPUExecutable::get_performance_data(),
  get_node_by_name() and IntelGPUExecutable::print_call_performance() are deleted
  here; they reappear unchanged in the new intelgpu_executable.cpp below]
 bool runtime::intelgpu::IntelGPUBackend::is_supported_property(const Property prop) const
 {
     if (prop == Property::memory_attach)
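The two one-line hunks above are the visible seam of the move: compile() keeps its time-and-memory bracket, but get_max_memory_rss() is no longer a file-local static and must be reached through its new home in runtime::intelgpu (declared in intelgpu_op_custom_kernels.hpp at the end of this commit). A condensed sketch of that bracket (reconstructed from the hunks with the m_profile_enable guard elided; measure_compile is a hypothetical stand-in for the body of compile()):

#include <utility>

#include "ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp" // declares get_max_memory_rss()
#include "ngraph/util.hpp"                                        // ngraph::stopwatch

// Condensed from the @@ -433 and @@ -1831 hunks; not a verbatim excerpt.
template <typename BuildNetworkFn>
std::pair<double, double> measure_compile(BuildNetworkFn build_network)
{
    double consumed_memory = ngraph::runtime::intelgpu::get_max_memory_rss();
    ngraph::stopwatch timer_compile;
    timer_compile.start();

    build_network(); // translate the ngraph::Function into a cldnn::network

    timer_compile.stop();
    const double compilation_time = timer_compile.get_milliseconds();
    consumed_memory = ngraph::runtime::intelgpu::get_max_memory_rss() - consumed_memory;

    // Both figures are handed to the IntelGPUExecutable constructor, which
    // reports them from print_call_performance() on the first call().
    return {compilation_time, consumed_memory};
}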
src/ngraph/runtime/intelgpu/intelgpu_backend.hpp

@@ -20,7 +20,6 @@
 #include <memory>
 
 #include <CPP/engine.hpp>
-#include <CPP/network.hpp>
 
 #include "ngraph/runtime/backend.hpp"

@@ -31,7 +30,6 @@ namespace ngraph
         namespace intelgpu
         {
             class IntelGPUBackend;
-            class IntelGPUExecutable;
         }
     }
 }

@@ -67,39 +65,3 @@ private:
     bool m_disable_backend_optimizations = false;
     std::string m_cldnn_dump_dir = std::string("intelgpu_codegen");
 };
 [-36 lines: the IntelGPUExecutable class declaration is deleted here; it moves
  unchanged to the new header intelgpu_executable.hpp below]
src/ngraph/runtime/intelgpu/intelgpu_executable.cpp (new file, 0 → 100644)

//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************

#include <iomanip>

#include "ngraph/runtime/intelgpu/intelgpu_executable.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_tensor_view.hpp"
#include "ngraph/util.hpp"

using namespace std;
using namespace ngraph;

static void memory_size_check(
    size_t memory_size, const shared_ptr<Node>& node, const string& function_name)
{
    const size_t tensor_size = shape_size(node->get_shape()) * node->get_element_type().size();

    if (memory_size != tensor_size)
    {
        ostringstream os;
        os << "IntelGPU backend failed memory check. In \"" << function_name << "\" with Node \""
           << node->get_name() << "\" and " << node->get_shape() << " mismatched memory sizes "
           << tensor_size << " and " << memory_size;
        throw invalid_argument(os.str());
    }
}

static const string& get_input_name(const shared_ptr<Node>& op, size_t num = 0)
{
    return op->get_inputs().at(num).get_tensor().get_name();
}

// The cldnn::network contains names like "generic_layer_0_Parameter_254_0".
// This function should return "Parameter_254" from the example above.
static string convert_cldnn_names(shared_ptr<Function> func, const string& cldnn_name)
{
    const string key("_");
    string result;

    const size_t last_key = cldnn_name.rfind(key);
    const size_t pre_last_key = cldnn_name.rfind(key, last_key - 1);
    const size_t pre_pre_last_key = cldnn_name.rfind(key, pre_last_key - 1);

    if (pre_pre_last_key == std::string::npos)
    {
        result = cldnn_name.substr(0, last_key);
    }
    else
    {
        result = cldnn_name.substr(pre_pre_last_key + 1, last_key - pre_pre_last_key - 1);
    }

    return result;
}

runtime::intelgpu::IntelGPUExecutable::IntelGPUExecutable(shared_ptr<Function> func,
                                                          shared_ptr<cldnn::network> network,
                                                          bool enable_timing,
                                                          bool enable_profile,
                                                          double compilation_time,
                                                          double consumed_memory,
                                                          size_t profile_lines_limit_count)
{
    m_function = func;
    m_cldnn_network = network;
    m_performance_counters_enabled = enable_timing;
    m_profile_enable = enable_profile;
    m_compilation_time = compilation_time;
    m_consumed_memory = consumed_memory;
    m_profile_lines_limit_count = profile_lines_limit_count;

    set_parameters_and_results(*func);
}

bool runtime::intelgpu::IntelGPUExecutable::call(const vector<shared_ptr<runtime::Tensor>>& outputs,
                                                 const vector<shared_ptr<runtime::Tensor>>& inputs)
{
    double mem_call_consumed = 0.0f;
    stopwatch timer_call;

    if (m_cldnn_network == nullptr)
    {
        throw runtime_error("compile() must be called before call().");
    }

    if (m_profile_enable)
    {
        mem_call_consumed = runtime::intelgpu::get_max_memory_rss();
        timer_call.start();
    }

    // Process input parameters. Correctness of parameters was validated by validate_call.
    // Since there is no direct correlation between Function::m_parameters and inputs,
    // we match them by index position in the vectors.
    for (size_t i = 0; i < inputs.size(); i++)
    {
        shared_ptr<runtime::intelgpu::IntelGPUTensorView> tv =
            static_pointer_cast<runtime::intelgpu::IntelGPUTensorView>(inputs[i]);
        const ParameterVector& input_params = get_parameters();
        const string& tensor_name = input_params[i]->get_output_tensor().get_name();
        m_cldnn_network->set_input_data(tensor_name, *tv->get_data_ptr());
    }

    // Execute network
    map<cldnn::primitive_id, cldnn::network_output> result = m_cldnn_network->execute();

    // Process output parameters. Correctness of parameters was validated by validate_call.
    // Since there is no direct correlation between Function::m_results and outputs,
    // we match them by index position in the vectors.
    for (size_t i = 0; i < m_function->get_output_size(); i++)
    {
        const shared_ptr<Node>& dst_node = m_function->get_output_op(i);
        const size_t dst_shape_size = shape_size(dst_node->get_shape());

        // We should not touch destination memory if it does not exist
        if (!dst_shape_size)
        {
            continue;
        }

        shared_ptr<runtime::intelgpu::IntelGPUTensorView> ngraph_res =
            static_pointer_cast<runtime::intelgpu::IntelGPUTensorView>(outputs[i]);
        const string& tensor_name = get_input_name(dst_node);

        auto result_memory = result.at(tensor_name).get_memory().pointer<char>();
        memory_size_check(result_memory.size(), dst_node, m_function->get_name());
        ngraph_res->write(result_memory.data(), 0, result_memory.size());
    }

    if (m_profile_enable)
    {
        timer_call.stop();
        mem_call_consumed = runtime::intelgpu::get_max_memory_rss() - mem_call_consumed;

        print_call_performance(m_cldnn_network,
                               m_function,
                               m_compilation_time,
                               timer_call.get_milliseconds(),
                               m_consumed_memory,
                               mem_call_consumed,
                               runtime::intelgpu::get_max_memory_rss());

        // Output compile time only once
        m_compilation_time = 0.0;
        m_consumed_memory = 0.0;
    }

    return true;
}

vector<runtime::PerformanceCounter>
    runtime::intelgpu::IntelGPUExecutable::get_performance_data() const
{
    vector<runtime::PerformanceCounter> rc;
    if (m_cldnn_network != nullptr && m_performance_counters_enabled)
    {
        const map<cldnn::primitive_id, cldnn::event>& primitives =
            m_cldnn_network->get_executed_primitives();
        for (const auto& p : primitives)
        {
            // Generate the primitive name that matches the name in Function
            const string primitive_name = convert_cldnn_names(m_function, p.first);
            size_t usec = 0;
            for (const auto& q : p.second.get_profiling_info())
            {
                if (q.name == string("executing"))
                {
                    usec += chrono::duration_cast<
                                chrono::duration<size_t, chrono::milliseconds::period>>(
                                q.value->value())
                                .count();
                }
            }
            const runtime::PerformanceCounter perf_counter(primitive_name.c_str(), usec, 1);
            rc.push_back(perf_counter);
        }
    }
    return rc;
}

static Node* get_node_by_name(const shared_ptr<Function> func, const string& name)
{
    for (shared_ptr<Node> node : func->get_ops())
    {
        if (node->get_name() == name)
        {
            return node.get();
        }
    }

    return nullptr;
}

void runtime::intelgpu::IntelGPUExecutable::print_call_performance(
    const shared_ptr<cldnn::network> network,
    const shared_ptr<Function> func,
    double time_compile,
    double time_call,
    double mem_compilation_consumed,
    double mem_call_consumed,
    double mem_current) const
{
    struct data_item
    {
        string item_name;
        map<string, double> item_times;
    };
    const string& func_name = func->get_name();
    const map<cldnn::primitive_id, cldnn::event>& primitives = network->get_executed_primitives();
    size_t limit_count = m_profile_lines_limit_count;
    multimap<double, data_item> data;
    map<string, double> total_interval_times;
    double total_executing_time = 0;
    size_t total_items_count = 0;
    size_t max_item_name_size = 0;

    ios_base::fmtflags saved_stream_flags(cout.flags()); // Save stream flags to restore them later

    if (m_profile_lines_limit_count > 0)
    {
        // Extract profiling statistics, calculate the summary and sort
        for (auto& prim : primitives)
        {
            double executing_time = 0;
            data_item item;
            item.item_name = prim.first;
            max_item_name_size = max(max_item_name_size, prim.first.size());

            for (auto& prof_info : prim.second.get_profiling_info())
            {
                const string& interval_name = prof_info.name;
                double interval =
                    chrono::duration_cast<chrono::duration<double, chrono::milliseconds::period>>(
                        prof_info.value->value())
                        .count();

                item.item_times[interval_name] = interval;

                // Get the key time to sort by
                if (interval_name == "executing")
                {
                    executing_time += interval;
                }

                // Accumulate total time for each interval
                if (total_interval_times.find(interval_name) == total_interval_times.end())
                {
                    total_interval_times[interval_name] = interval;
                }
                else
                {
                    total_interval_times[interval_name] += interval;
                }
            }
            data.emplace(executing_time, item);
            total_executing_time += executing_time;
            ++total_items_count;
        }

        // Print statistics for each primitive in the cldnn::network
        for (auto it = data.rbegin(); (it != data.rend()) && (limit_count > 0); ++it, --limit_count)
        {
            const string ngraph_node_name = convert_cldnn_names(func, it->second.item_name);
            const Node* ngraph_node = get_node_by_name(func, ngraph_node_name);

            cout << func_name << delim << setw(max_item_name_size) << it->second.item_name << delim
                 << "time(ms)" << delim << scientific << setprecision(2) << it->first;
            for (auto item : it->second.item_times)
            {
                cout << delim << item.first << "(ms)" << delim << item.second;
            }
            cout << delim << ngraph_node_name;

            if (ngraph_node) // it might be initialized by nullptr
            {
                // print all input shapes for the Node
                size_t arg_idx = 0;
                for (const descriptor::Input& op_input : ngraph_node->get_inputs())
                {
                    cout << delim << op_input.get_element_type().c_type_string() << " input"
                         << arg_idx << vector_to_string(op_input.get_shape());
                    ++arg_idx;
                }

                // print all output shapes for the Node
                arg_idx = 0;
                for (const descriptor::Output& op_output : ngraph_node->get_outputs())
                {
                    cout << delim << op_output.get_element_type().c_type_string() << " output"
                         << arg_idx << vector_to_string(op_output.get_shape());
                    ++arg_idx;
                }
            }
            cout << "\n";
        }

        // Print bottom line summary
        const string total_items_count_string = "Total(cldnn " + to_string(total_items_count) +
                                                ", ngraph " + to_string(func->get_ops().size()) +
                                                ")";
        cout << func_name << delim << setw(max_item_name_size) << total_items_count_string << delim
             << "time(ms)" << delim << scientific << setprecision(2) << total_executing_time;
        for (auto item_times : total_interval_times)
        {
            cout << delim << item_times.first << "(ms)" << delim << item_times.second;
        }
        cout << "\n";
    }

    // Print time and memory consumed by the ::call function
    cout << func_name << delim << " Backend compilation(ms)" << delim << time_compile << delim
         << "call(ms)" << delim << time_call << delim << "memory consumption compile(B)" << delim
         << mem_compilation_consumed << delim << "call(B)" << delim << mem_call_consumed << delim
         << "RSS(B)" << delim << mem_current << endl;

    cout.flags(saved_stream_flags); // Restore stream configuration to leave it in its original state
}
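A note on the name conversion above: the worked example in the comment ("generic_layer_0_Parameter_254_0" → "Parameter_254") is worth tracing, since the three rfind calls are easy to misread. A self-contained sketch re-implementing just that extraction (a hypothetical standalone copy for illustration; extract_node_name is not a name from the commit):

#include <cassert>
#include <iostream>
#include <string>

// Keep the substring between the third-from-last and the last underscore,
// or everything before the last underscore if there are fewer than three.
static std::string extract_node_name(const std::string& cldnn_name)
{
    const size_t last = cldnn_name.rfind('_');
    const size_t pre_last = cldnn_name.rfind('_', last - 1);
    const size_t pre_pre_last = cldnn_name.rfind('_', pre_last - 1);

    if (pre_pre_last == std::string::npos)
    {
        return cldnn_name.substr(0, last); // "Parameter_254_0" -> "Parameter_254"
    }
    return cldnn_name.substr(pre_pre_last + 1, last - pre_pre_last - 1);
}

int main()
{
    // The case described by the comment in the diff:
    assert(extract_node_name("generic_layer_0_Parameter_254_0") == "Parameter_254");
    // The short (npos) branch:
    assert(extract_node_name("Parameter_254_0") == "Parameter_254");
    std::cout << "ok\n";
}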
src/ngraph/runtime/intelgpu/intelgpu_executable.hpp (new file, 0 → 100644)

//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************

#pragma once

#include <CPP/network.hpp>

#include "ngraph/runtime/tensor.hpp"

namespace ngraph
{
    namespace runtime
    {
        namespace intelgpu
        {
            class IntelGPUExecutable;
        }
    }
}

class ngraph::runtime::intelgpu::IntelGPUExecutable : public runtime::Executable
{
public:
    IntelGPUExecutable(std::shared_ptr<Function> func,
                       std::shared_ptr<cldnn::network> network,
                       bool enable_timing,
                       bool enable_profile,
                       double compilation_time,
                       double consumed_memory,
                       size_t profile_lines_limit_count);

    bool call(const std::vector<std::shared_ptr<runtime::Tensor>>& outputs,
              const std::vector<std::shared_ptr<runtime::Tensor>>& inputs) override;

    std::vector<PerformanceCounter> get_performance_data() const override;

private:
    std::shared_ptr<Function> m_function;
    std::shared_ptr<cldnn::network> m_cldnn_network = nullptr;
    bool m_performance_counters_enabled = false;
    bool m_profile_enable = false;
    double m_compilation_time = 0.0;
    double m_consumed_memory = 0.0;
    long m_profile_lines_limit_count = 10;
    std::string delim = std::string(":");

    // Statistic related things
    void print_call_performance(const std::shared_ptr<cldnn::network> network,
                                const std::shared_ptr<Function> func,
                                double time_compile,
                                double time_call,
                                double mem_compilation_consumed,
                                double mem_call_consumed,
                                double mem_current) const;
};
src/ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.cpp

@@ -14,6 +14,9 @@
 // limitations under the License.
 //*****************************************************************************
+#include <sys/resource.h>
+#include <sys/time.h>
+
 #include <CPP/concatenation.hpp>
 #include <CPP/custom_gpu_primitive.hpp>
 #include <CPP/reshape.hpp>

@@ -1515,3 +1518,19 @@ void runtime::intelgpu::do_reshape_operation(cldnn::topology& topology,
                                   {1});
     topology.add(op_reshape);
 }
+
+size_t runtime::intelgpu::get_max_memory_rss()
+{
+    size_t result = 0;
+    struct rusage usage;
+
+    if (getrusage(RUSAGE_SELF, &usage) == 0)
+    {
+        result = usage.ru_maxrss;
+        // the value is in kilobytes,
+        // scale the result to return bytes
+        result *= 1000;
+    }
+
+    return result;
+}
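For reference, a minimal standalone version of the RSS probe added above (assuming Linux/POSIX; on Linux getrusage reports ru_maxrss in kilobytes, which is why the helper scales it, though the factor of 1000 rather than 1024 is an approximation):

#include <sys/resource.h>

#include <cstdio>

int main()
{
    struct rusage usage;
    if (getrusage(RUSAGE_SELF, &usage) == 0)
    {
        // On Linux, ru_maxrss is the peak resident set size in kilobytes
        // (macOS reports bytes instead, so portable code must special-case it).
        std::printf("peak RSS: %ld kB\n", usage.ru_maxrss);
    }
    return 0;
}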
src/ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp

@@ -33,6 +33,8 @@ namespace ngraph
     {
         namespace intelgpu
         {
+            size_t get_max_memory_rss();
+
             void do_pad_operation(cldnn::topology& topology,
                                   const std::string& input_name,
                                   const Shape& input_shape,