......@@ -16,6 +16,7 @@
......@@ -15,8 +15,6 @@
#include <iomanip>
#include <sys/resource.h>
#include <sys/time.h>
#include <CPP/activation.hpp>
#include <CPP/activation_grad.hpp>
......@@ -37,9 +35,7 @@
#include <CPP/mutable_data.hpp>
#include <CPP/permute.hpp>
#include <CPP/pooling.hpp>
#include <CPP/reorder.hpp>
#include <CPP/reshape.hpp>
#include <CPP/scale.hpp>
#include <CPP/select.hpp>
#include <CPP/softmax.hpp>
#include <CPP/topology.hpp>
......@@ -51,6 +47,7 @@
#include "ngraph/pass/nop_elimination.hpp"
#include "ngraph/pass/reshape_elimination.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_backend.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_executable.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_layout.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_batchnorm.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_broadcast.hpp"
......@@ -141,21 +138,6 @@ static void arguments_check(const shared_ptr<Node>& op, size_t input, size_t out
static void
memory_size_check(size_t memory_size, const shared_ptr<Node>& node, const string& function_name)
const size_t tensor_size = shape_size(node->get_shape()) * node->get_element_type().size();
if (memory_size != tensor_size)
ostringstream os;
os << "IntelGPU backend failed memory check. In \"" << function_name << "\" with Node \""
<< node->get_name() << "\" and " << node->get_shape() << " mismatched memory sizes "
<< tensor_size << " and " << memory_size;
throw invalid_argument(os.str());
// Returns the name of the tensor connected to input number `num` (first input by default)
// of the given op. The index is bounds-checked through at().
static const string& get_input_name(const shared_ptr<Node>& op, size_t num = 0)
{
    const auto& selected_input = op->get_inputs().at(num);
    const auto& input_tensor = selected_input.get_tensor();

    return input_tensor.get_name();
}
......@@ -312,22 +294,6 @@ extern "C" void delete_backend(runtime::Backend* backend)
delete backend;
// Returns the peak resident set size (max RSS) of the current process in bytes,
// or 0 when getrusage() fails.
static size_t get_max_memory_rss()
{
    size_t result = 0;
    struct rusage usage;

    if (getrusage(RUSAGE_SELF, &usage) == 0)
    {
        // Linux reports ru_maxrss in kilobytes counted as 1024-byte units,
        // so scale by 1024 (not 1000) to convert the value to bytes.
        result = static_cast<size_t>(usage.ru_maxrss) * 1024;
    }

    return result;
}
bool profiling = false;
......@@ -433,7 +399,7 @@ shared_ptr<runtime::Executable>
if (m_profile_enable)
consumed_memory = get_max_memory_rss();
consumed_memory = runtime::intelgpu::get_max_memory_rss();
......@@ -1831,7 +1797,7 @@ shared_ptr<runtime::Executable>
compilation_time = timer_compile.get_milliseconds();
consumed_memory = get_max_memory_rss() - consumed_memory;
consumed_memory = runtime::intelgpu::get_max_memory_rss() - consumed_memory;
rc = make_shared<IntelGPUExecutable>(func,
......@@ -1849,102 +1815,6 @@ shared_ptr<runtime::Executable>
return rc;
// Wraps an nGraph function together with its cldnn network and the timing/profiling
// settings. Every argument is copied unchanged into the matching member; no other work
// is done here.
runtime::intelgpu::IntelGPUExecutable::IntelGPUExecutable(shared_ptr<Function> func,
                                                          shared_ptr<cldnn::network> network,
                                                          bool enable_timing,
                                                          bool enable_profile,
                                                          double compilation_time,
                                                          double consumed_memory,
                                                          size_t profile_lines_limit_count)
    : m_function(func)
    , m_cldnn_network(network)
    , m_performance_counters_enabled(enable_timing)
    , m_profile_enable(enable_profile)
    , m_compilation_time(compilation_time)
    , m_consumed_memory(consumed_memory)
    , m_profile_lines_limit_count(profile_lines_limit_count)
{
}
// Runs the cldnn network once: binds each input tensor to the network by tensor name,
// executes the network, then writes the results into the output tensors.
// Throws runtime_error when no cldnn network is attached. Returns true.
// When m_profile_enable is set, the max-RSS growth across the call is sampled.
// NOTE(review): several statements in this listing are cut short (the tensor casts, the
// result-memory lookup and the source argument of write()) -- confirm against the full file.
bool runtime::intelgpu::IntelGPUExecutable::call(const vector<shared_ptr<runtime::Tensor>>& outputs,
const vector<shared_ptr<runtime::Tensor>>& inputs)
double mem_call_consumed = 0.0f;
stopwatch timer_call;
// A null network means there is nothing to execute
if (m_cldnn_network == nullptr)
throw runtime_error("compile() must be called before call().");
// Remember max RSS before the call so the difference can be computed afterwards
if (m_profile_enable)
mem_call_consumed = get_max_memory_rss();
// Process input parameters. Correctness of parameters was validated by validate_call.
// There is no explicit correlation between Function::m_parameters and inputs,
// so they are matched by their index in the vectors.
for (size_t i = 0; i < inputs.size(); i++)
shared_ptr<runtime::intelgpu::IntelGPUTensorView> tv =
const ParameterVector& input_params = get_parameters();
const string& tensor_name = input_params[i]->get_output_tensor().get_name();
m_cldnn_network->set_input_data(tensor_name, *tv->get_data_ptr());
// Execute network
map<cldnn::primitive_id, cldnn::network_output> result = m_cldnn_network->execute();
// Process output parameters. Correctness of parameters was validated by validate_call.
// There is no explicit correlation between Function::m_results and outputs,
// so they are matched by their index in the vectors.
for (size_t i = 0; i < m_function->get_output_size(); i++)
const shared_ptr<Node>& dst_node = m_function->get_output_op(i);
const size_t dst_shape_size = shape_size(dst_node->get_shape());
// Do not touch the destination memory when the output has no elements
if (!dst_shape_size)
shared_ptr<runtime::intelgpu::IntelGPUTensorView> ngraph_res =
// The result is looked up under the name of the tensor feeding the output node
const string& tensor_name = get_input_name(dst_node);
auto result_memory =<char>();
// Refuse to copy when the cldnn buffer size differs from what nGraph expects
memory_size_check(result_memory.size(), dst_node, m_function->get_name());
ngraph_res->write(, 0, result_memory.size());
// Turn the earlier sample into the max-RSS growth observed during this call
if (m_profile_enable)
mem_call_consumed = get_max_memory_rss() - mem_call_consumed;
// Output compile time only once: both compile-related figures are zeroed after use
m_compilation_time = 0.0;
m_consumed_memory = 0.0;
return true;
void runtime::intelgpu::IntelGPUBackend::remove_compiled_function(shared_ptr<Executable> exec)
for (auto it = cldnn_networks.begin(); it != cldnn_networks.end(); ++it)
......@@ -1957,199 +1827,6 @@ void runtime::intelgpu::IntelGPUBackend::remove_compiled_function(shared_ptr<Exe
// The cldnn::network contains names like "generic_layer_0_Parameter_254_0".
// This function returns "Parameter_254" for that example: the text between the third
// underscore from the end and the last underscore. When the name has fewer than three
// underscores, everything before the last underscore is returned (the whole name when
// it has none).
// `func` is not used; it is kept so existing callers keep compiling.
static string convert_cldnn_names(shared_ptr<Function> func, const string& cldnn_name)
{
    const string key("_");

    // Finds the last `key` located strictly before `pos`. Guards the `pos - 1` arithmetic:
    // for pos == npos or pos == 0 it would wrap around and restart the search from the
    // end of the string, reporting a separator that does not precede `pos` at all.
    const auto rfind_before = [&cldnn_name, &key](size_t pos) -> size_t {
        if (pos == string::npos || pos == 0)
        {
            return string::npos;
        }
        return cldnn_name.rfind(key, pos - 1);
    };

    const size_t last_key = cldnn_name.rfind(key);
    const size_t pre_last_key = rfind_before(last_key);
    const size_t pre_pre_last_key = rfind_before(pre_last_key);

    if (pre_pre_last_key == string::npos)
    {
        // substr(0, npos) yields the whole name when there is no underscore at all
        return cldnn_name.substr(0, last_key);
    }

    return cldnn_name.substr(pre_pre_last_key + 1, last_key - pre_pre_last_key - 1);
}
// Builds a PerformanceCounter for every entry of `primitives`, using the summed duration
// of its "executing" profiling intervals, and returns the vector `rc`.
// Nothing is gathered unless a cldnn network is attached and performance counters are enabled.
// NOTE(review): this listing is cut short (the source of `primitives`, the interval name
// test and the duration argument are missing) -- confirm against the full file.
runtime::intelgpu::IntelGPUExecutable::get_performance_data() const
vector<runtime::PerformanceCounter> rc;
if (m_cldnn_network != nullptr && m_performance_counters_enabled)
const map<cldnn::primitive_id, cldnn::event>& primitives =
for (const auto& p : primitives)
// Generate the primitive name that matches the node name used in the Function
const string primitive_name = convert_cldnn_names(m_function, p.first);
// NOTE(review): the accumulator is named usec, yet the cast below targets
// chrono::milliseconds::period -- confirm which unit PerformanceCounter expects
size_t usec = 0;
for (const auto& q : p.second.get_profiling_info())
if ( == string("executing"))
usec += chrono::duration_cast<
chrono::duration<size_t, chrono::milliseconds::period>>(
const runtime::PerformanceCounter perf_counter(primitive_name.c_str(), usec, 1);
return rc;
// Looks through the ops of func for the first node whose name equals `name`.
// Returns a non-owning pointer to that node, or nullptr when no op has this name.
static Node* get_node_by_name(const shared_ptr<Function> func, const string& name)
{
    const auto& ops = func->get_ops();

    auto candidate = ops.begin();
    while (candidate != ops.end())
    {
        if ((*candidate)->get_name() == name)
        {
            return candidate->get();
        }
        ++candidate;
    }

    return nullptr;
}
// Prints a profiling report for one executed network to cout: a line per primitive taken
// in descending order of "executing" time (bounded by m_profile_lines_limit_count), a
// totals line, and a final line with the compile/call times and memory figures passed in.
// Fields are separated by `delim`. The format flags of cout are saved on entry and
// restored at the end.
// NOTE(review): this listing looks incomplete -- nothing visible here increments
// total_items_count or arg_idx, and some expressions are cut short. Confirm against the
// full file before relying on these comments.
void runtime::intelgpu::IntelGPUExecutable::print_call_performance(
const shared_ptr<cldnn::network> network,
const shared_ptr<Function> func,
double time_compile,
double time_call,
double mem_compilation_consumed,
double mem_call_consumed,
double mem_current) const
// One report row: a cldnn primitive id plus its profiling intervals keyed by interval name
struct data_item
string item_name;
map<string, double> item_times;
const string& func_name = func->get_name();
const map<cldnn::primitive_id, cldnn::event>& primitives = network->get_executed_primitives();
// Countdown of per-primitive lines that may still be printed
size_t limit_count = m_profile_lines_limit_count;
// Rows keyed by "executing" time; walked in reverse below so the largest key comes first
multimap<double, data_item> data;
// Sum of every interval, per interval name, across all primitives
map<string, double> total_interval_times;
double total_executing_time = 0;
size_t total_items_count = 0;
// Longest primitive id seen; used as the setw() width of the name column
size_t max_item_name_size = 0;
ios_base::fmtflags saved_stream_flags(cout.flags()); // Save stream flags to restore them later
if (m_profile_lines_limit_count > 0)
// Extract profiling statistics, calculate the summary and sort
for (auto& prim : primitives)
double executing_time = 0;
data_item item;
item.item_name = prim.first;
max_item_name_size = max(max_item_name_size, prim.first.size());
for (auto& prof_info : prim.second.get_profiling_info())
const string& interval_name =;
// The interval is converted to a double counted in milliseconds
double interval =
chrono::duration_cast<chrono::duration<double, chrono::milliseconds::period>>(
item.item_times[interval_name] = interval;
// Get the key time to sort by
if (interval_name == "executing")
executing_time += interval;
// Accumulate total time for each interval
// NOTE(review): an `else` between the two statements below seems to be missing from
// this listing; as shown, a first-seen interval would be added twice -- confirm
if (total_interval_times.find(interval_name) == total_interval_times.end())
total_interval_times[interval_name] = interval;
total_interval_times[interval_name] += interval;
data.emplace(executing_time, item);
total_executing_time += executing_time;
// Print statistics for each primitive in the cldnn::network
for (auto it = data.rbegin(); (it != data.rend()) && (limit_count > 0); ++it, --limit_count)
// Map the cldnn primitive id back to an nGraph node name and look the node up
const string ngraph_node_name = convert_cldnn_names(func, it->second.item_name);
const Node* ngraph_node = get_node_by_name(func, ngraph_node_name);
cout << func_name << delim << setw(max_item_name_size) << it->second.item_name << delim
<< "time(ms)" << delim << scientific << setprecision(2) << it->first;
for (auto item : it->second.item_times)
cout << delim << item.first << "(ms)" << delim << item.second;
cout << delim << ngraph_node_name;
if (ngraph_node) // the lookup returns nullptr when no node has this name
// print all input shapes for the Node
size_t arg_idx = 0;
for (const descriptor::Input& op_input : ngraph_node->get_inputs())
cout << delim << op_input.get_element_type().c_type_string() << " input"
<< arg_idx << vector_to_string(op_input.get_shape());
// print all output shapes for the Node
arg_idx = 0;
for (const descriptor::Output& op_output : ngraph_node->get_outputs())
cout << delim << op_output.get_element_type().c_type_string() << " output"
<< arg_idx << vector_to_string(op_output.get_shape());
cout << "\n";
// Print bottom line summary
const string total_items_count_string = "Total(cldnn " + to_string(total_items_count) +
", ngraph " + to_string(func->get_ops().size()) +
cout << func_name << delim << setw(max_item_name_size) << total_items_count_string << delim
<< "time(ms)" << delim << scientific << setprecision(2) << total_executing_time;
for (auto item_times : total_interval_times)
cout << delim << item_times.first << "(ms)" << delim << item_times.second;
cout << "\n";
// Print time and memory consumed in ::call function
cout << func_name << delim << " Backend compilation(ms)" << delim << time_compile << delim
<< "call(ms)" << delim << time_call << delim << "memory consumption compile(B)" << delim
<< mem_compilation_consumed << delim << "call(B)" << delim << mem_call_consumed << delim
<< "RSS(B)" << delim << mem_current << endl;
cout.flags(saved_stream_flags); // Restore stream configuration to leave it in original state
bool runtime::intelgpu::IntelGPUBackend::is_supported_property(const Property prop) const
if (prop == Property::memory_attach)
......@@ -20,7 +20,6 @@
#include <memory>
#include <CPP/engine.hpp>
#include <CPP/network.hpp>
#include "ngraph/runtime/backend.hpp"
......@@ -31,7 +30,6 @@ namespace ngraph
namespace intelgpu
class IntelGPUBackend;
class IntelGPUExecutable;
......@@ -67,39 +65,3 @@ private:
bool m_disable_backend_optimizations = false;
std::string m_cldnn_dump_dir = std::string("intelgpu_codegen");
// Executable of the IntelGPU backend: keeps an nGraph Function together with a
// cldnn::network and the timing/profiling settings handed to the constructor.
// NOTE(review): access specifiers are not visible in this listing.
class ngraph::runtime::intelgpu::IntelGPUExecutable : public runtime::Executable
// Stores each argument in the corresponding member
IntelGPUExecutable(std::shared_ptr<Function> func,
std::shared_ptr<cldnn::network> network,
bool enable_timing,
bool enable_profile,
double compilation_time,
double consumed_memory,
size_t profile_lines_limit_count);
// Executes the network with the given input/output tensors (Executable override)
bool call(const std::vector<std::shared_ptr<runtime::Tensor>>& outputs,
const std::vector<std::shared_ptr<runtime::Tensor>>& inputs) override;
// Returns performance counters for the network's primitives (Executable override)
std::vector<PerformanceCounter> get_performance_data() const override;
// Function this executable was created for
std::shared_ptr<Function> m_function;
// cldnn network to execute; call() throws when it is null
std::shared_ptr<cldnn::network> m_cldnn_network = nullptr;
// Gate checked by get_performance_data() before it gathers anything
bool m_performance_counters_enabled = false;
// Enables the max-RSS sampling done in call()
bool m_profile_enable = false;
// Compile-time figures; call() resets both to 0.0 ("Output compile time only once")
double m_compilation_time = 0.0;
double m_consumed_memory = 0.0;
// Bound on the per-primitive lines printed by print_call_performance()
// NOTE(review): declared long while the constructor parameter is size_t
long m_profile_lines_limit_count = 10;
// Field separator used in the profiling output
std::string delim = std::string(":");
// Statistic related things
void print_call_performance(const std::shared_ptr<cldnn::network> network,
const std::shared_ptr<Function> func,
double time_compile,
double time_call,
double mem_compilation_consumed,
double mem_call_consumed,
double mem_current) const;
// Copyright 2017-2019 Intel Corporation
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iomanip>
#include "ngraph/runtime/intelgpu/intelgpu_executable.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_op_custom_kernels.hpp"
#include "ngraph/runtime/intelgpu/intelgpu_tensor_view.hpp"
#include "ngraph/util.hpp"
using namespace std;
using namespace ngraph;
static void
memory_size_check(size_t memory_size, const shared_ptr<Node>& node, const string& function_name)
const size_t tensor_size = shape_size(node->get_shape()) * node->get_element_type().size();
if (memory_size != tensor_size)
ostringstream os;
os << "IntelGPU backend failed memory check. In \"" << function_name << "\" with Node \""
<< node->get_name() << "\" and " << node->get_shape() << " mismatched memory sizes "
<< tensor_size << " and " << memory_size;
throw invalid_argument(os.str());
// Returns the name of the tensor connected to input number `num` (first input by default)
// of the given op. The index is bounds-checked through at().
static const string& get_input_name(const shared_ptr<Node>& op, size_t num = 0)
{
    const auto& selected_input = op->get_inputs().at(num);
    const auto& input_tensor = selected_input.get_tensor();

    return input_tensor.get_name();
}
// The cldnn::network contains names like "generic_layer_0_Parameter_254_0".
// This function returns "Parameter_254" for that example: the text between the third
// underscore from the end and the last underscore. When the name has fewer than three
// underscores, everything before the last underscore is returned (the whole name when
// it has none).
// `func` is not used; it is kept so existing callers keep compiling.
static string convert_cldnn_names(shared_ptr<Function> func, const string& cldnn_name)
{
    const string key("_");

    // Finds the last `key` located strictly before `pos`. Guards the `pos - 1` arithmetic:
    // for pos == npos or pos == 0 it would wrap around and restart the search from the
    // end of the string, reporting a separator that does not precede `pos` at all.
    const auto rfind_before = [&cldnn_name, &key](size_t pos) -> size_t {
        if (pos == string::npos || pos == 0)
        {
            return string::npos;
        }
        return cldnn_name.rfind(key, pos - 1);
    };

    const size_t last_key = cldnn_name.rfind(key);
    const size_t pre_last_key = rfind_before(last_key);
    const size_t pre_pre_last_key = rfind_before(pre_last_key);

    if (pre_pre_last_key == string::npos)
    {
        // substr(0, npos) yields the whole name when there is no underscore at all
        return cldnn_name.substr(0, last_key);
    }

    return cldnn_name.substr(pre_pre_last_key + 1, last_key - pre_pre_last_key - 1);
}
// Wraps an nGraph function together with its cldnn network and the timing/profiling
// settings. Every argument is copied unchanged into the matching member; no other work
// is done here.
runtime::intelgpu::IntelGPUExecutable::IntelGPUExecutable(shared_ptr<Function> func,
                                                          shared_ptr<cldnn::network> network,
                                                          bool enable_timing,
                                                          bool enable_profile,
                                                          double compilation_time,
                                                          double consumed_memory,
                                                          size_t profile_lines_limit_count)
    : m_function(func)
    , m_cldnn_network(network)
    , m_performance_counters_enabled(enable_timing)
    , m_profile_enable(enable_profile)
    , m_compilation_time(compilation_time)
    , m_consumed_memory(consumed_memory)
    , m_profile_lines_limit_count(profile_lines_limit_count)
{
}
// Runs the cldnn network once: binds each input tensor to the network by tensor name,
// executes the network, then writes the results into the output tensors.
// Throws runtime_error when no cldnn network is attached. Returns true.
// When m_profile_enable is set, the max-RSS growth across the call is sampled.
// NOTE(review): several statements in this listing are cut short (the tensor casts, the
// result-memory lookup and the source argument of write()) -- confirm against the full file.
bool runtime::intelgpu::IntelGPUExecutable::call(const vector<shared_ptr<runtime::Tensor>>& outputs,
const vector<shared_ptr<runtime::Tensor>>& inputs)
double mem_call_consumed = 0.0f;
stopwatch timer_call;
// A null network means there is nothing to execute
if (m_cldnn_network == nullptr)
throw runtime_error("compile() must be called before call().");
// Remember max RSS before the call so the difference can be computed afterwards
if (m_profile_enable)
mem_call_consumed = runtime::intelgpu::get_max_memory_rss();
// Process input parameters. Correctness of parameters was validated by validate_call.
// There is no explicit correlation between Function::m_parameters and inputs,
// so they are matched by their index in the vectors.
for (size_t i = 0; i < inputs.size(); i++)
shared_ptr<runtime::intelgpu::IntelGPUTensorView> tv =
const ParameterVector& input_params = get_parameters();
const string& tensor_name = input_params[i]->get_output_tensor().get_name();
m_cldnn_network->set_input_data(tensor_name, *tv->get_data_ptr());
// Execute network
map<cldnn::primitive_id, cldnn::network_output> result = m_cldnn_network->execute();
// Process output parameters. Correctness of parameters was validated by validate_call.
// There is no explicit correlation between Function::m_results and outputs,
// so they are matched by their index in the vectors.
for (size_t i = 0; i < m_function->get_output_size(); i++)
const shared_ptr<Node>& dst_node = m_function->get_output_op(i);
const size_t dst_shape_size = shape_size(dst_node->get_shape());
// Do not touch the destination memory when the output has no elements
if (!dst_shape_size)
shared_ptr<runtime::intelgpu::IntelGPUTensorView> ngraph_res =
// The result is looked up under the name of the tensor feeding the output node
const string& tensor_name = get_input_name(dst_node);
auto result_memory =<char>();
// Refuse to copy when the cldnn buffer size differs from what nGraph expects
memory_size_check(result_memory.size(), dst_node, m_function->get_name());
ngraph_res->write(, 0, result_memory.size());
// Turn the earlier sample into the max-RSS growth observed during this call
if (m_profile_enable)
mem_call_consumed = runtime::intelgpu::get_max_memory_rss() - mem_call_consumed;
// Output compile time only once: both compile-related figures are zeroed after use
m_compilation_time = 0.0;
m_consumed_memory = 0.0;
return true;
// Builds a PerformanceCounter for every entry of `primitives`, using the summed duration
// of its "executing" profiling intervals, and returns the vector `rc`.
// Nothing is gathered unless a cldnn network is attached and performance counters are enabled.
// NOTE(review): this listing is cut short (the source of `primitives`, the interval name
// test and the duration argument are missing) -- confirm against the full file.
runtime::intelgpu::IntelGPUExecutable::get_performance_data() const
vector<runtime::PerformanceCounter> rc;
if (m_cldnn_network != nullptr && m_performance_counters_enabled)
const map<cldnn::primitive_id, cldnn::event>& primitives =
for (const auto& p : primitives)
// Generate the primitive name that matches the node name used in the Function
const string primitive_name = convert_cldnn_names(m_function, p.first);
// NOTE(review): the accumulator is named usec, yet the cast below targets
// chrono::milliseconds::period -- confirm which unit PerformanceCounter expects
size_t usec = 0;
for (const auto& q : p.second.get_profiling_info())
if ( == string("executing"))
usec += chrono::duration_cast<
chrono::duration<size_t, chrono::milliseconds::period>>(
const runtime::PerformanceCounter perf_counter(primitive_name.c_str(), usec, 1);
return rc;
// Looks through the ops of func for the first node whose name equals `name`.
// Returns a non-owning pointer to that node, or nullptr when no op has this name.
static Node* get_node_by_name(const shared_ptr<Function> func, const string& name)
{
    const auto& ops = func->get_ops();

    auto candidate = ops.begin();
    while (candidate != ops.end())
    {
        if ((*candidate)->get_name() == name)
        {
            return candidate->get();
        }
        ++candidate;
    }

    return nullptr;
}
// Prints a profiling report for one executed network to cout: a line per primitive taken
// in descending order of "executing" time (bounded by m_profile_lines_limit_count), a
// totals line, and a final line with the compile/call times and memory figures passed in.
// Fields are separated by `delim`. The format flags of cout are saved on entry and
// restored at the end.
// NOTE(review): this listing looks incomplete -- nothing visible here increments
// total_items_count or arg_idx, and some expressions are cut short. Confirm against the
// full file before relying on these comments.
void runtime::intelgpu::IntelGPUExecutable::print_call_performance(
const shared_ptr<cldnn::network> network,
const shared_ptr<Function> func,
double time_compile,
double time_call,
double mem_compilation_consumed,
double mem_call_consumed,
double mem_current) const
// One report row: a cldnn primitive id plus its profiling intervals keyed by interval name
struct data_item
string item_name;
map<string, double> item_times;
const string& func_name = func->get_name();
const map<cldnn::primitive_id, cldnn::event>& primitives = network->get_executed_primitives();
// Countdown of per-primitive lines that may still be printed
size_t limit_count = m_profile_lines_limit_count;
// Rows keyed by "executing" time; walked in reverse below so the largest key comes first
multimap<double, data_item> data;
// Sum of every interval, per interval name, across all primitives
map<string, double> total_interval_times;
double total_executing_time = 0;
size_t total_items_count = 0;
// Longest primitive id seen; used as the setw() width of the name column
size_t max_item_name_size = 0;
ios_base::fmtflags saved_stream_flags(cout.flags()); // Save stream flags to restore them later
if (m_profile_lines_limit_count > 0)
// Extract profiling statistics, calculate the summary and sort
for (auto& prim : primitives)
double executing_time = 0;
data_item item;
item.item_name = prim.first;
max_item_name_size = max(max_item_name_size, prim.first.size());
for (auto& prof_info : prim.second.get_profiling_info())
const string& interval_name =;
// The interval is converted to a double counted in milliseconds
double interval =
chrono::duration_cast<chrono::duration<double, chrono::milliseconds::period>>(
item.item_times[interval_name] = interval;
// Get the key time to sort by
if (interval_name == "executing")
executing_time += interval;
// Accumulate total time for each interval
// NOTE(review): an `else` between the two statements below seems to be missing from
// this listing; as shown, a first-seen interval would be added twice -- confirm
if (total_interval_times.find(interval_name) == total_interval_times.end())
total_interval_times[interval_name] = interval;
total_interval_times[interval_name] += interval;
data.emplace(executing_time, item);
total_executing_time += executing_time;
// Print statistics for each primitive in the cldnn::network
for (auto it = data.rbegin(); (it != data.rend()) && (limit_count > 0); ++it, --limit_count)
// Map the cldnn primitive id back to an nGraph node name and look the node up
const string ngraph_node_name = convert_cldnn_names(func, it->second.item_name);
const Node* ngraph_node = get_node_by_name(func, ngraph_node_name);
cout << func_name << delim << setw(max_item_name_size) << it->second.item_name << delim
<< "time(ms)" << delim << scientific << setprecision(2) << it->first;
for (auto item : it->second.item_times)
cout << delim << item.first << "(ms)" << delim << item.second;
cout << delim << ngraph_node_name;
if (ngraph_node) // the lookup returns nullptr when no node has this name
// print all input shapes for the Node
size_t arg_idx = 0;
for (const descriptor::Input& op_input : ngraph_node->get_inputs())
cout << delim << op_input.get_element_type().c_type_string() << " input"
<< arg_idx << vector_to_string(op_input.get_shape());
// print all output shapes for the Node
arg_idx = 0;
for (const descriptor::Output& op_output : ngraph_node->get_outputs())
cout << delim << op_output.get_element_type().c_type_string() << " output"
<< arg_idx << vector_to_string(op_output.get_shape());
cout << "\n";
// Print bottom line summary
const string total_items_count_string = "Total(cldnn " + to_string(total_items_count) +
", ngraph " + to_string(func->get_ops().size()) +
cout << func_name << delim << setw(max_item_name_size) << total_items_count_string << delim
<< "time(ms)" << delim << scientific << setprecision(2) << total_executing_time;
for (auto item_times : total_interval_times)
cout << delim << item_times.first << "(ms)" << delim << item_times.second;
cout << "\n";
// Print time and memory consumed in ::call function
cout << func_name << delim << " Backend compilation(ms)" << delim << time_compile << delim
<< "call(ms)" << delim << time_call << delim << "memory consumption compile(B)" << delim
<< mem_compilation_consumed << delim << "call(B)" << delim << mem_call_consumed << delim
<< "RSS(B)" << delim << mem_current << endl;
cout.flags(saved_stream_flags); // Restore stream configuration to leave it in original state
// Copyright 2017-2019 Intel Corporation
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <CPP/network.hpp>
#include "ngraph/runtime/tensor.hpp"
namespace ngraph
namespace runtime
namespace intelgpu
class IntelGPUExecutable;
// Executable of the IntelGPU backend: keeps an nGraph Function together with a
// cldnn::network and the timing/profiling settings handed to the constructor.
// NOTE(review): access specifiers are not visible in this listing.
class ngraph::runtime::intelgpu::IntelGPUExecutable : public runtime::Executable
// Stores each argument in the corresponding member
IntelGPUExecutable(std::shared_ptr<Function> func,
std::shared_ptr<cldnn::network> network,
bool enable_timing,
bool enable_profile,
double compilation_time,
double consumed_memory,
size_t profile_lines_limit_count);
// Executes the network with the given input/output tensors (Executable override)
bool call(const std::vector<std::shared_ptr<runtime::Tensor>>& outputs,
const std::vector<std::shared_ptr<runtime::Tensor>>& inputs) override;
// Returns performance counters for the network's primitives (Executable override)
std::vector<PerformanceCounter> get_performance_data() const override;
// Function this executable was created for
std::shared_ptr<Function> m_function;
// cldnn network to execute; call() throws when it is null
std::shared_ptr<cldnn::network> m_cldnn_network = nullptr;
// Gate checked by get_performance_data() before it gathers anything
bool m_performance_counters_enabled = false;
// Enables the max-RSS sampling done in call()
bool m_profile_enable = false;
// Compile-time figures; call() resets both to 0.0 ("Output compile time only once")
double m_compilation_time = 0.0;
double m_consumed_memory = 0.0;
// Bound on the per-primitive lines printed by print_call_performance()
// NOTE(review): declared long while the constructor parameter is size_t
long m_profile_lines_limit_count = 10;
// Field separator used in the profiling output
std::string delim = std::string(":");
// Statistic related things
void print_call_performance(const std::shared_ptr<cldnn::network> network,
const std::shared_ptr<Function> func,
double time_compile,
double time_call,
double mem_compilation_consumed,
double mem_call_consumed,
double mem_current) const;
......@@ -14,6 +14,9 @@
// limitations under the License.
#include <sys/resource.h>
#include <sys/time.h>
#include <CPP/concatenation.hpp>
#include <CPP/custom_gpu_primitive.hpp>
#include <CPP/reshape.hpp>
......@@ -1515,3 +1518,19 @@ void runtime::intelgpu::do_reshape_operation(cldnn::topology& topology,
// Returns the peak resident set size (max RSS) of the current process in bytes,
// or 0 when getrusage() fails.
size_t runtime::intelgpu::get_max_memory_rss()
{
    size_t result = 0;
    struct rusage usage;

    if (getrusage(RUSAGE_SELF, &usage) == 0)
    {
        // Linux reports ru_maxrss in kilobytes counted as 1024-byte units,
        // so scale by 1024 (not 1000) to convert the value to bytes.
        result = static_cast<size_t>(usage.ru_maxrss) * 1024;
    }

    return result;
}
......@@ -33,6 +33,8 @@ namespace ngraph
namespace intelgpu
size_t get_max_memory_rss();
void do_pad_operation(cldnn::topology& topology,
const std::string& input_name,
const Shape& input_shape,
