Commit ea5056fe authored by Sergey Shalnov

IntelGPU backend: fix backend stability (PR2378)

parent 6a777636
@@ -396,7 +396,7 @@ runtime::intelgpu::IntelGPUBackend::IntelGPUBackend()
                                       true,
                                       string(),
                                       m_cldnn_dump_dir);
-    ocl_engine = make_shared<cldnn::engine>(cldnn_configuration);
+    cldnn_engine = make_shared<cldnn::engine>(cldnn_configuration);
 }
 
 shared_ptr<runtime::Tensor>
@@ -404,50 +404,36 @@ shared_ptr<runtime::Tensor>
                                                           const Shape& shape)
 {
     return make_shared<runtime::intelgpu::IntelGPUTensorView>(
-        element_type, shape, *ocl_engine, nullptr, this);
+        element_type, shape, *cldnn_engine, nullptr, this);
 }
 
 shared_ptr<runtime::Tensor> runtime::intelgpu::IntelGPUBackend::create_tensor(
     const element::Type& element_type, const Shape& shape, void* memory_pointer)
 {
     return make_shared<runtime::intelgpu::IntelGPUTensorView>(
-        element_type, shape, *ocl_engine, memory_pointer, this);
+        element_type, shape, *cldnn_engine, memory_pointer, this);
 }
 
 shared_ptr<runtime::Executable>
     runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func, bool enable_timing)
 {
     shared_ptr<runtime::Executable> rc;
-    auto it = ocl_networks.find(func);
-    if (it != ocl_networks.end())
-    {
-        rc = it->second;
-    }
-    else
-    {
-        rc = make_shared<IntelGPUExecutable>(func, enable_timing);
-        if (!m_function_cache_disabled)
-        {
-            ocl_networks.insert({func, rc});
-        }
-    }
-    return rc;
-}
-
-runtime::intelgpu::IntelGPUExecutable::IntelGPUExecutable(shared_ptr<Function> func,
-                                                          bool enable_timing)
-{
-    FunctionInstance& instance = m_function_instance;
-    instance.m_function = func;
+
+    auto it = cldnn_networks.find(func);
+    if (it != cldnn_networks.end())
+    {
+        return it->second;
+    }
 
     set<cldnn::primitive_id> func_output_names;
     cldnn::topology topology;
     stopwatch timer_compile;
-    double mem_before_compile = 0.0;
+    double consumed_memory = 0.0;
+    double compilation_time = 0.0;
 
     if (m_profile_enable)
     {
-        mem_before_compile = get_max_memory_rss();
+        consumed_memory = get_max_memory_rss();
         timer_compile.start();
     }
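In the refactored compile(), a cache hit now returns the stored Executable immediately, and only a miss falls through to topology construction. A minimal sketch of this memoization pattern in isolation (the Function and Executable types below are illustrative stand-ins, not the backend's real classes):

```cpp
#include <map>
#include <memory>

struct Function
{
}; // stand-in for ngraph::Function

struct Executable
{
}; // stand-in for runtime::Executable

// Mirror of the cldnn_networks logic: key the cache by the Function object,
// return the cached artifact on a hit, compile and optionally store on a miss.
std::shared_ptr<Executable> compile_cached(
    std::map<std::shared_ptr<Function>, std::shared_ptr<Executable>>& cache,
    const std::shared_ptr<Function>& func,
    bool cache_disabled)
{
    const auto it = cache.find(func);
    if (it != cache.end())
    {
        return it->second; // early return on a hit, as in the new compile()
    }

    auto exec = std::make_shared<Executable>(); // the real code builds the network here
    if (!cache_disabled)
    {
        cache.insert({func, exec});
    }
    return exec;
}
```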
@@ -1441,7 +1427,8 @@ runtime::intelgpu::IntelGPUExecutable::IntelGPUExecutable(shared_ptr<Function> f
             // Create a memory for mean as mutable_data to treat it as constant
             const cldnn::layout mean_layout = IntelGPULayout::create_cldnn_layout(
                 get_output_type(op, 1), get_output_shape(op, 1));
-            const cldnn::memory mean_mem(cldnn::memory::allocate(*ocl_engine, mean_layout));
+            const cldnn::memory mean_mem(
+                cldnn::memory::allocate(*cldnn_engine, mean_layout));
 
             const cldnn::mutable_data mean_const(mean_name, mean_mem);
             topology.add(mean_const);
@@ -1450,7 +1437,7 @@ runtime::intelgpu::IntelGPUExecutable::IntelGPUExecutable(shared_ptr<Function> f
             const cldnn::layout variance_layout = IntelGPULayout::create_cldnn_layout(
                 get_output_type(op, 2), get_output_shape(op, 2));
             const cldnn::memory variance_mem(
-                cldnn::memory::allocate(*ocl_engine, variance_layout));
+                cldnn::memory::allocate(*cldnn_engine, variance_layout));
             const cldnn::mutable_data variance_const(variance_name, variance_mem);
             topology.add(variance_const);
 
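Mean and variance follow the same allocate-then-wrap steps, so the pattern could be factored into a helper. A sketch built only from the clDNN calls already visible in these hunks (the helper name is hypothetical):

```cpp
// Hypothetical helper: allocate engine memory for a layout and wrap it as
// mutable_data, giving the topology a buffer that kernels can write into
// while the rest of the graph reads it like a constant.
static cldnn::mutable_data make_writable_constant(const cldnn::engine& engine,
                                                  const cldnn::primitive_id& id,
                                                  const cldnn::layout& layout)
{
    const cldnn::memory mem(cldnn::memory::allocate(engine, layout));
    return cldnn::mutable_data(id, mem);
}

// Usage in the spirit of the diff:
//   topology.add(make_writable_constant(*cldnn_engine, mean_name, mean_layout));
```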
@@ -1819,15 +1806,48 @@ runtime::intelgpu::IntelGPUExecutable::IntelGPUExecutable(shared_ptr<Function> f
         network_build_options.set_option(cldnn::build_option::graph_dumps_dir(m_cldnn_dump_dir));
     }
 
-    instance.ocl_network =
-        make_shared<cldnn::network>(*ocl_engine, topology, network_build_options);
+    shared_ptr<cldnn::network> cldnn_network =
+        make_shared<cldnn::network>(*cldnn_engine, topology, network_build_options);
 
     if (m_profile_enable)
     {
         timer_compile.stop();
-        instance.m_compilation_time = timer_compile.get_milliseconds();
-        instance.m_consumed_memory = get_max_memory_rss() - mem_before_compile;
+        compilation_time = timer_compile.get_milliseconds();
+        consumed_memory = get_max_memory_rss() - consumed_memory;
     }
+
+    rc = make_shared<IntelGPUExecutable>(func,
+                                         cldnn_network,
+                                         enable_timing,
+                                         m_profile_enable,
+                                         compilation_time,
+                                         consumed_memory,
+                                         m_profile_lines_limit_count);
+    if (!m_function_cache_disabled)
+    {
+        cldnn_networks.insert({func, rc});
+    }
+
+    return rc;
+}
+
+runtime::intelgpu::IntelGPUExecutable::IntelGPUExecutable(shared_ptr<Function> func,
+                                                          shared_ptr<cldnn::network> network,
+                                                          bool enable_timing,
+                                                          bool enable_profile,
+                                                          double compilation_time,
+                                                          double consumed_memory,
+                                                          size_t profile_lines_limit_count)
+{
+    m_function = func;
+    m_cldnn_network = network;
+    m_performance_counters_enabled = enable_timing;
+    m_profile_enable = enable_profile;
+    m_compilation_time = compilation_time;
+    m_consumed_memory = consumed_memory;
+    m_profile_lines_limit_count = profile_lines_limit_count;
+
+    set_parameters_and_results(*func);
 }
 
 bool runtime::intelgpu::IntelGPUExecutable::call(const vector<shared_ptr<runtime::Tensor>>& outputs,
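The new constructor above assigns members in its body; an equivalent and arguably more idiomatic form uses a member initializer list. A sketch, behaviorally identical to the version in the diff:

```cpp
runtime::intelgpu::IntelGPUExecutable::IntelGPUExecutable(shared_ptr<Function> func,
                                                          shared_ptr<cldnn::network> network,
                                                          bool enable_timing,
                                                          bool enable_profile,
                                                          double compilation_time,
                                                          double consumed_memory,
                                                          size_t profile_lines_limit_count)
    : m_function(func)
    , m_cldnn_network(network)
    , m_performance_counters_enabled(enable_timing)
    , m_profile_enable(enable_profile)
    , m_compilation_time(compilation_time)
    , m_consumed_memory(consumed_memory)
    , m_profile_lines_limit_count(profile_lines_limit_count)
{
    // Wire the executable's parameter and result views to the Function,
    // exactly as the assignment-based version does.
    set_parameters_and_results(*func);
}
```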
@@ -1836,9 +1856,7 @@ bool runtime::intelgpu::IntelGPUExecutable::call(const vector<shared_ptr<runtime
     double mem_call_consumed = 0.0f;
     stopwatch timer_call;
 
-    FunctionInstance& instance = m_function_instance;
-    shared_ptr<Function> func = instance.m_function;
-    if (instance.ocl_network == nullptr)
+    if (m_cldnn_network == nullptr)
     {
         throw runtime_error("compile() must be called before call().");
     }
@@ -1849,8 +1867,6 @@ bool runtime::intelgpu::IntelGPUExecutable::call(const vector<shared_ptr<runtime
         timer_call.start();
     }
 
-    shared_ptr<cldnn::network> network = instance.ocl_network;
-
     // Process input parameters. Correctness of parameters was validated by validate_call.
     // Since there is no correlation between Function::m_parameters and inputs,
     // we try to match them by index number in vectors.
@@ -1860,18 +1876,18 @@ bool runtime::intelgpu::IntelGPUExecutable::call(const vector<shared_ptr<runtime
             static_pointer_cast<runtime::intelgpu::IntelGPUTensorView>(inputs[i]);
         const ParameterVector& input_params = get_parameters();
         const string& tensor_name = input_params[i]->get_output_tensor().get_name();
-        network->set_input_data(tensor_name, *tv->get_data_ptr());
+        m_cldnn_network->set_input_data(tensor_name, *tv->get_data_ptr());
     }
 
     // Execute network
-    map<cldnn::primitive_id, cldnn::network_output> result = network->execute();
+    map<cldnn::primitive_id, cldnn::network_output> result = m_cldnn_network->execute();
 
     // Process output parameters. Correctness of parameters was validated by validate_call.
     // Since there is no correlation between Function::m_results and outputs,
     // we try to match them by index number in vectors.
-    for (size_t i = 0; i < func->get_output_size(); i++)
+    for (size_t i = 0; i < m_function->get_output_size(); i++)
     {
-        const shared_ptr<Node>& dst_node = func->get_output_op(i);
+        const shared_ptr<Node>& dst_node = m_function->get_output_op(i);
         const size_t dst_shape_size = shape_size(dst_node->get_shape());
 
         // We should not touch destination memory if it does not exist
@@ -1885,7 +1901,7 @@ bool runtime::intelgpu::IntelGPUExecutable::call(const vector<shared_ptr<runtime
             const string& tensor_name = get_input_name(dst_node);
             auto result_memory = result.at(tensor_name).get_memory().pointer<char>();
 
-            memory_size_check(result_memory.size(), dst_node, func->get_name());
+            memory_size_check(result_memory.size(), dst_node, m_function->get_name());
 
             ngraph_res->write(result_memory.data(), 0, result_memory.size());
         }
@@ -1895,16 +1911,17 @@ bool runtime::intelgpu::IntelGPUExecutable::call(const vector<shared_ptr<runtime
         timer_call.stop();
         mem_call_consumed = get_max_memory_rss() - mem_call_consumed;
 
-        print_call_performance(network,
-                               func,
-                               instance.m_compilation_time,
+        print_call_performance(m_cldnn_network,
+                               m_function,
+                               m_compilation_time,
                                timer_call.get_milliseconds(),
-                               instance.m_consumed_memory,
+                               m_consumed_memory,
                                mem_call_consumed,
                                get_max_memory_rss());
 
         // Output compile time only once
-        instance.m_compilation_time = 0.0;
+        m_compilation_time = 0.0;
+        m_consumed_memory = 0.0;
     }
 
     return true;
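call() now clears both m_compilation_time and m_consumed_memory after the first report, so compile-time statistics are printed only once per executable. For this read-once pattern, std::exchange is a compact alternative (an illustrative sketch; the helper below is hypothetical and not part of the diff):

```cpp
#include <utility>

// Read and clear in one step: the first call returns the measured value,
// every later call returns 0.0, matching "output compile time only once".
double runtime::intelgpu::IntelGPUExecutable::take_compilation_time()
{
    return std::exchange(m_compilation_time, 0.0);
}
```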
@@ -1912,11 +1929,11 @@ bool runtime::intelgpu::IntelGPUExecutable::call(const vector<shared_ptr<runtime
 
 void runtime::intelgpu::IntelGPUBackend::remove_compiled_function(shared_ptr<Executable> exec)
 {
-    for (auto it = ocl_networks.begin(); it != ocl_networks.end(); ++it)
+    for (auto it = cldnn_networks.begin(); it != cldnn_networks.end(); ++it)
     {
         if (it->second == exec)
         {
-            ocl_networks.erase(it);
+            cldnn_networks.erase(it);
             break;
         }
     }
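Because the cache maps Function to Executable, eviction by Executable needs a linear scan over the values. The same logic expressed with std::find_if (an equivalent sketch, not code from the diff; the ngraph types are assumed available):

```cpp
#include <algorithm>
#include <map>
#include <memory>

using NetworkMap = std::map<std::shared_ptr<ngraph::Function>,
                            std::shared_ptr<ngraph::runtime::Executable>>;

// Erase the single entry whose value matches exec; a no-op when the
// executable was never cached (e.g. when the function cache is disabled).
void remove_compiled(NetworkMap& networks,
                     const std::shared_ptr<ngraph::runtime::Executable>& exec)
{
    const auto it = std::find_if(networks.begin(),
                                 networks.end(),
                                 [&exec](const NetworkMap::value_type& entry) {
                                     return entry.second == exec;
                                 });
    if (it != networks.end())
    {
        networks.erase(it);
    }
}
```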
@@ -1949,17 +1966,15 @@ vector<runtime::PerformanceCounter>
     runtime::intelgpu::IntelGPUExecutable::get_performance_data() const
 {
     vector<runtime::PerformanceCounter> rc;
-    const shared_ptr<cldnn::network> network = m_function_instance.ocl_network;
-    shared_ptr<Function> func = m_function_instance.m_function;
 
-    if (network != nullptr && m_function_instance.m_performance_counters_enabled)
+    if (m_cldnn_network != nullptr && m_performance_counters_enabled)
     {
         const map<cldnn::primitive_id, cldnn::event>& primitives =
-            network->get_executed_primitives();
+            m_cldnn_network->get_executed_primitives();
         for (const auto& p : primitives)
         {
             // Generate a primitive name that matches the name in the Function
-            const string primitive_name = convert_cldnn_names(func, p.first);
+            const string primitive_name = convert_cldnn_names(m_function, p.first);
             size_t usec = 0;
             for (const auto& q : p.second.get_profiling_info())
             {
...
@@ -55,8 +55,8 @@ public:
     bool is_supported_property(const Property prop) const override;
 
 private:
-    std::shared_ptr<cldnn::engine> ocl_engine;
-    std::map<std::shared_ptr<Function>, std::shared_ptr<runtime::Executable>> ocl_networks;
+    std::shared_ptr<cldnn::engine> cldnn_engine;
+    std::map<std::shared_ptr<Function>, std::shared_ptr<runtime::Executable>> cldnn_networks;
 
     bool m_profile_enable = false;
     long m_profile_lines_limit_count = 10;
@@ -66,38 +66,32 @@ private:
     bool m_function_cache_disabled = false;
     bool m_disable_backend_optimizations = false;
     std::string m_cldnn_dump_dir = std::string("intelgpu_codegen");
+    std::string delim = std::string(":");
 };
 
 class ngraph::runtime::intelgpu::IntelGPUExecutable : public runtime::Executable
 {
 public:
-    IntelGPUExecutable(std::shared_ptr<Function> func, bool enable_timing);
+    IntelGPUExecutable(std::shared_ptr<Function> func,
+                       std::shared_ptr<cldnn::network> network,
+                       bool enable_timing,
+                       bool enable_profile,
+                       double compilation_time,
+                       double consumed_memory,
+                       size_t profile_lines_limit_count);
 
     bool call(const std::vector<std::shared_ptr<runtime::Tensor>>& outputs,
               const std::vector<std::shared_ptr<runtime::Tensor>>& inputs) override;
 
     std::vector<PerformanceCounter> get_performance_data() const override;
 
 private:
-    class FunctionInstance
-    {
-    public:
-        std::shared_ptr<cldnn::network> ocl_network = nullptr;
-        bool m_performance_counters_enabled = false;
-        double m_compilation_time = 0.0;
-        double m_consumed_memory = 0.0;
-        std::shared_ptr<Function> m_function;
-    } m_function_instance;
-
-    bool m_profile_enable = false;
+    std::shared_ptr<Function> m_function;
+    std::shared_ptr<cldnn::network> m_cldnn_network = nullptr;
+    bool m_performance_counters_enabled = false;
+    bool m_profile_enable = false;
+    double m_compilation_time = 0.0;
+    double m_consumed_memory = 0.0;
     long m_profile_lines_limit_count = 10;
-    bool m_dump_graph_enable = false;
-    bool m_cldnn_graph_optimize = true;
-    bool m_cldnn_dump_enable = false;
-    bool m_function_cache_disabled = false;
-    bool m_disable_backend_optimizations = false;
-    std::shared_ptr<cldnn::engine> ocl_engine;
-    std::string m_cldnn_dump_dir = std::string("intelgpu_codegen");
     std::string delim = std::string(":");
 
     // Statistic related things
...
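Taken together, the refactoring keeps the public flow unchanged: the backend compiles and caches, while the executable owns its network and statistics. A usage sketch against the nGraph runtime API of this era (shapes illustrative, error handling omitted):

```cpp
#include "ngraph/ngraph.hpp"

using namespace ngraph;

// Run one Function on the IntelGPU backend; func is assumed built elsewhere.
void run_once(const std::shared_ptr<Function>& func, const Shape& shape)
{
    auto backend = runtime::Backend::create("INTELGPU");

    // compile() builds the cldnn::network up front and caches the Executable
    // by Function; compiling the same Function again is a cache hit.
    auto exec = backend->compile(func);

    // Tensors bind to parameters and results by position, as call() documents.
    auto input = backend->create_tensor(element::f32, shape);
    auto output = backend->create_tensor(element::f32, shape);
    exec->call({output}, {input});
}
```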