Commit 1132afe5 authored by Robert Kimball

update intel GPU backend

parent 8214cd39
@@ -414,13 +414,31 @@ shared_ptr<runtime::Tensor> runtime::intelgpu::IntelGPUBackend::create_tensor(
         element_type, shape, *ocl_engine, memory_pointer, this);
 }
 
-runtime::Handle runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func)
+shared_ptr<runtime::Executable>
+    runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func, bool enable_timing)
 {
-    FunctionInstance& instance = ocl_networks[func];
-    if (instance.ocl_network != nullptr)
+    shared_ptr<runtime::Executable> rc;
+    auto it = ocl_networks.find(func);
+    if (it != ocl_networks.end())
     {
-        return func;
+        rc = it->second;
     }
+    else
+    {
+        rc = make_shared<IntelGPUExecutable>(func, enable_timing);
+        if (!m_function_cache_disabled)
+        {
+            ocl_networks.insert({func, rc});
+        }
+    }
+    return rc;
+}
+
+runtime::intelgpu::IntelGPUExecutable::IntelGPUExecutable(shared_ptr<Function> func,
+                                                          bool enable_timing)
+{
+    FunctionInstance& instance = m_function_instance;
+    instance.m_function = func;
     set<cldnn::primitive_id> func_output_names;
     cldnn::topology topology;
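
Note: this hunk changes compile() from returning the Function as a Handle into returning a cached runtime::Executable, with the network construction moved into the new IntelGPUExecutable constructor. A minimal sketch of the caller-side pattern this enables (backend creation follows the stock nGraph runtime API; func, outputs and inputs are assumed to exist):

    // Compile once, run many times; a repeated compile() of the same Function
    // returns the cached Executable unless the function cache is disabled.
    auto backend = ngraph::runtime::Backend::create("INTELGPU");
    auto exec = backend->compile(func);
    exec->call(outputs, inputs);
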
@@ -1803,18 +1821,16 @@ runtime::Handle runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function>
         instance.m_compilation_time = timer_compile.get_milliseconds();
         instance.m_consumed_memory = get_max_memory_rss() - mem_before_compile;
     }
-
-    return func;
 }
 
-bool runtime::intelgpu::IntelGPUBackend::call(shared_ptr<Function> func,
-                                              const vector<shared_ptr<runtime::Tensor>>& outputs,
-                                              const vector<shared_ptr<runtime::Tensor>>& inputs)
+bool runtime::intelgpu::IntelGPUExecutable::call(const vector<shared_ptr<runtime::Tensor>>& outputs,
+                                                 const vector<shared_ptr<runtime::Tensor>>& inputs)
 {
     double mem_call_consumed = 0.0f;
     stopwatch timer_call;
-    FunctionInstance& instance = ocl_networks[func];
+    FunctionInstance& instance = m_function_instance;
+    shared_ptr<Function> func = instance.m_function;
 
     if (instance.ocl_network == nullptr)
     {
         throw runtime_error("compile() must be called before call().");
@@ -1835,7 +1851,7 @@ bool runtime::intelgpu::IntelGPUBackend::call(shared_ptr<Function> func,
     {
         shared_ptr<runtime::intelgpu::IntelGPUTensorView> tv =
             static_pointer_cast<runtime::intelgpu::IntelGPUTensorView>(inputs[i]);
-        const ParameterVector& input_params = func->get_parameters();
+        const ParameterVector& input_params = get_parameters();
         const string& tensor_name = input_params[i]->get_output_tensor().get_name();
         network->set_input_data(tensor_name, *tv->get_data_ptr());
     }
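
Note: func->get_parameters() becomes a bare get_parameters() here because the runtime::Executable base class exposes the compiled Function's parameter list directly. Input tensors are still bound positionally; a one-line sketch of the contract the loop above relies on (assumes a compiled exec as in the earlier sketch):

    // inputs[i] feeds the tensor of the i-th parameter, so callers must pass
    // input tensors in Function parameter order.
    assert(inputs.size() == exec->get_parameters().size());
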
@@ -1884,11 +1900,6 @@ bool runtime::intelgpu::IntelGPUBackend::call(shared_ptr<Function> func,
         instance.m_compilation_time = 0.0;
     }
 
-    if (m_function_cache_disabled)
-    {
-        remove_compiled_function(func);
-    }
-
     return true;
 }
@@ -1897,18 +1908,6 @@ void runtime::intelgpu::IntelGPUBackend::remove_compiled_function(shared_ptr<Fun
     ocl_networks.erase(func);
 }
 
-void runtime::intelgpu::IntelGPUBackend::enable_performance_data(shared_ptr<Function> func,
-                                                                 bool enable)
-{
-    FunctionInstance& instance = ocl_networks[func];
-
-    if (instance.ocl_network != nullptr)
-    {
-        throw runtime_error("Performance data collection must be enabled prior to compiling.");
-    }
-
-    instance.m_performance_counters_enabled = enable;
-}
-
 // The cldnn::network contains something like "generic_layer_0_Parameter_254_0" names
 // This function should return "Parameter_254" from the example above
 static string convert_cldnn_names(shared_ptr<Function> func, const string& cldnn_name)
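
Note: the body of convert_cldnn_names is unchanged by this commit and elided in this view. Going by its comment alone, a plausible sketch of the conversion it describes; the real implementation takes func so it can match against the Function's node names, so treat this pure prefix/suffix stripping as an assumption:

    // Illustrative only: "generic_layer_0_Parameter_254_0" -> "Parameter_254"
    static string convert_cldnn_names_sketch(const string& cldnn_name)
    {
        const string prefix = "generic_layer_";
        string name = cldnn_name;
        if (name.compare(0, prefix.size(), prefix) == 0)
        {
            name = name.substr(prefix.size());      // drop "generic_layer_"
            name = name.substr(name.find('_') + 1); // drop the layer index
        }
        return name.substr(0, name.find_last_of('_')); // drop the output index
    }
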
@@ -1933,36 +1932,33 @@ static string convert_cldnn_names(shared_ptr<Function> func, const string& cldnn
 }
 
 vector<runtime::PerformanceCounter>
-    runtime::intelgpu::IntelGPUBackend::get_performance_data(shared_ptr<Function> func) const
+    runtime::intelgpu::IntelGPUExecutable::get_performance_data() const
 {
     vector<runtime::PerformanceCounter> rc;
-    auto it = ocl_networks.find(func);
-    if (it != ocl_networks.end())
-    {
-        const shared_ptr<cldnn::network> network = it->second.ocl_network;
-
-        if (network != nullptr && it->second.m_performance_counters_enabled)
-        {
-            const map<cldnn::primitive_id, cldnn::event>& primitives =
-                network->get_executed_primitives();
-            for (const auto& p : primitives)
-            {
-                // Let's generate the primitive name that matches to the name in Function
-                const string primitive_name = convert_cldnn_names(func, p.first);
-                size_t usec = 0;
-                for (const auto& q : p.second.get_profiling_info())
-                {
-                    if (q.name == string("executing"))
-                    {
-                        usec += chrono::duration_cast<
-                            chrono::duration<size_t, chrono::milliseconds::period>>(
-                            q.value->value())
-                            .count();
-                    }
-                }
-                const runtime::PerformanceCounter perf_counter(primitive_name.c_str(), usec, 1);
-                rc.push_back(perf_counter);
-            }
-        }
-    }
+    const shared_ptr<cldnn::network> network = m_function_instance.ocl_network;
+    shared_ptr<Function> func = m_function_instance.m_function;
+
+    if (network != nullptr && m_function_instance.m_performance_counters_enabled)
+    {
+        const map<cldnn::primitive_id, cldnn::event>& primitives =
+            network->get_executed_primitives();
+        for (const auto& p : primitives)
+        {
+            // Let's generate the primitive name that matches to the name in Function
+            const string primitive_name = convert_cldnn_names(func, p.first);
+            size_t usec = 0;
+            for (const auto& q : p.second.get_profiling_info())
+            {
+                if (q.name == string("executing"))
+                {
+                    usec += chrono::duration_cast<
+                        chrono::duration<size_t, chrono::milliseconds::period>>(
+                        q.value->value())
+                        .count();
+                }
+            }
+            const runtime::PerformanceCounter perf_counter(primitive_name.c_str(), usec, 1);
+            rc.push_back(perf_counter);
+        }
+    }
     return rc;
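
Note: get_performance_data() now reads everything from m_function_instance instead of looking the Function up in a map, and each executed cldnn primitive contributes one PerformanceCounter holding its accumulated "executing" time. A short consumer-side sketch; the PerformanceCounter accessor names are assumptions for illustration:

    // Dump per-primitive timings after a call().
    for (const runtime::PerformanceCounter& pc : exec->get_performance_data())
    {
        cout << pc.name() << ": " << pc.microseconds() << " us" << endl;
    }
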
@@ -1981,7 +1977,7 @@ static Node* get_node_by_name(const shared_ptr<Function> func, const string& nam
     return nullptr;
 }
 
-void runtime::intelgpu::IntelGPUBackend::print_call_performance(
+void runtime::intelgpu::IntelGPUExecutable::print_call_performance(
     const shared_ptr<cldnn::network> network,
     const shared_ptr<Function> func,
     double time_compile,
@@ -31,6 +31,7 @@ namespace ngraph
         namespace intelgpu
         {
             class IntelGPUBackend;
+            class IntelGPUExecutable;
         }
     }
 }
@@ -47,18 +48,35 @@ public:
     std::shared_ptr<ngraph::runtime::Tensor>
         create_tensor(const ngraph::element::Type& element_type, const Shape& shape) override;
 
-    Handle compile(std::shared_ptr<Function> func) override;
+    std::shared_ptr<runtime::Executable> compile(std::shared_ptr<Function> func,
+                                                 bool enable_timing = false) override;
+    void remove_compiled_function(std::shared_ptr<Function> func);
 
-    bool call(std::shared_ptr<Function> func,
-              const std::vector<std::shared_ptr<runtime::Tensor>>& outputs,
-              const std::vector<std::shared_ptr<runtime::Tensor>>& inputs) override;
-    void remove_compiled_function(std::shared_ptr<Function> func) override;
-    void enable_performance_data(std::shared_ptr<Function> func, bool enable) override;
-    std::vector<PerformanceCounter>
-        get_performance_data(std::shared_ptr<Function> func) const override;
     bool is_supported_property(const Property prop) const override;
 
 private:
+    std::shared_ptr<cldnn::engine> ocl_engine;
+    std::map<std::shared_ptr<Function>, std::shared_ptr<runtime::Executable>> ocl_networks;
+
+    bool m_profile_enable = false;
+    long m_profile_lines_limit_count = 10;
+    bool m_dump_graph_enable = false;
+    bool m_cldnn_graph_optimize = true;
+    bool m_cldnn_dump_enable = false;
+    bool m_function_cache_disabled = false;
+    bool m_disable_backend_optimizations = false;
+    std::string m_cldnn_dump_dir = std::string("intelgpu_codegen");
+    std::string delim = std::string(":");
+};
+
+class ngraph::runtime::intelgpu::IntelGPUExecutable : public runtime::Executable
+{
+public:
+    IntelGPUExecutable(std::shared_ptr<Function> func, bool enable_timing);
+
+    bool call(const std::vector<std::shared_ptr<runtime::Tensor>>& outputs,
+              const std::vector<std::shared_ptr<runtime::Tensor>>& inputs) override;
+
+    std::vector<PerformanceCounter> get_performance_data() const override;
+
+private:
     class FunctionInstance
@@ -68,12 +86,19 @@ private:
         bool m_performance_counters_enabled = false;
         double m_compilation_time = 0.0;
         double m_consumed_memory = 0.0;
-    };
-
-    std::map<std::shared_ptr<Function>, FunctionInstance> ocl_networks;
-    std::shared_ptr<cldnn::engine> ocl_engine;
+        std::shared_ptr<Function> m_function;
+    } m_function_instance;
+
+    bool m_profile_enable = false;
+    long m_profile_lines_limit_count = 10;
+    bool m_dump_graph_enable = false;
+    bool m_cldnn_graph_optimize = true;
+    bool m_cldnn_dump_enable = false;
+    bool m_function_cache_disabled = false;
+    bool m_disable_backend_optimizations = false;
+    std::shared_ptr<cldnn::engine> ocl_engine;
+    std::string m_cldnn_dump_dir = std::string("intelgpu_codegen");
+    std::string delim = std::string(":");
 
     // Statistic related things
     void print_call_performance(const std::shared_ptr<cldnn::network> network,
@@ -83,13 +108,4 @@ private:
                                 double mem_compilation_consumed,
                                 double mem_call_consumed,
                                 double mem_current) const;
-
-    bool m_profile_enable = false;
-    long m_profile_lines_limit_count = 10;
-    bool m_dump_graph_enable = false;
-    bool m_cldnn_graph_optimize = true;
-    bool m_cldnn_dump_enable = false;
-    bool m_function_cache_disabled = false;
-    std::string m_cldnn_dump_dir = std::string("intelgpu_codegen");
-    std::string delim = std::string(":");
 };
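
Note: taken together, the two files move the INTELGPU backend from the old Backend-centric API (compile/call/get_performance_data keyed by Function) to the compile-to-Executable API, with compile()'s enable_timing parameter taking over the role of the removed enable_performance_data(). A sketch of the client-side migration (names illustrative):

    // Old API (removed by this commit):
    //     backend->enable_performance_data(func, true);
    //     backend->compile(func);
    //     backend->call(func, outputs, inputs);
    //     auto perf = backend->get_performance_data(func);

    // New API:
    auto exec = backend->compile(func, /*enable_timing=*/true);
    exec->call(outputs, inputs);
    auto perf = exec->get_performance_data();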