Commit 05062078 authored by Robert Kimball

Merge branch 'bob/backend_api3' of https://github.com/NervanaSystems/ngraph into bob/backend_api3

parents 0a11fa24 e873255d
...@@ -144,6 +144,10 @@ bool runtime::Backend::is_supported_property(const Property prop) const ...@@ -144,6 +144,10 @@ bool runtime::Backend::is_supported_property(const Property prop) const
return false; return false;
} }
// Base-class default: intentionally a no-op. Backends that keep no cache of
// compiled executables have nothing to release. Backends that do cache them
// (e.g. the CPU and IntelGPU backends elsewhere in this change) override this
// to drop their cache entry for `exec`.
void runtime::Backend::remove_compiled_function(std::shared_ptr<Executable> exec)
{
}
bool runtime::Backend::call_with_validate( bool runtime::Backend::call_with_validate(
std::shared_ptr<Executable> exec, std::shared_ptr<Executable> exec,
const std::vector<std::shared_ptr<runtime::Tensor>>& outputs, const std::vector<std::shared_ptr<runtime::Tensor>>& outputs,
......
...@@ -101,6 +101,8 @@ public: ...@@ -101,6 +101,8 @@ public:
/// \returns true if the property is supported, false otherwise. /// \returns true if the property is supported, false otherwise.
virtual bool is_supported_property(const Property prop) const; virtual bool is_supported_property(const Property prop) const;
virtual void remove_compiled_function(std::shared_ptr<Executable> exec);
/// The following methods are temporary hacks to reduce the number of changes in this PR /// The following methods are temporary hacks to reduce the number of changes in this PR
/// They will be removed in a follow-on PR /// They will be removed in a follow-on PR
bool call_with_validate(std::shared_ptr<Executable> handle, bool call_with_validate(std::shared_ptr<Executable> handle,
......
...@@ -121,6 +121,18 @@ bool runtime::cpu::CPU_Executable::call(const vector<shared_ptr<runtime::Tensor> ...@@ -121,6 +121,18 @@ bool runtime::cpu::CPU_Executable::call(const vector<shared_ptr<runtime::Tensor>
return rc; return rc;
} }
// Remove the cache entry (if any) whose compiled executable is `exec`.
// Only the first matching entry is erased; if `exec` was never produced by
// this backend's compile cache the call is a no-op.
void runtime::cpu::CPU_Backend::remove_compiled_function(shared_ptr<Executable> exec)
{
    auto entry = m_exec_map.begin();
    while (entry != m_exec_map.end())
    {
        if (entry->second == exec)
        {
            // Erasing invalidates only `entry`, and we return immediately.
            m_exec_map.erase(entry);
            return;
        }
        ++entry;
    }
}
vector<runtime::PerformanceCounter> runtime::cpu::CPU_Executable::get_performance_data() const vector<runtime::PerformanceCounter> runtime::cpu::CPU_Executable::get_performance_data() const
{ {
vector<runtime::PerformanceCounter> rc; vector<runtime::PerformanceCounter> rc;
......
...@@ -49,6 +49,8 @@ namespace ngraph ...@@ -49,6 +49,8 @@ namespace ngraph
compile(std::shared_ptr<Function> func, compile(std::shared_ptr<Function> func,
bool enable_performance_counters = false) override; bool enable_performance_counters = false) override;
void remove_compiled_function(shared_ptr<Executable> exec) override;
bool is_supported(const Node& node) const override; bool is_supported(const Node& node) const override;
bool is_supported_property(const Property prop) const override; bool is_supported_property(const Property prop) const override;
......
...@@ -414,13 +414,31 @@ shared_ptr<runtime::Tensor> runtime::intelgpu::IntelGPUBackend::create_tensor( ...@@ -414,13 +414,31 @@ shared_ptr<runtime::Tensor> runtime::intelgpu::IntelGPUBackend::create_tensor(
element_type, shape, *ocl_engine, memory_pointer, this); element_type, shape, *ocl_engine, memory_pointer, this);
} }
runtime::Handle runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func) shared_ptr<runtime::Executable>
runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> func, bool enable_timing)
{ {
FunctionInstance& instance = ocl_networks[func]; shared_ptr<runtime::Executable> rc;
if (instance.ocl_network != nullptr) auto it = ocl_networks.find(func);
if (it != ocl_networks.end())
{ {
return func; rc = it->second;
}
else
{
rc = make_shared<IntelGPUExecutable>(func, enable_timing);
if (!m_function_cache_disabled)
{
ocl_networks.insert({func, rc});
}
} }
return rc;
}
runtime::intelgpu::IntelGPUExecutable::IntelGPUExecutable(shared_ptr<Function> func,
bool enable_timing)
{
FunctionInstance& instance = m_function_instance;
instance.m_function = func;
set<cldnn::primitive_id> func_output_names; set<cldnn::primitive_id> func_output_names;
cldnn::topology topology; cldnn::topology topology;
...@@ -1803,18 +1821,16 @@ runtime::Handle runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function> ...@@ -1803,18 +1821,16 @@ runtime::Handle runtime::intelgpu::IntelGPUBackend::compile(shared_ptr<Function>
instance.m_compilation_time = timer_compile.get_milliseconds(); instance.m_compilation_time = timer_compile.get_milliseconds();
instance.m_consumed_memory = get_max_memory_rss() - mem_before_compile; instance.m_consumed_memory = get_max_memory_rss() - mem_before_compile;
} }
return func;
} }
bool runtime::intelgpu::IntelGPUBackend::call(shared_ptr<Function> func, bool runtime::intelgpu::IntelGPUExecutable::call(const vector<shared_ptr<runtime::Tensor>>& outputs,
const vector<shared_ptr<runtime::Tensor>>& outputs,
const vector<shared_ptr<runtime::Tensor>>& inputs) const vector<shared_ptr<runtime::Tensor>>& inputs)
{ {
double mem_call_consumed = 0.0f; double mem_call_consumed = 0.0f;
stopwatch timer_call; stopwatch timer_call;
FunctionInstance& instance = ocl_networks[func]; FunctionInstance& instance = m_function_instance;
shared_ptr<Function> func = instance.m_function;
if (instance.ocl_network == nullptr) if (instance.ocl_network == nullptr)
{ {
throw runtime_error("compile() must be called before call()."); throw runtime_error("compile() must be called before call().");
...@@ -1835,7 +1851,7 @@ bool runtime::intelgpu::IntelGPUBackend::call(shared_ptr<Function> func, ...@@ -1835,7 +1851,7 @@ bool runtime::intelgpu::IntelGPUBackend::call(shared_ptr<Function> func,
{ {
shared_ptr<runtime::intelgpu::IntelGPUTensorView> tv = shared_ptr<runtime::intelgpu::IntelGPUTensorView> tv =
static_pointer_cast<runtime::intelgpu::IntelGPUTensorView>(inputs[i]); static_pointer_cast<runtime::intelgpu::IntelGPUTensorView>(inputs[i]);
const ParameterVector& input_params = func->get_parameters(); const ParameterVector& input_params = get_parameters();
const string& tensor_name = input_params[i]->get_output_tensor().get_name(); const string& tensor_name = input_params[i]->get_output_tensor().get_name();
network->set_input_data(tensor_name, *tv->get_data_ptr()); network->set_input_data(tensor_name, *tv->get_data_ptr());
} }
...@@ -1884,29 +1900,19 @@ bool runtime::intelgpu::IntelGPUBackend::call(shared_ptr<Function> func, ...@@ -1884,29 +1900,19 @@ bool runtime::intelgpu::IntelGPUBackend::call(shared_ptr<Function> func,
instance.m_compilation_time = 0.0; instance.m_compilation_time = 0.0;
} }
if (m_function_cache_disabled)
{
remove_compiled_function(func);
}
return true; return true;
} }
void runtime::intelgpu::IntelGPUBackend::remove_compiled_function(shared_ptr<Function> func) void runtime::intelgpu::IntelGPUBackend::remove_compiled_function(shared_ptr<Executable> exec)
{ {
ocl_networks.erase(func); for (auto it = ocl_networks.begin(); it != ocl_networks.end(); ++it)
} {
if (it->second == exec)
void runtime::intelgpu::IntelGPUBackend::enable_performance_data(shared_ptr<Function> func,
bool enable)
{
FunctionInstance& instance = ocl_networks[func];
if (instance.ocl_network != nullptr)
{ {
throw runtime_error("Performance data collection must be enabled prior to compiling."); ocl_networks.erase(it);
break;
}
} }
instance.m_performance_counters_enabled = enable;
} }
// The cldnn::network contains something like "generic_layer_0_Parameter_254_0" names // The cldnn::network contains something like "generic_layer_0_Parameter_254_0" names
...@@ -1933,15 +1939,13 @@ static string convert_cldnn_names(shared_ptr<Function> func, const string& cldnn ...@@ -1933,15 +1939,13 @@ static string convert_cldnn_names(shared_ptr<Function> func, const string& cldnn
} }
vector<runtime::PerformanceCounter> vector<runtime::PerformanceCounter>
runtime::intelgpu::IntelGPUBackend::get_performance_data(shared_ptr<Function> func) const runtime::intelgpu::IntelGPUExecutable::get_performance_data() const
{ {
vector<runtime::PerformanceCounter> rc; vector<runtime::PerformanceCounter> rc;
auto it = ocl_networks.find(func); const shared_ptr<cldnn::network> network = m_function_instance.ocl_network;
if (it != ocl_networks.end()) shared_ptr<Function> func = m_function_instance.m_function;
{
const shared_ptr<cldnn::network> network = it->second.ocl_network;
if (network != nullptr && it->second.m_performance_counters_enabled) if (network != nullptr && m_function_instance.m_performance_counters_enabled)
{ {
const map<cldnn::primitive_id, cldnn::event>& primitives = const map<cldnn::primitive_id, cldnn::event>& primitives =
network->get_executed_primitives(); network->get_executed_primitives();
...@@ -1964,7 +1968,6 @@ vector<runtime::PerformanceCounter> ...@@ -1964,7 +1968,6 @@ vector<runtime::PerformanceCounter>
rc.push_back(perf_counter); rc.push_back(perf_counter);
} }
} }
}
return rc; return rc;
} }
...@@ -1981,7 +1984,7 @@ static Node* get_node_by_name(const shared_ptr<Function> func, const string& nam ...@@ -1981,7 +1984,7 @@ static Node* get_node_by_name(const shared_ptr<Function> func, const string& nam
return nullptr; return nullptr;
} }
void runtime::intelgpu::IntelGPUBackend::print_call_performance( void runtime::intelgpu::IntelGPUExecutable::print_call_performance(
const shared_ptr<cldnn::network> network, const shared_ptr<cldnn::network> network,
const shared_ptr<Function> func, const shared_ptr<Function> func,
double time_compile, double time_compile,
......
...@@ -31,6 +31,7 @@ namespace ngraph ...@@ -31,6 +31,7 @@ namespace ngraph
namespace intelgpu namespace intelgpu
{ {
class IntelGPUBackend; class IntelGPUBackend;
class IntelGPUExecutable;
} }
} }
} }
...@@ -47,18 +48,35 @@ public: ...@@ -47,18 +48,35 @@ public:
std::shared_ptr<ngraph::runtime::Tensor> std::shared_ptr<ngraph::runtime::Tensor>
create_tensor(const ngraph::element::Type& element_type, const Shape& shape) override; create_tensor(const ngraph::element::Type& element_type, const Shape& shape) override;
Handle compile(std::shared_ptr<Function> func) override; std::shared_ptr<runtime::Executable> compile(std::shared_ptr<Function> func,
bool enable_timing = false) override;
void remove_compiled_function(std::shared_ptr<runtime::Executable> exec) override;
bool call(std::shared_ptr<Function> func, bool is_supported_property(const Property prop) const override;
const std::vector<std::shared_ptr<runtime::Tensor>>& outputs,
const std::vector<std::shared_ptr<runtime::Tensor>>& inputs) override;
void remove_compiled_function(std::shared_ptr<Function> func) override; private:
void enable_performance_data(std::shared_ptr<Function> func, bool enable) override; std::shared_ptr<cldnn::engine> ocl_engine;
std::vector<PerformanceCounter> std::map<std::shared_ptr<Function>, std::shared_ptr<runtime::Executable>> ocl_networks;
get_performance_data(std::shared_ptr<Function> func) const override;
bool is_supported_property(const Property prop) const override; bool m_profile_enable = false;
long m_profile_lines_limit_count = 10;
bool m_dump_graph_enable = false;
bool m_cldnn_graph_optimize = true;
bool m_cldnn_dump_enable = false;
bool m_function_cache_disabled = false;
bool m_disable_backend_optimizations = false;
std::string m_cldnn_dump_dir = std::string("intelgpu_codegen");
std::string delim = std::string(":");
};
class ngraph::runtime::intelgpu::IntelGPUExecutable : public runtime::Executable
{
public:
IntelGPUExecutable(std::shared_ptr<Function> func, bool enable_timing);
bool call(const std::vector<std::shared_ptr<runtime::Tensor>>& outputs,
const std::vector<std::shared_ptr<runtime::Tensor>>& inputs) override;
std::vector<PerformanceCounter> get_performance_data() const override;
private: private:
class FunctionInstance class FunctionInstance
...@@ -68,12 +86,19 @@ private: ...@@ -68,12 +86,19 @@ private:
bool m_performance_counters_enabled = false; bool m_performance_counters_enabled = false;
double m_compilation_time = 0.0; double m_compilation_time = 0.0;
double m_consumed_memory = 0.0; double m_consumed_memory = 0.0;
}; std::shared_ptr<Function> m_function;
} m_function_instance;
std::map<std::shared_ptr<Function>, FunctionInstance> ocl_networks;
std::shared_ptr<cldnn::engine> ocl_engine;
bool m_profile_enable = false;
long m_profile_lines_limit_count = 10;
bool m_dump_graph_enable = false;
bool m_cldnn_graph_optimize = true;
bool m_cldnn_dump_enable = false;
bool m_function_cache_disabled = false;
bool m_disable_backend_optimizations = false; bool m_disable_backend_optimizations = false;
std::shared_ptr<cldnn::engine> ocl_engine;
std::string m_cldnn_dump_dir = std::string("intelgpu_codegen");
std::string delim = std::string(":");
// Statistic related things // Statistic related things
void print_call_performance(const std::shared_ptr<cldnn::network> network, void print_call_performance(const std::shared_ptr<cldnn::network> network,
...@@ -83,13 +108,4 @@ private: ...@@ -83,13 +108,4 @@ private:
double mem_compilation_consumed, double mem_compilation_consumed,
double mem_call_consumed, double mem_call_consumed,
double mem_current) const; double mem_current) const;
bool m_profile_enable = false;
long m_profile_lines_limit_count = 10;
bool m_dump_graph_enable = false;
bool m_cldnn_graph_optimize = true;
bool m_cldnn_dump_enable = false;
bool m_function_cache_disabled = false;
std::string m_cldnn_dump_dir = std::string("intelgpu_codegen");
std::string delim = std::string(":");
}; };
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment