Commit 2c7cacd2 authored by Jayaram Bobba's avatar Jayaram Bobba Committed by Scott Cyphers

Add support for perf counter collection in DEX (#1831)

* Add support for perf counter collection in DEX

* Addressed PR feedback
parent a20c710b
...@@ -78,9 +78,7 @@ bool runtime::cpu::CPU_Backend::compile(shared_ptr<Function> func) ...@@ -78,9 +78,7 @@ bool runtime::cpu::CPU_Backend::compile(shared_ptr<Function> func)
if (instance.m_external_function == nullptr) if (instance.m_external_function == nullptr)
{ {
instance.m_external_function = make_shared<CPU_ExternalFunction>(func); instance.m_external_function = make_shared<CPU_ExternalFunction>(func);
#if !defined(NGRAPH_DEX_ONLY)
instance.m_external_function->m_emit_timing = instance.m_performance_counters_enabled; instance.m_external_function->m_emit_timing = instance.m_performance_counters_enabled;
#endif
auto cf = instance.m_external_function->make_call_frame(); auto cf = instance.m_external_function->make_call_frame();
instance.m_call_frame = dynamic_pointer_cast<CPU_CallFrame>(cf); instance.m_call_frame = dynamic_pointer_cast<CPU_CallFrame>(cf);
} }
...@@ -109,8 +107,6 @@ void runtime::cpu::CPU_Backend::remove_compiled_function(shared_ptr<Function> fu ...@@ -109,8 +107,6 @@ void runtime::cpu::CPU_Backend::remove_compiled_function(shared_ptr<Function> fu
m_function_map.erase(func); m_function_map.erase(func);
} }
#if !defined(NGRAPH_DEX_ONLY)
void runtime::cpu::CPU_Backend::enable_performance_data(shared_ptr<Function> func, bool enable) void runtime::cpu::CPU_Backend::enable_performance_data(shared_ptr<Function> func, bool enable)
{ {
FunctionInstance& instance = m_function_map[func]; FunctionInstance& instance = m_function_map[func];
...@@ -131,28 +127,10 @@ vector<runtime::PerformanceCounter> ...@@ -131,28 +127,10 @@ vector<runtime::PerformanceCounter>
const FunctionInstance& instance = it->second; const FunctionInstance& instance = it->second;
if (instance.m_external_function != nullptr) if (instance.m_external_function != nullptr)
{ {
auto* engine = instance.m_external_function->m_execution_engine.get(); rc.insert(rc.end(),
if (engine) instance.m_external_function->get_perf_counters().begin(),
{ instance.m_external_function->get_perf_counters().end());
auto get_count = engine->find_function<size_t()>("get_debug_timer_count");
auto get_name = engine->find_function<const char*(size_t)>("get_debug_timer_name");
auto get_microseconds =
engine->find_function<size_t(size_t)>("get_debug_timer_microseconds");
auto get_call_count =
engine->find_function<size_t(size_t)>("get_debug_timer_call_count");
if (get_count && get_name && get_microseconds && get_call_count)
{
size_t count = get_count();
for (size_t i = 0; i < count; i++)
{
rc.push_back({get_name(i), get_microseconds(i), get_call_count(i)});
}
}
}
} }
} }
return rc; return rc;
} }
#endif
...@@ -53,11 +53,9 @@ namespace ngraph ...@@ -53,11 +53,9 @@ namespace ngraph
void remove_compiled_function(std::shared_ptr<Function> func) override; void remove_compiled_function(std::shared_ptr<Function> func) override;
#if !defined(NGRAPH_DEX_ONLY)
void enable_performance_data(std::shared_ptr<Function> func, bool enable) override; void enable_performance_data(std::shared_ptr<Function> func, bool enable) override;
std::vector<PerformanceCounter> std::vector<PerformanceCounter>
get_performance_data(std::shared_ptr<Function> func) const override; get_performance_data(std::shared_ptr<Function> func) const override;
#endif
private: private:
class FunctionInstance class FunctionInstance
......
...@@ -184,13 +184,11 @@ runtime::cpu::CPU_ExternalFunction::CPU_ExternalFunction( ...@@ -184,13 +184,11 @@ runtime::cpu::CPU_ExternalFunction::CPU_ExternalFunction(
, m_release_function(release_function) , m_release_function(release_function)
, m_use_tbb(std::getenv("NGRAPH_CPU_USE_TBB") != nullptr) , m_use_tbb(std::getenv("NGRAPH_CPU_USE_TBB") != nullptr)
, m_compiled_function(nullptr) , m_compiled_function(nullptr)
#if !defined(NGRAPH_DEX_ONLY)
, m_is_compiled(false)
, m_emit_timing(false) , m_emit_timing(false)
#endif
, m_function_name(function->get_name()) , m_function_name(function->get_name())
, m_is_built(false) , m_is_built(false)
#if !defined(NGRAPH_DEX_ONLY) #if !defined(NGRAPH_DEX_ONLY)
, m_is_compiled(false)
, m_direct_execution(!std::getenv("NGRAPH_CODEGEN")) , m_direct_execution(!std::getenv("NGRAPH_CODEGEN"))
#else #else
, m_direct_execution(true) , m_direct_execution(true)
...@@ -415,6 +413,12 @@ void runtime::cpu::CPU_ExternalFunction::compile() ...@@ -415,6 +413,12 @@ void runtime::cpu::CPU_ExternalFunction::compile()
writer << "// Generated by the nGraph CPU backend\n"; writer << "// Generated by the nGraph CPU backend\n";
if (m_use_tbb) if (m_use_tbb)
{ {
if (runtime::cpu::IsTracingEnabled() || m_emit_timing)
{
throw ngraph_error(
"CPU Backend: Tracing and performance breakdowns might not be accurate with TBB "
"enabled due to concurrent graph execution");
}
writer << "#undef __TBB_PREVIEW_LIGHTWEIGHT_POLICY \n"; writer << "#undef __TBB_PREVIEW_LIGHTWEIGHT_POLICY \n";
writer << "#define __TBB_PREVIEW_LIGHTWEIGHT_POLICY 1\n"; writer << "#define __TBB_PREVIEW_LIGHTWEIGHT_POLICY 1\n";
writer << "#include <tbb/flow_graph.h>"; writer << "#include <tbb/flow_graph.h>";
...@@ -1150,6 +1154,14 @@ void runtime::cpu::CPU_ExternalFunction::build() ...@@ -1150,6 +1154,14 @@ void runtime::cpu::CPU_ExternalFunction::build()
{ {
return; return;
} }
if (m_use_tbb && (runtime::cpu::IsTracingEnabled() || m_emit_timing))
{
throw ngraph_error(
"CPU Backend: Tracing and performance breakdowns might not be accurate with TBB "
"enabled due to concurrent graph execution");
}
// stream writer to dump the debug manifest for the DEX // stream writer to dump the debug manifest for the DEX
static const string s_debug_dir = "cpu_codegen"; static const string s_debug_dir = "cpu_codegen";
static StaticInitializers s_static_initializers(s_debug_dir); static StaticInitializers s_static_initializers(s_debug_dir);
...@@ -1353,6 +1365,8 @@ void runtime::cpu::CPU_ExternalFunction::build() ...@@ -1353,6 +1365,8 @@ void runtime::cpu::CPU_ExternalFunction::build()
enables.emplace_back(enable); enables.emplace_back(enable);
enable_nodename_list.emplace_back(make_pair(enable, node->get_name())); enable_nodename_list.emplace_back(make_pair(enable, node->get_name()));
m_perf_counters.emplace_back(node->get_name().c_str(), 0, 0);
} }
if ((std::getenv("NGRAPH_DEX_DEBUG") != nullptr)) if ((std::getenv("NGRAPH_DEX_DEBUG") != nullptr))
...@@ -1419,7 +1433,7 @@ void runtime::cpu::CPU_ExternalFunction::build() ...@@ -1419,7 +1433,7 @@ void runtime::cpu::CPU_ExternalFunction::build()
assert(m_op_attrs.size() == functors.size()); assert(m_op_attrs.size() == functors.size());
executor = [&](CPURuntimeContext* ctx, vector<void*>& inputs, vector<void*>& outputs) { executor = [&](CPURuntimeContext* ctx, vector<void*>& inputs, vector<void*>& outputs) {
cpu::Timestamp start_ts; cpu::Timestamp start_ts, end_ts;
int profiler_count = 0; int profiler_count = 0;
if (ctx->first_iteration) if (ctx->first_iteration)
...@@ -1458,30 +1472,48 @@ void runtime::cpu::CPU_ExternalFunction::build() ...@@ -1458,30 +1472,48 @@ void runtime::cpu::CPU_ExternalFunction::build()
auto it = enable_nodename_list.begin(); auto it = enable_nodename_list.begin();
for (const auto& p : enables) for (const auto& p : enables)
{ {
auto index = profiler_count++;
tbb::flow::continue_node<tbb::flow::continue_msg, tbb::flow::lightweight>* tbb::flow::continue_node<tbb::flow::continue_msg, tbb::flow::lightweight>*
flowgraph_node = new tbb::flow::continue_node<tbb::flow::continue_msg, flowgraph_node = new tbb::flow::continue_node<tbb::flow::continue_msg,
tbb::flow::lightweight>( tbb::flow::lightweight>(
*(ctx->G), [&](const tbb::flow::continue_msg& msg) { *(ctx->G), [&, index](const tbb::flow::continue_msg& msg) {
if (p(ctx) || ctx->first_iteration) if (p(ctx) || ctx->first_iteration)
{ {
if (runtime::cpu::IsTracingEnabled()) if (runtime::cpu::IsTracingEnabled() || m_emit_timing)
{ {
start_ts = cpu::Clock::now(); start_ts = cpu::Clock::now();
} }
(*functor)(ctx); (*functor)(ctx);
if (runtime::cpu::IsTracingEnabled()) if (runtime::cpu::IsTracingEnabled() || m_emit_timing)
{ {
ctx->op_durations[profiler_count++] = end_ts = cpu::Clock::now();
(std::chrono::duration_cast<cpu::Timescale>(
cpu::Clock::now() - start_ts)) if (runtime::cpu::IsTracingEnabled())
.count(); {
ctx->op_durations[index] =
(std::chrono::duration_cast<cpu::Timescale>(
end_ts - start_ts))
.count();
}
if (m_emit_timing)
{
m_perf_counters[index].m_total_microseconds +=
std::chrono::duration_cast<
std::chrono::microseconds>(end_ts - start_ts)
.count();
m_perf_counters[index].m_call_count++;
}
} }
} }
else else
{ {
if (runtime::cpu::IsTracingEnabled()) if (runtime::cpu::IsTracingEnabled())
{ {
ctx->op_durations[profiler_count++] = 0; ctx->op_durations[index] = 0;
}
if (m_emit_timing)
{
m_perf_counters[index].m_call_count++;
} }
} }
std::advance(functor, 1); std::advance(functor, 1);
...@@ -1535,28 +1567,45 @@ void runtime::cpu::CPU_ExternalFunction::build() ...@@ -1535,28 +1567,45 @@ void runtime::cpu::CPU_ExternalFunction::build()
{ {
for (const auto& p : enables) for (const auto& p : enables)
{ {
auto index = profiler_count++;
if (p(ctx) || ctx->first_iteration) if (p(ctx) || ctx->first_iteration)
{ {
// Each Op will have exactly one functor, start the clock before the exceution of functor // Each Op will have exactly one functor, start the clock before the exceution of functor
// and collect the profiler_count once the execution complets // and collect the profiler_count once the execution complets
if (runtime::cpu::IsTracingEnabled()) if (runtime::cpu::IsTracingEnabled() || m_emit_timing)
{ {
start_ts = cpu::Clock::now(); start_ts = cpu::Clock::now();
} }
(*functor)(ctx); (*functor)(ctx);
if (runtime::cpu::IsTracingEnabled()) if (runtime::cpu::IsTracingEnabled() || m_emit_timing)
{ {
ctx->op_durations[profiler_count++] = end_ts = cpu::Clock::now();
(std::chrono::duration_cast<cpu::Timescale>(cpu::Clock::now() -
start_ts)) if (runtime::cpu::IsTracingEnabled())
.count(); {
ctx->op_durations[index] =
(std::chrono::duration_cast<cpu::Timescale>(end_ts - start_ts))
.count();
}
if (m_emit_timing)
{
m_perf_counters[index].m_total_microseconds +=
std::chrono::duration_cast<std::chrono::microseconds>(end_ts -
start_ts)
.count();
m_perf_counters[index].m_call_count++;
}
} }
} }
else else
{ {
if (runtime::cpu::IsTracingEnabled()) if (runtime::cpu::IsTracingEnabled())
{ {
ctx->op_durations[profiler_count++] = 0; ctx->op_durations[index] = 0;
}
if (m_emit_timing)
{
m_perf_counters[index].m_call_count++;
} }
} }
std::advance(functor, 1); std::advance(functor, 1);
...@@ -1622,6 +1671,49 @@ const runtime::cpu::LayoutDescriptorPtrs& ...@@ -1622,6 +1671,49 @@ const runtime::cpu::LayoutDescriptorPtrs&
return result_layout_descriptors; return result_layout_descriptors;
} }
const vector<runtime::PerformanceCounter>& runtime::cpu::CPU_ExternalFunction::get_perf_counters()
{
if (m_direct_execution)
{
return m_perf_counters;
}
#if !defined(NGRAPH_DEX_ONLY)
// Codegen. Retrieve perf counters from compiled module
if (m_execution_engine)
{
auto get_count = m_execution_engine->find_function<size_t()>("get_debug_timer_count");
auto get_name =
m_execution_engine->find_function<const char*(size_t)>("get_debug_timer_name");
auto get_microseconds =
m_execution_engine->find_function<size_t(size_t)>("get_debug_timer_microseconds");
auto get_call_count =
m_execution_engine->find_function<size_t(size_t)>("get_debug_timer_call_count");
if (get_count && get_name && get_microseconds && get_call_count)
{
size_t count = get_count();
if (m_perf_counters.size() == 0)
{
for (size_t i = 0; i < count; i++)
{
m_perf_counters.push_back(
{get_name(i), get_microseconds(i), get_call_count(i)});
}
}
else
{
for (size_t i = 0; i < count; i++)
{
m_perf_counters[i].m_total_microseconds = get_microseconds(i);
m_perf_counters[i].m_call_count = get_call_count(i);
}
}
}
}
return m_perf_counters;
#endif
}
void runtime::cpu::CPU_ExternalFunction::write_to_file(const std::string& code, void runtime::cpu::CPU_ExternalFunction::write_to_file(const std::string& code,
const std::string& directory, const std::string& directory,
const std::string& filename) const std::string& filename)
......
...@@ -45,6 +45,7 @@ ...@@ -45,6 +45,7 @@
#include "ngraph/runtime/cpu/cpu_layout_descriptor.hpp" #include "ngraph/runtime/cpu/cpu_layout_descriptor.hpp"
#include "ngraph/runtime/cpu/cpu_tensor_view_wrapper.hpp" #include "ngraph/runtime/cpu/cpu_tensor_view_wrapper.hpp"
#include "ngraph/runtime/cpu/mkldnn_emitter.hpp" #include "ngraph/runtime/cpu/mkldnn_emitter.hpp"
#include "ngraph/runtime/performance_counter.hpp"
namespace ngraph namespace ngraph
{ {
...@@ -138,6 +139,8 @@ namespace ngraph ...@@ -138,6 +139,8 @@ namespace ngraph
const std::string& directory, const std::string& directory,
const std::string& filename); const std::string& filename);
const std::vector<PerformanceCounter>& get_perf_counters();
#if defined(NGRAPH_HALIDE) #if defined(NGRAPH_HALIDE)
std::unordered_map<std::string, Halide::Func>& get_halide_functions() std::unordered_map<std::string, Halide::Func>& get_halide_functions()
{ {
...@@ -182,7 +185,7 @@ namespace ngraph ...@@ -182,7 +185,7 @@ namespace ngraph
std::string output_name, std::string output_name,
bool dex); bool dex);
bool computes_result(Node* node); bool computes_result(Node* node);
void release_function() { m_function = nullptr; }
#if !defined(NGRAPH_DEX_ONLY) #if !defined(NGRAPH_DEX_ONLY)
void emit_debug_function_entry(codegen::CodeWriter& writer, void emit_debug_function_entry(codegen::CodeWriter& writer,
Node* node, Node* node,
...@@ -204,22 +207,9 @@ namespace ngraph ...@@ -204,22 +207,9 @@ namespace ngraph
std::string emit_op_as_function(const Node&, const std::string& function_name); std::string emit_op_as_function(const Node&, const std::string& function_name);
std::string strip_comments(const std::string&); std::string strip_comments(const std::string&);
#endif
void release_function() { m_function = nullptr; }
std::shared_ptr<ngraph::Function> m_function;
bool m_release_function;
bool m_use_tbb;
EntryPoint m_compiled_function;
std::unordered_map<std::string, std::string> m_variable_name_map;
#if !defined(NGRAPH_DEX_ONLY)
bool m_is_compiled; bool m_is_compiled;
std::unique_ptr<codegen::Compiler> m_compiler; std::unique_ptr<codegen::Compiler> m_compiler;
std::unique_ptr<codegen::ExecutionEngine> m_execution_engine; std::unique_ptr<codegen::ExecutionEngine> m_execution_engine;
bool m_emit_timing;
std::map<std::string, size_t> m_name_index_map; std::map<std::string, size_t> m_name_index_map;
...@@ -227,8 +217,17 @@ namespace ngraph ...@@ -227,8 +217,17 @@ namespace ngraph
// Constant ops we need to keep a list of shared_ptr to each Constant // Constant ops we need to keep a list of shared_ptr to each Constant
// so they don't get freed before we are done with them // so they don't get freed before we are done with them
std::vector<std::shared_ptr<Node>> m_active_constants; std::vector<std::shared_ptr<Node>> m_active_constants;
#endif #endif
std::shared_ptr<ngraph::Function> m_function;
bool m_release_function;
bool m_emit_timing;
bool m_use_tbb;
EntryPoint m_compiled_function;
std::unordered_map<std::string, std::string> m_variable_name_map;
std::unordered_map<std::string, CPUTensorRole> m_tensor_roles; std::unordered_map<std::string, CPUTensorRole> m_tensor_roles;
LayoutDescriptorPtrs parameter_layout_descriptors; LayoutDescriptorPtrs parameter_layout_descriptors;
...@@ -257,6 +256,7 @@ namespace ngraph ...@@ -257,6 +256,7 @@ namespace ngraph
std::unordered_map<std::string, std::shared_ptr<CPU_ExternalFunction>> callees; std::unordered_map<std::string, std::shared_ptr<CPU_ExternalFunction>> callees;
bool m_is_built; bool m_is_built;
bool m_direct_execution; bool m_direct_execution;
std::vector<runtime::PerformanceCounter> m_perf_counters;
#if defined(NGRAPH_HALIDE) #if defined(NGRAPH_HALIDE)
std::unordered_map<std::string, Halide::Func> halide_functions; std::unordered_map<std::string, Halide::Func> halide_functions;
......
...@@ -34,9 +34,11 @@ namespace ngraph ...@@ -34,9 +34,11 @@ namespace ngraph
} }
const std::string& name() const { return m_name; } const std::string& name() const { return m_name; }
size_t total_microseconds() const { return m_total_microseconds; } size_t total_microseconds() const { return m_total_microseconds; }
size_t microseconds() const { return m_total_microseconds / m_call_count; } size_t microseconds() const
{
return m_call_count == 0 ? 0 : m_total_microseconds / m_call_count;
}
size_t call_count() const { return m_call_count; } size_t call_count() const { return m_call_count; }
private:
std::string m_name; std::string m_name;
size_t m_total_microseconds; size_t m_total_microseconds;
size_t m_call_count; size_t m_call_count;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment