Commit 2c7cacd2 authored by Jayaram Bobba's avatar Jayaram Bobba Committed by Scott Cyphers

Add support for perf counter collection in DEX (#1831)

* Add support for perf counter collection in DEX

* Addressed PR feedback
parent a20c710b
......@@ -78,9 +78,7 @@ bool runtime::cpu::CPU_Backend::compile(shared_ptr<Function> func)
if (instance.m_external_function == nullptr)
{
instance.m_external_function = make_shared<CPU_ExternalFunction>(func);
#if !defined(NGRAPH_DEX_ONLY)
instance.m_external_function->m_emit_timing = instance.m_performance_counters_enabled;
#endif
auto cf = instance.m_external_function->make_call_frame();
instance.m_call_frame = dynamic_pointer_cast<CPU_CallFrame>(cf);
}
......@@ -109,8 +107,6 @@ void runtime::cpu::CPU_Backend::remove_compiled_function(shared_ptr<Function> fu
m_function_map.erase(func);
}
#if !defined(NGRAPH_DEX_ONLY)
void runtime::cpu::CPU_Backend::enable_performance_data(shared_ptr<Function> func, bool enable)
{
FunctionInstance& instance = m_function_map[func];
......@@ -131,28 +127,10 @@ vector<runtime::PerformanceCounter>
const FunctionInstance& instance = it->second;
if (instance.m_external_function != nullptr)
{
auto* engine = instance.m_external_function->m_execution_engine.get();
if (engine)
{
auto get_count = engine->find_function<size_t()>("get_debug_timer_count");
auto get_name = engine->find_function<const char*(size_t)>("get_debug_timer_name");
auto get_microseconds =
engine->find_function<size_t(size_t)>("get_debug_timer_microseconds");
auto get_call_count =
engine->find_function<size_t(size_t)>("get_debug_timer_call_count");
if (get_count && get_name && get_microseconds && get_call_count)
{
size_t count = get_count();
for (size_t i = 0; i < count; i++)
{
rc.push_back({get_name(i), get_microseconds(i), get_call_count(i)});
}
}
}
rc.insert(rc.end(),
instance.m_external_function->get_perf_counters().begin(),
instance.m_external_function->get_perf_counters().end());
}
}
return rc;
}
#endif
......@@ -53,11 +53,9 @@ namespace ngraph
void remove_compiled_function(std::shared_ptr<Function> func) override;
#if !defined(NGRAPH_DEX_ONLY)
void enable_performance_data(std::shared_ptr<Function> func, bool enable) override;
std::vector<PerformanceCounter>
get_performance_data(std::shared_ptr<Function> func) const override;
#endif
private:
class FunctionInstance
......
......@@ -184,13 +184,11 @@ runtime::cpu::CPU_ExternalFunction::CPU_ExternalFunction(
, m_release_function(release_function)
, m_use_tbb(std::getenv("NGRAPH_CPU_USE_TBB") != nullptr)
, m_compiled_function(nullptr)
#if !defined(NGRAPH_DEX_ONLY)
, m_is_compiled(false)
, m_emit_timing(false)
#endif
, m_function_name(function->get_name())
, m_is_built(false)
#if !defined(NGRAPH_DEX_ONLY)
, m_is_compiled(false)
, m_direct_execution(!std::getenv("NGRAPH_CODEGEN"))
#else
, m_direct_execution(true)
......@@ -415,6 +413,12 @@ void runtime::cpu::CPU_ExternalFunction::compile()
writer << "// Generated by the nGraph CPU backend\n";
if (m_use_tbb)
{
if (runtime::cpu::IsTracingEnabled() || m_emit_timing)
{
throw ngraph_error(
"CPU Backend: Tracing and performance breakdowns might not be accurate with TBB "
"enabled due to concurrent graph execution");
}
writer << "#undef __TBB_PREVIEW_LIGHTWEIGHT_POLICY \n";
writer << "#define __TBB_PREVIEW_LIGHTWEIGHT_POLICY 1\n";
writer << "#include <tbb/flow_graph.h>";
......@@ -1150,6 +1154,14 @@ void runtime::cpu::CPU_ExternalFunction::build()
{
return;
}
if (m_use_tbb && (runtime::cpu::IsTracingEnabled() || m_emit_timing))
{
throw ngraph_error(
"CPU Backend: Tracing and performance breakdowns might not be accurate with TBB "
"enabled due to concurrent graph execution");
}
// stream writer to dump the debug manifest for the DEX
static const string s_debug_dir = "cpu_codegen";
static StaticInitializers s_static_initializers(s_debug_dir);
......@@ -1353,6 +1365,8 @@ void runtime::cpu::CPU_ExternalFunction::build()
enables.emplace_back(enable);
enable_nodename_list.emplace_back(make_pair(enable, node->get_name()));
m_perf_counters.emplace_back(node->get_name().c_str(), 0, 0);
}
if ((std::getenv("NGRAPH_DEX_DEBUG") != nullptr))
......@@ -1419,7 +1433,7 @@ void runtime::cpu::CPU_ExternalFunction::build()
assert(m_op_attrs.size() == functors.size());
executor = [&](CPURuntimeContext* ctx, vector<void*>& inputs, vector<void*>& outputs) {
cpu::Timestamp start_ts;
cpu::Timestamp start_ts, end_ts;
int profiler_count = 0;
if (ctx->first_iteration)
......@@ -1458,30 +1472,48 @@ void runtime::cpu::CPU_ExternalFunction::build()
auto it = enable_nodename_list.begin();
for (const auto& p : enables)
{
auto index = profiler_count++;
tbb::flow::continue_node<tbb::flow::continue_msg, tbb::flow::lightweight>*
flowgraph_node = new tbb::flow::continue_node<tbb::flow::continue_msg,
tbb::flow::lightweight>(
*(ctx->G), [&](const tbb::flow::continue_msg& msg) {
*(ctx->G), [&, index](const tbb::flow::continue_msg& msg) {
if (p(ctx) || ctx->first_iteration)
{
if (runtime::cpu::IsTracingEnabled())
if (runtime::cpu::IsTracingEnabled() || m_emit_timing)
{
start_ts = cpu::Clock::now();
}
(*functor)(ctx);
if (runtime::cpu::IsTracingEnabled())
if (runtime::cpu::IsTracingEnabled() || m_emit_timing)
{
ctx->op_durations[profiler_count++] =
(std::chrono::duration_cast<cpu::Timescale>(
cpu::Clock::now() - start_ts))
.count();
end_ts = cpu::Clock::now();
if (runtime::cpu::IsTracingEnabled())
{
ctx->op_durations[index] =
(std::chrono::duration_cast<cpu::Timescale>(
end_ts - start_ts))
.count();
}
if (m_emit_timing)
{
m_perf_counters[index].m_total_microseconds +=
std::chrono::duration_cast<
std::chrono::microseconds>(end_ts - start_ts)
.count();
m_perf_counters[index].m_call_count++;
}
}
}
else
{
if (runtime::cpu::IsTracingEnabled())
{
ctx->op_durations[profiler_count++] = 0;
ctx->op_durations[index] = 0;
}
if (m_emit_timing)
{
m_perf_counters[index].m_call_count++;
}
}
std::advance(functor, 1);
......@@ -1535,28 +1567,45 @@ void runtime::cpu::CPU_ExternalFunction::build()
{
for (const auto& p : enables)
{
auto index = profiler_count++;
if (p(ctx) || ctx->first_iteration)
{
// Each Op will have exactly one functor, start the clock before the exceution of functor
// and collect the profiler_count once the execution complets
if (runtime::cpu::IsTracingEnabled())
if (runtime::cpu::IsTracingEnabled() || m_emit_timing)
{
start_ts = cpu::Clock::now();
}
(*functor)(ctx);
if (runtime::cpu::IsTracingEnabled())
if (runtime::cpu::IsTracingEnabled() || m_emit_timing)
{
ctx->op_durations[profiler_count++] =
(std::chrono::duration_cast<cpu::Timescale>(cpu::Clock::now() -
start_ts))
.count();
end_ts = cpu::Clock::now();
if (runtime::cpu::IsTracingEnabled())
{
ctx->op_durations[index] =
(std::chrono::duration_cast<cpu::Timescale>(end_ts - start_ts))
.count();
}
if (m_emit_timing)
{
m_perf_counters[index].m_total_microseconds +=
std::chrono::duration_cast<std::chrono::microseconds>(end_ts -
start_ts)
.count();
m_perf_counters[index].m_call_count++;
}
}
}
else
{
if (runtime::cpu::IsTracingEnabled())
{
ctx->op_durations[profiler_count++] = 0;
ctx->op_durations[index] = 0;
}
if (m_emit_timing)
{
m_perf_counters[index].m_call_count++;
}
}
std::advance(functor, 1);
......@@ -1622,6 +1671,49 @@ const runtime::cpu::LayoutDescriptorPtrs&
return result_layout_descriptors;
}
const vector<runtime::PerformanceCounter>& runtime::cpu::CPU_ExternalFunction::get_perf_counters()
{
if (m_direct_execution)
{
return m_perf_counters;
}
#if !defined(NGRAPH_DEX_ONLY)
// Codegen. Retrieve perf counters from compiled module
if (m_execution_engine)
{
auto get_count = m_execution_engine->find_function<size_t()>("get_debug_timer_count");
auto get_name =
m_execution_engine->find_function<const char*(size_t)>("get_debug_timer_name");
auto get_microseconds =
m_execution_engine->find_function<size_t(size_t)>("get_debug_timer_microseconds");
auto get_call_count =
m_execution_engine->find_function<size_t(size_t)>("get_debug_timer_call_count");
if (get_count && get_name && get_microseconds && get_call_count)
{
size_t count = get_count();
if (m_perf_counters.size() == 0)
{
for (size_t i = 0; i < count; i++)
{
m_perf_counters.push_back(
{get_name(i), get_microseconds(i), get_call_count(i)});
}
}
else
{
for (size_t i = 0; i < count; i++)
{
m_perf_counters[i].m_total_microseconds = get_microseconds(i);
m_perf_counters[i].m_call_count = get_call_count(i);
}
}
}
}
return m_perf_counters;
#endif
}
void runtime::cpu::CPU_ExternalFunction::write_to_file(const std::string& code,
const std::string& directory,
const std::string& filename)
......
......@@ -45,6 +45,7 @@
#include "ngraph/runtime/cpu/cpu_layout_descriptor.hpp"
#include "ngraph/runtime/cpu/cpu_tensor_view_wrapper.hpp"
#include "ngraph/runtime/cpu/mkldnn_emitter.hpp"
#include "ngraph/runtime/performance_counter.hpp"
namespace ngraph
{
......@@ -138,6 +139,8 @@ namespace ngraph
const std::string& directory,
const std::string& filename);
const std::vector<PerformanceCounter>& get_perf_counters();
#if defined(NGRAPH_HALIDE)
std::unordered_map<std::string, Halide::Func>& get_halide_functions()
{
......@@ -182,7 +185,7 @@ namespace ngraph
std::string output_name,
bool dex);
bool computes_result(Node* node);
void release_function() { m_function = nullptr; }
#if !defined(NGRAPH_DEX_ONLY)
void emit_debug_function_entry(codegen::CodeWriter& writer,
Node* node,
......@@ -204,22 +207,9 @@ namespace ngraph
std::string emit_op_as_function(const Node&, const std::string& function_name);
std::string strip_comments(const std::string&);
#endif
void release_function() { m_function = nullptr; }
std::shared_ptr<ngraph::Function> m_function;
bool m_release_function;
bool m_use_tbb;
EntryPoint m_compiled_function;
std::unordered_map<std::string, std::string> m_variable_name_map;
#if !defined(NGRAPH_DEX_ONLY)
bool m_is_compiled;
std::unique_ptr<codegen::Compiler> m_compiler;
std::unique_ptr<codegen::ExecutionEngine> m_execution_engine;
bool m_emit_timing;
std::map<std::string, size_t> m_name_index_map;
......@@ -227,8 +217,17 @@ namespace ngraph
// Constant ops we need to keep a list of shared_ptr to each Constant
// so they don't get freed before we are done with them
std::vector<std::shared_ptr<Node>> m_active_constants;
#endif
std::shared_ptr<ngraph::Function> m_function;
bool m_release_function;
bool m_emit_timing;
bool m_use_tbb;
EntryPoint m_compiled_function;
std::unordered_map<std::string, std::string> m_variable_name_map;
std::unordered_map<std::string, CPUTensorRole> m_tensor_roles;
LayoutDescriptorPtrs parameter_layout_descriptors;
......@@ -257,6 +256,7 @@ namespace ngraph
std::unordered_map<std::string, std::shared_ptr<CPU_ExternalFunction>> callees;
bool m_is_built;
bool m_direct_execution;
std::vector<runtime::PerformanceCounter> m_perf_counters;
#if defined(NGRAPH_HALIDE)
std::unordered_map<std::string, Halide::Func> halide_functions;
......
......@@ -34,9 +34,11 @@ namespace ngraph
}
const std::string& name() const { return m_name; }
size_t total_microseconds() const { return m_total_microseconds; }
size_t microseconds() const { return m_total_microseconds / m_call_count; }
size_t microseconds() const
{
return m_call_count == 0 ? 0 : m_total_microseconds / m_call_count;
}
size_t call_count() const { return m_call_count; }
private:
std::string m_name;
size_t m_total_microseconds;
size_t m_call_count;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment