Commit 0b99a7a1 authored by fenglei.tian's avatar fenglei.tian

Merge remote-tracking branch 'origin/master' into tfl/gpu_dot_back

parents bc4aefed 41a883b1
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#pragma once #pragma once
#include <functional>
#include <list> #include <list>
#include <memory> #include <memory>
#include <string> #include <string>
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#pragma once #pragma once
#include <exception> #include <exception>
#include <functional>
#include <sstream> #include <sstream>
#include "ngraph/pass/pass.hpp" #include "ngraph/pass/pass.hpp"
......
...@@ -146,6 +146,7 @@ void runtime::cpu::CPU_CallFrame::setup_runtime_context() ...@@ -146,6 +146,7 @@ void runtime::cpu::CPU_CallFrame::setup_runtime_context()
} }
const auto& mkldnn_emitter = m_external_function->get_mkldnn_emitter(); const auto& mkldnn_emitter = m_external_function->get_mkldnn_emitter();
ctx->mkldnn_primitives = mkldnn_emitter->get_mkldnn_primitives().data(); ctx->mkldnn_primitives = mkldnn_emitter->get_mkldnn_primitives().data();
ctx->mkldnn_workspaces = mkldnn_emitter->get_mkldnn_workspaces().data();
} }
void runtime::cpu::CPU_CallFrame::cleanup_runtime_context() void runtime::cpu::CPU_CallFrame::cleanup_runtime_context()
......
This diff is collapsed.
...@@ -258,7 +258,7 @@ runtime::cpu::CPU_ExternalFunction::CPU_ExternalFunction( ...@@ -258,7 +258,7 @@ runtime::cpu::CPU_ExternalFunction::CPU_ExternalFunction(
const shared_ptr<ngraph::Function>& function, bool release_function) const shared_ptr<ngraph::Function>& function, bool release_function)
: ngraph::runtime::ExternalFunction(function, release_function) : ngraph::runtime::ExternalFunction(function, release_function)
, m_compiled_function(nullptr) , m_compiled_function(nullptr)
, m_emit_timing(std::getenv("NGRAPH_CPU_EMIT_TIMING") != nullptr) , m_emit_timing(false)
, m_use_tbb(std::getenv("NGRAPH_CPU_USE_TBB") != nullptr) , m_use_tbb(std::getenv("NGRAPH_CPU_USE_TBB") != nullptr)
, m_function_name(function->get_name()) , m_function_name(function->get_name())
{ {
...@@ -275,6 +275,8 @@ void runtime::cpu::CPU_ExternalFunction::compile() ...@@ -275,6 +275,8 @@ void runtime::cpu::CPU_ExternalFunction::compile()
return; return;
} }
m_emit_timing = m_timing | (std::getenv("NGRAPH_CPU_EMIT_TIMING") != nullptr);
m_mkldnn_emitter.reset(new MKLDNNEmitter()); m_mkldnn_emitter.reset(new MKLDNNEmitter());
ngraph::pass::Manager pass_manager; ngraph::pass::Manager pass_manager;
...@@ -370,6 +372,7 @@ using namespace ngraph::runtime; ...@@ -370,6 +372,7 @@ using namespace ngraph::runtime;
{ {
writer << "// Declare debug timers\n"; writer << "// Declare debug timers\n";
vector<string> names; vector<string> names;
size_t index = 0;
for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions()) for (shared_ptr<Function> current_function : pass_manager.get_state().get_functions())
{ {
for (shared_ptr<Node> node : current_function->get_ordered_ops()) for (shared_ptr<Node> node : current_function->get_ordered_ops())
...@@ -377,59 +380,43 @@ using namespace ngraph::runtime; ...@@ -377,59 +380,43 @@ using namespace ngraph::runtime;
if (!node->is_parameter() && !node->is_constant()) if (!node->is_parameter() && !node->is_constant())
{ {
names.push_back(node->get_name()); names.push_back(node->get_name());
m_name_index_map.insert({node->get_name(), index++});
} }
} }
} }
for (const string& s : names) writer << "ngraph::stopwatch timers[" << names.size() << "];\n";
{
writer << "ngraph::stopwatch timer_" << s << ";\n";
}
writer << "extern \"C\" size_t get_debug_timer_count() { return " << names.size() writer << "extern \"C\" size_t get_debug_timer_count() { return " << names.size()
<< "; }\n"; << "; }\n";
writer << "extern \"C\" const char* get_debug_timer_name(size_t index)\n"; writer << "extern \"C\" const char* get_debug_timer_name(size_t index)\n";
writer << "{\n"; writer << "{\n";
writer.indent++; writer.indent++;
writer << "const char* rc;\n"; writer << "static const char* timer_names[" << names.size() << "] =\n";
writer << "switch(index)\n";
writer << "{\n"; writer << "{\n";
for (size_t i = 0; i < names.size(); i++) writer.indent++;
vector<string> quoted_names;
for (const string& name : names)
{ {
writer << "case " << i << ": rc = \"" << names[i] << "\"; break;\n"; quoted_names.push_back("\"" + name + "\"");
} }
writer << "default: rc = \"\";\n"; writer << emit_string_array(quoted_names, 100 - (4 * 2 + 1));
writer << "}\n"; writer << "\n};\n";
writer << "return rc;\n"; writer.indent--;
writer << "return timer_names[index];\n";
writer.indent--; writer.indent--;
writer << "}\n"; writer << "}\n";
writer << "extern \"C\" const size_t get_debug_timer_microseconds(size_t index)\n"; writer << "extern \"C\" const size_t get_debug_timer_microseconds(size_t index)\n";
writer << "{\n"; writer << "{\n";
writer.indent++; writer.indent++;
writer << "size_t rc;\n"; writer << "return (index < " << names.size()
writer << "switch(index)\n"; << " ? timers[index].get_total_microseconds() : 0);\n";
writer << "{\n";
for (size_t i = 0; i < names.size(); i++)
{
writer << "case " << i << ": rc = timer_" << names[i]
<< ".get_total_microseconds(); break;\n";
}
writer << "default: rc = 0;\n";
writer << "}\n";
writer << "return rc;\n";
writer.indent--; writer.indent--;
writer << "}\n"; writer << "}\n";
writer << "extern \"C\" const size_t get_debug_timer_call_count(size_t index)\n"; writer << "extern \"C\" const size_t get_debug_timer_call_count(size_t index)\n";
writer << "{\n"; writer << "{\n";
writer.indent++; writer.indent++;
writer << "size_t rc;\n"; writer << "return (index < " << names.size() << " ? timers[index].get_call_count() : 0);\n";
writer << "switch(index)\n";
writer << "{\n";
for (size_t i = 0; i < names.size(); i++)
{
writer << "case " << i << ": rc = timer_" << names[i] << ".get_call_count(); break;\n";
}
writer << "default: rc = 0;\n";
writer << "}\n";
writer << "return rc;\n";
writer.indent--; writer.indent--;
writer << "}\n"; writer << "}\n";
writer << "\n"; writer << "\n";
...@@ -443,15 +430,11 @@ using namespace ngraph::runtime; ...@@ -443,15 +430,11 @@ using namespace ngraph::runtime;
const ngraph::op::Constant* c = dynamic_cast<ngraph::op::Constant*>(node.get()); const ngraph::op::Constant* c = dynamic_cast<ngraph::op::Constant*>(node.get());
if (c) if (c)
{ {
m_active_constants.push_back(node);
shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view(); shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
auto c_value_strings = c->get_value_strings(); string type = tv->get_tensor().get_element_type().c_type_string();
writer << "static " << tv->get_tensor().get_element_type().c_type_string() << " " writer << "static " << type << "* " << tv->get_tensor().get_name() << " = (("
<< tv->get_tensor().get_name() << "[" << c_value_strings.size() << "] =\n"; << type << "*)(" << c->get_data_ptr() << "));\n";
writer << "{\n";
writer.indent++;
writer << emit_string_array(c_value_strings, 100 - writer.indent * 4);
writer.indent--;
writer << "\n};\n\n";
m_variable_name_map[tv->get_tensor().get_name()] = tv->get_tensor().get_name(); m_variable_name_map[tv->get_tensor().get_name()] = tv->get_tensor().get_name();
} }
} }
...@@ -702,10 +685,6 @@ using namespace ngraph::runtime; ...@@ -702,10 +685,6 @@ using namespace ngraph::runtime;
<< "(G, [&](const tbb::flow::continue_msg &msg)\n{\n"; << "(G, [&](const tbb::flow::continue_msg &msg)\n{\n";
writer.indent++; writer.indent++;
} }
if (m_emit_timing)
{
emit_debug_function_entry(writer, node.get(), in, out);
}
if (runtime::cpu::IsTracingEnabled() && if (runtime::cpu::IsTracingEnabled() &&
current_function->get_name() == m_function_name) current_function->get_name() == m_function_name)
{ {
...@@ -713,14 +692,21 @@ using namespace ngraph::runtime; ...@@ -713,14 +692,21 @@ using namespace ngraph::runtime;
} }
} }
if (!node->is_parameter() && !node->is_constant())
{
writer << "\n// " << node->get_name() << "("; writer << "\n// " << node->get_name() << "(";
vector<string> parameter_nodes = node_input_names; vector<string> parameter_nodes = node_input_names;
parameter_nodes.insert( parameter_nodes.insert(
parameter_nodes.end(), node_output_names.begin(), node_output_names.end()); parameter_nodes.end(), node_output_names.begin(), node_output_names.end());
writer << join(parameter_nodes); writer << join(parameter_nodes);
writer << ")\n"; writer << ")\n";
}
// Emit operation body // Emit operation body
if (!node->is_parameter() && !node->is_constant())
{
emit_debug_function_entry(writer, node.get(), in, out);
}
string func_name; string func_name;
auto it = match_functions.find(node.get()); auto it = match_functions.find(node.get());
if (it == match_functions.end()) if (it == match_functions.end())
...@@ -744,11 +730,8 @@ using namespace ngraph::runtime; ...@@ -744,11 +730,8 @@ using namespace ngraph::runtime;
// Emit operation epilogue // Emit operation epilogue
if (!node->is_parameter() && !node->is_constant()) if (!node->is_parameter() && !node->is_constant())
{
if (m_emit_timing)
{ {
emit_debug_function_exit(writer, node.get(), in, out); emit_debug_function_exit(writer, node.get(), in, out);
}
if (runtime::cpu::IsTracingEnabled() && if (runtime::cpu::IsTracingEnabled() &&
current_function->get_name() == m_function_name) current_function->get_name() == m_function_name)
{ {
...@@ -909,7 +892,10 @@ void runtime::cpu::CPU_ExternalFunction::emit_debug_function_entry( ...@@ -909,7 +892,10 @@ void runtime::cpu::CPU_ExternalFunction::emit_debug_function_entry(
const std::vector<TensorViewWrapper>& in, const std::vector<TensorViewWrapper>& in,
const std::vector<TensorViewWrapper>& out) const std::vector<TensorViewWrapper>& out)
{ {
writer << "timer_" << node->get_name() << ".start();\n"; if (m_emit_timing)
{
writer << "timers[" << m_name_index_map[node->get_name()] << "].start();\n";
}
} }
void runtime::cpu::CPU_ExternalFunction::emit_debug_function_exit( void runtime::cpu::CPU_ExternalFunction::emit_debug_function_exit(
...@@ -918,7 +904,10 @@ void runtime::cpu::CPU_ExternalFunction::emit_debug_function_exit( ...@@ -918,7 +904,10 @@ void runtime::cpu::CPU_ExternalFunction::emit_debug_function_exit(
const std::vector<TensorViewWrapper>& in, const std::vector<TensorViewWrapper>& in,
const std::vector<TensorViewWrapper>& out) const std::vector<TensorViewWrapper>& out)
{ {
writer << "timer_" << node->get_name() << ".stop();\n"; if (m_emit_timing)
{
writer << "timers[" << m_name_index_map[node->get_name()] << "].stop();\n";
}
} }
bool runtime::cpu::CPU_ExternalFunction::is_functionally_identical( bool runtime::cpu::CPU_ExternalFunction::is_functionally_identical(
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#pragma once #pragma once
#include <functional> #include <functional>
#include <map>
#include <memory> #include <memory>
#include <string> #include <string>
#include <typeindex> #include <typeindex>
...@@ -119,6 +120,12 @@ namespace ngraph ...@@ -119,6 +120,12 @@ namespace ngraph
bool m_emit_timing; bool m_emit_timing;
bool m_use_tbb; bool m_use_tbb;
std::unordered_map<std::string, std::string> m_variable_name_map; std::unordered_map<std::string, std::string> m_variable_name_map;
std::map<std::string, size_t> m_name_index_map;
// Because we are directly accessing the constant data stored in the
// Constant ops we need to keep a list of shared_ptr to each Constant
// so they don't get freed before we are done with them
std::vector<std::shared_ptr<Node>> m_active_constants;
LayoutDescriptorPtrs parameter_layout_descriptors; LayoutDescriptorPtrs parameter_layout_descriptors;
LayoutDescriptorPtrs result_layout_descriptors; LayoutDescriptorPtrs result_layout_descriptors;
......
...@@ -37,6 +37,7 @@ namespace ngraph ...@@ -37,6 +37,7 @@ namespace ngraph
{ {
int64_t* op_durations; int64_t* op_durations;
mkldnn::primitive* const* mkldnn_primitives; mkldnn::primitive* const* mkldnn_primitives;
char* const* mkldnn_workspaces;
}; };
} }
} }
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include "ngraph/runtime/cpu/cpu_layout_descriptor.hpp" #include "ngraph/runtime/cpu/cpu_layout_descriptor.hpp"
#include "ngraph/runtime/cpu/cpu_tensor_view_wrapper.hpp" #include "ngraph/runtime/cpu/cpu_tensor_view_wrapper.hpp"
#include "ngraph/runtime/cpu/mkldnn_invoke.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp" #include "ngraph/runtime/cpu/mkldnn_utils.hpp"
using namespace ngraph::runtime::cpu; using namespace ngraph::runtime::cpu;
...@@ -36,12 +37,24 @@ const std::vector<mkldnn::primitive*>& MKLDNNEmitter::get_mkldnn_primitives() co ...@@ -36,12 +37,24 @@ const std::vector<mkldnn::primitive*>& MKLDNNEmitter::get_mkldnn_primitives() co
return m_mkldnn_primitives; return m_mkldnn_primitives;
} }
const std::vector<char*>& MKLDNNEmitter::get_mkldnn_workspaces()
{
return m_workspace_bufs;
}
size_t MKLDNNEmitter::insert_primitive(mkldnn::primitive* primitive) size_t MKLDNNEmitter::insert_primitive(mkldnn::primitive* primitive)
{ {
m_mkldnn_primitives.emplace_back(primitive); m_mkldnn_primitives.emplace_back(primitive);
return (m_mkldnn_primitives.size() - 1); return (m_mkldnn_primitives.size() - 1);
} }
size_t MKLDNNEmitter::insert_workspace(std::unique_ptr<MKLDNNWorkspace>& workspace)
{
m_workspace_bufs.push_back(workspace.get()->buf);
m_workspaces.push_back(std::move(workspace));
return (m_workspaces.size() - 1);
}
const std::vector<size_t>& MKLDNNEmitter::get_primitive_deps(size_t index) const const std::vector<size_t>& MKLDNNEmitter::get_primitive_deps(size_t index) const
{ {
return m_primitive_deps.at(index); return m_primitive_deps.at(index);
...@@ -321,6 +334,105 @@ size_t MKLDNNEmitter::build_pooling_forward(mkldnn::algorithm pooling_algorithm, ...@@ -321,6 +334,105 @@ size_t MKLDNNEmitter::build_pooling_forward(mkldnn::algorithm pooling_algorithm,
return primitive_index; return primitive_index;
} }
// Creates an mkldnn::pooling_backward primitive and registers it with this
// emitter.
//
// Memory primitives are built for diff_dst_desc (the primitive's input) and
// diff_src_desc (the primitive's result), in that order. The forward
// descriptor passed alongside the backward one is built from the same
// diff_src/diff_dst descriptors with prop_kind::forward_training.
//
// Returns the index of the new pooling_backward primitive; its dependency
// list is recorded in m_primitive_deps as {input index, result index}.
size_t MKLDNNEmitter::build_pooling_backward(mkldnn::algorithm pooling_algorithm,
                                             const mkldnn::memory::desc& diff_dst_desc,
                                             const mkldnn::memory::desc& diff_src_desc,
                                             const ngraph::Strides& window_strides,
                                             const ngraph::Shape& window_shape,
                                             const ngraph::Shape& padding_below,
                                             const ngraph::Shape& padding_above)
{
    const size_t diff_dst_index = build_memory_primitive(diff_dst_desc);
    const size_t diff_src_index = build_memory_primitive(diff_src_desc);

    // Both descriptors below take the same window geometry, so convert the
    // ngraph containers to mkldnn dims once.
    const mkldnn::memory::dims strides(window_strides.begin(), window_strides.end());
    const mkldnn::memory::dims kernel(window_shape.begin(), window_shape.end());
    const mkldnn::memory::dims pad_below(padding_below.begin(), padding_below.end());
    const mkldnn::memory::dims pad_above(padding_above.begin(), padding_above.end());

    mkldnn::primitive* const pooling_bwd = new mkldnn::pooling_backward(
        {{pooling_algorithm,
          diff_src_desc,
          diff_dst_desc,
          strides,
          kernel,
          pad_below,
          pad_above,
          mkldnn::padding_kind::zero},
         mkldnn_utils::global_cpu_engine,
         {{mkldnn::prop_kind::forward_training,
           pooling_algorithm,
           diff_src_desc,
           diff_dst_desc,
           strides,
           kernel,
           pad_below,
           pad_above,
           mkldnn::padding_kind::zero},
          mkldnn_utils::global_cpu_engine}},
        *m_mkldnn_primitives[diff_dst_index],
        *m_mkldnn_primitives[diff_src_index]);

    const size_t primitive_index = insert_primitive(pooling_bwd);
    m_primitive_deps[primitive_index] = {diff_dst_index, diff_src_index};
    return primitive_index;
}
// Creates the pair of primitives used for max-pool backprop: a
// pooling_forward primitive that fills a workspace, followed by a
// pooling_backward primitive that reads that workspace.
//
// Registration order (indices depend on it): memory primitives for
// fprop_src, diff_dst and diff_src; the workspace memory primitive; the
// workspace buffer; the forward primitive; the backward primitive.
//
// Returns the index of the backward primitive. Dependency lists for both
// primitives are stored in m_primitive_deps; each list ends with the index
// returned by insert_workspace().
size_t MKLDNNEmitter::build_max_pooling_backward(mkldnn::algorithm pooling_algorithm,
                                                 const mkldnn::memory::desc& fprop_src_desc,
                                                 const mkldnn::memory::desc& diff_dst_desc,
                                                 const mkldnn::memory::desc& diff_src_desc,
                                                 const ngraph::Strides& window_strides,
                                                 const ngraph::Shape& window_shape,
                                                 const ngraph::Shape& padding_below,
                                                 const ngraph::Shape& padding_above)
{
    const size_t src_idx = build_memory_primitive(fprop_src_desc);
    const size_t grad_out_idx = build_memory_primitive(diff_dst_desc);
    const size_t grad_in_idx = build_memory_primitive(diff_src_desc);

    // Window geometry shared by the forward and backward descriptors.
    const mkldnn::memory::dims strides(window_strides.begin(), window_strides.end());
    const mkldnn::memory::dims kernel(window_shape.begin(), window_shape.end());
    const mkldnn::memory::dims pad_below(padding_below.begin(), padding_below.end());
    const mkldnn::memory::dims pad_above(padding_above.begin(), padding_above.end());

    mkldnn::pooling_forward::primitive_desc fwd_pd{{mkldnn::prop_kind::forward_training,
                                                    pooling_algorithm,
                                                    diff_src_desc,
                                                    diff_dst_desc,
                                                    strides,
                                                    kernel,
                                                    pad_below,
                                                    pad_above,
                                                    mkldnn::padding_kind::zero},
                                                   mkldnn_utils::global_cpu_engine};

    auto workspace_mem_idx = build_memory_primitive(fwd_pd.workspace_primitive_desc().desc());

    // Allocate workspace
    // TODO (jbobba): Might need to align memory
    std::unique_ptr<MKLDNNWorkspace> workspace(
        new MKLDNNWorkspace(fwd_pd.workspace_primitive_desc().get_size()));
    auto workspace_buf_idx = insert_workspace(workspace);

    // HACK - the forward pass writes its output into the diff_src buffer.
    // Safe since diff_src > fprop_dst
    mkldnn::primitive* const forward = new mkldnn::pooling_forward(
        fwd_pd,
        *m_mkldnn_primitives[src_idx],
        *m_mkldnn_primitives[grad_in_idx],
        *m_mkldnn_primitives[workspace_mem_idx]);
    const size_t fwd_primitive_index = insert_primitive(forward);

    mkldnn::primitive* const backward = new mkldnn::pooling_backward(
        {{pooling_algorithm,
          diff_src_desc,
          diff_dst_desc,
          strides,
          kernel,
          pad_below,
          pad_above,
          mkldnn::padding_kind::zero},
         mkldnn_utils::global_cpu_engine,
         fwd_pd},
        *m_mkldnn_primitives[grad_out_idx],
        *m_mkldnn_primitives[workspace_mem_idx],
        *m_mkldnn_primitives[grad_in_idx]);
    const size_t bwd_primitive_index = insert_primitive(backward);

    m_primitive_deps[fwd_primitive_index] = {
        src_idx, grad_in_idx, workspace_mem_idx, workspace_buf_idx};
    m_primitive_deps[bwd_primitive_index] = {
        grad_out_idx, workspace_mem_idx, grad_in_idx, workspace_buf_idx};

    return bwd_primitive_index;
}
size_t MKLDNNEmitter::build_reorder(const mkldnn::memory::desc& input_desc, size_t MKLDNNEmitter::build_reorder(const mkldnn::memory::desc& input_desc,
const mkldnn::memory::desc& result_desc) const mkldnn::memory::desc& result_desc)
{ {
......
...@@ -35,6 +35,14 @@ namespace ngraph ...@@ -35,6 +35,14 @@ namespace ngraph
class CPU_ExternalFunction; class CPU_ExternalFunction;
class TensorViewWrapper; class TensorViewWrapper;
// Owns a raw byte buffer obtained from malloc() and released with free()
// when the object is destroyed.
class MKLDNNWorkspace
{
public:
    // Requests `size` bytes from malloc(). The result is stored unchecked,
    // so `buf` is null if the allocation failed.
    explicit MKLDNNWorkspace(size_t size)
        : buf(static_cast<char*>(malloc(size)))
    {
    }

    ~MKLDNNWorkspace() { free(buf); }

    // The destructor frees `buf`, so a copied object would free the same
    // pointer a second time. Copying is therefore disabled.
    MKLDNNWorkspace(const MKLDNNWorkspace&) = delete;
    MKLDNNWorkspace& operator=(const MKLDNNWorkspace&) = delete;

    // Start of the allocated buffer (may be null, see constructor).
    char* buf;
};
class MKLDNNEmitter class MKLDNNEmitter
{ {
public: public:
...@@ -42,8 +50,10 @@ namespace ngraph ...@@ -42,8 +50,10 @@ namespace ngraph
~MKLDNNEmitter(); ~MKLDNNEmitter();
const std::vector<mkldnn::primitive*>& get_mkldnn_primitives() const; const std::vector<mkldnn::primitive*>& get_mkldnn_primitives() const;
const std::vector<char*>& get_mkldnn_workspaces();
size_t insert_primitive(mkldnn::primitive* primitive); size_t insert_primitive(mkldnn::primitive* primitive);
size_t insert_workspace(std::unique_ptr<MKLDNNWorkspace>& workspace);
const std::vector<size_t>& get_primitive_deps(size_t index) const; const std::vector<size_t>& get_primitive_deps(size_t index) const;
// TODO(jmenon): Get rid of TensorViewWrappers at some point // TODO(jmenon): Get rid of TensorViewWrappers at some point
...@@ -109,6 +119,23 @@ namespace ngraph ...@@ -109,6 +119,23 @@ namespace ngraph
const ngraph::Shape& padding_below, const ngraph::Shape& padding_below,
const ngraph::Shape& padding_above); const ngraph::Shape& padding_above);
size_t build_pooling_backward(mkldnn::algorithm pooling_algorithm,
const mkldnn::memory::desc& diff_dst_desc,
const mkldnn::memory::desc& diff_src_desc,
const ngraph::Strides& window_strides,
const ngraph::Shape& window_shape,
const ngraph::Shape& padding_below,
const ngraph::Shape& padding_above);
size_t build_max_pooling_backward(mkldnn::algorithm pooling_algorithm,
const mkldnn::memory::desc& fprop_src_desc,
const mkldnn::memory::desc& diff_dst_desc,
const mkldnn::memory::desc& diff_src_desc,
const ngraph::Strides& window_strides,
const ngraph::Shape& window_shape,
const ngraph::Shape& padding_below,
const ngraph::Shape& padding_above);
size_t build_reorder(const mkldnn::memory::desc& input_desc, size_t build_reorder(const mkldnn::memory::desc& input_desc,
const mkldnn::memory::desc& result_desc); const mkldnn::memory::desc& result_desc);
...@@ -129,6 +156,8 @@ namespace ngraph ...@@ -129,6 +156,8 @@ namespace ngraph
std::vector<mkldnn::primitive*> m_mkldnn_primitives; std::vector<mkldnn::primitive*> m_mkldnn_primitives;
std::vector<mkldnn::stream> m_mkldnn_streams; std::vector<mkldnn::stream> m_mkldnn_streams;
std::unordered_map<size_t, std::vector<size_t>> m_primitive_deps; std::unordered_map<size_t, std::vector<size_t>> m_primitive_deps;
std::vector<std::unique_ptr<MKLDNNWorkspace>> m_workspaces;
std::vector<char*> m_workspace_bufs;
}; };
} }
} }
......
...@@ -29,6 +29,7 @@ ...@@ -29,6 +29,7 @@
#include "ngraph/ops/avg_pool.hpp" #include "ngraph/ops/avg_pool.hpp"
#include "ngraph/ops/batch_norm.hpp" #include "ngraph/ops/batch_norm.hpp"
#include "ngraph/ops/convolution.hpp" #include "ngraph/ops/convolution.hpp"
#include "ngraph/ops/max_pool.hpp"
#include "ngraph/ops/relu.hpp" #include "ngraph/ops/relu.hpp"
#include "ngraph/runtime/cpu/cpu_op_annotations.hpp" #include "ngraph/runtime/cpu/cpu_op_annotations.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp" #include "ngraph/runtime/cpu/mkldnn_utils.hpp"
...@@ -245,10 +246,48 @@ namespace ngraph ...@@ -245,10 +246,48 @@ namespace ngraph
} }
} }
// Marks a MaxPool node as an MKLDNN op when input 0 has rank 4, the pooling
// window has 2 dimensions, and input 0 has element type f32. Nodes that do
// not meet all three conditions are left without the annotation.
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::MaxPool)
{
    auto max_pool = static_cast<op::MaxPool*>(node);

    // Only the rank is needed, so read it directly instead of keeping a
    // copy of the shape.
    const auto arg0_rank = node->get_input_shape(0).size();

    if (arg0_rank == 4 && max_pool->get_window_shape().size() == 2 &&
        node->get_input_element_type(0) == element::f32)
    {
        auto op_annotations = std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
        op_annotations->set_mkldnn_op(true);
        max_pool->set_op_annotations(op_annotations);
    }
}
// Marks a MaxPoolBackprop node as an MKLDNN op when input 1 has rank 4, the
// pooling window has 2 dimensions, and input 1 has element type f32. Nodes
// that do not meet all three conditions are left without the annotation.
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::MaxPoolBackprop)
{
    auto max_pool_bprop = static_cast<op::MaxPoolBackprop*>(node);

    // Only the rank is needed, so read it directly instead of keeping a
    // copy of the shape.
    const auto arg1_rank = node->get_input_shape(1).size();

    if (arg1_rank == 4 && max_pool_bprop->get_window_shape().size() == 2 &&
        node->get_input_element_type(1) == element::f32)
    {
        auto op_annotations = std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
        op_annotations->set_mkldnn_op(true);
        max_pool_bprop->set_op_annotations(op_annotations);
    }
}
template <> template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::Relu) void CPUAssignment::ASSIGN_DECL(ngraph::op::Relu)
{ {
auto avg_pool = static_cast<op::Relu*>(node); auto relu = static_cast<op::Relu*>(node);
auto arg0_shape = node->get_input_shape(0); auto arg0_shape = node->get_input_shape(0);
auto arg0_rank = arg0_shape.size(); auto arg0_rank = arg0_shape.size();
...@@ -260,7 +299,7 @@ namespace ngraph ...@@ -260,7 +299,7 @@ namespace ngraph
auto op_annotations = auto op_annotations =
std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>(); std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
op_annotations->set_mkldnn_op(true); op_annotations->set_mkldnn_op(true);
avg_pool->set_op_annotations(op_annotations); relu->set_op_annotations(op_annotations);
} }
} }
...@@ -280,18 +319,19 @@ namespace ngraph ...@@ -280,18 +319,19 @@ namespace ngraph
template <> template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::ReluBackprop) void CPUAssignment::ASSIGN_DECL(ngraph::op::ReluBackprop)
{ {
auto avg_pool = static_cast<op::ReluBackprop*>(node); auto relu_bprop = static_cast<op::ReluBackprop*>(node);
auto arg0_shape = node->get_input_shape(0); auto arg0_shape = node->get_input_shape(0);
auto arg0_rank = arg0_shape.size(); auto arg0_rank = arg0_shape.size();
auto result_shape = node->get_output_shape(0); auto result_shape = node->get_output_shape(0);
if (arg0_rank == 4 && node->get_input_element_type(0) == element::f32) if ((arg0_rank == 4 || arg0_rank == 2) &&
node->get_input_element_type(0) == element::f32)
{ {
auto op_annotations = auto op_annotations =
std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>(); std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
op_annotations->set_mkldnn_op(true); op_annotations->set_mkldnn_op(true);
avg_pool->set_op_annotations(op_annotations); relu_bprop->set_op_annotations(op_annotations);
} }
} }
...@@ -313,6 +353,9 @@ namespace ngraph ...@@ -313,6 +353,9 @@ namespace ngraph
static const runtime::cpu::pass::AssignOpMap s_dispatcher{ static const runtime::cpu::pass::AssignOpMap s_dispatcher{
{TI(ngraph::op::Add), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Add>}, {TI(ngraph::op::Add), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Add>},
{TI(ngraph::op::AvgPool), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::AvgPool>},
{TI(ngraph::op::AvgPoolBackprop),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::AvgPoolBackprop>},
{TI(ngraph::op::BatchNorm), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::BatchNorm>}, {TI(ngraph::op::BatchNorm), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::BatchNorm>},
{TI(ngraph::op::Convolution), {TI(ngraph::op::Convolution),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Convolution>}, &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Convolution>},
...@@ -320,13 +363,13 @@ static const runtime::cpu::pass::AssignOpMap s_dispatcher{ ...@@ -320,13 +363,13 @@ static const runtime::cpu::pass::AssignOpMap s_dispatcher{
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::ConvolutionBackpropData>}, &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::ConvolutionBackpropData>},
{TI(ngraph::op::ConvolutionBackpropFilters), {TI(ngraph::op::ConvolutionBackpropFilters),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::ConvolutionBackpropFilters>}, &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::ConvolutionBackpropFilters>},
{TI(ngraph::op::MaxPool), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::MaxPool>},
{TI(ngraph::op::MaxPoolBackprop),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::MaxPoolBackprop>},
{TI(ngraph::op::ConvolutionBias), {TI(ngraph::op::ConvolutionBias),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::ConvolutionBias>}, &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::ConvolutionBias>},
{TI(ngraph::op::ConvolutionBiasBackpropFiltersBias), {TI(ngraph::op::ConvolutionBiasBackpropFiltersBias),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::ConvolutionBiasBackpropFiltersBias>}, &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::ConvolutionBiasBackpropFiltersBias>},
{TI(ngraph::op::AvgPool), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::AvgPool>},
{TI(ngraph::op::AvgPoolBackprop),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::AvgPoolBackprop>},
{TI(ngraph::op::Relu), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Relu>}, {TI(ngraph::op::Relu), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Relu>},
{TI(ngraph::op::ReluBackprop), {TI(ngraph::op::ReluBackprop),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::ReluBackprop>}, &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::ReluBackprop>},
......
This diff is collapsed.
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
* limitations under the License. * limitations under the License.
*******************************************************************************/ *******************************************************************************/
#include <functional>
#include <memory> #include <memory>
#include <typeindex> #include <typeindex>
#include <typeinfo> #include <typeinfo>
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include <memory> #include <memory>
#include "ngraph/function.hpp" #include "ngraph/function.hpp"
#include "ngraph/log.hpp"
namespace ngraph namespace ngraph
{ {
...@@ -34,6 +35,7 @@ namespace ngraph ...@@ -34,6 +35,7 @@ namespace ngraph
: m_function(function) : m_function(function)
, m_release_function(release_function) , m_release_function(release_function)
, m_is_compiled(false) , m_is_compiled(false)
, m_timing(false)
{ {
} }
...@@ -42,12 +44,13 @@ namespace ngraph ...@@ -42,12 +44,13 @@ namespace ngraph
public: public:
virtual ~ExternalFunction() {} virtual ~ExternalFunction() {}
virtual std::shared_ptr<CallFrame> make_call_frame() = 0; virtual std::shared_ptr<CallFrame> make_call_frame() = 0;
void set_emit_timing(bool enable) { m_timing = enable; }
const std::shared_ptr<ngraph::Function> get_function() { return m_function; } const std::shared_ptr<ngraph::Function> get_function() { return m_function; }
protected: protected:
std::shared_ptr<ngraph::Function> m_function; std::shared_ptr<ngraph::Function> m_function;
bool m_release_function; bool m_release_function;
bool m_is_compiled; bool m_is_compiled;
bool m_timing;
}; };
} }
} }
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
*******************************************************************************/ *******************************************************************************/
#include <dlfcn.h> #include <dlfcn.h>
#include <functional>
#include <iostream> #include <iostream>
#include <sstream> #include <sstream>
#include <string> #include <string>
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#pragma once #pragma once
#include <functional>
#include <map> #include <map>
#include <memory> #include <memory>
#include <string> #include <string>
......
...@@ -21,34 +21,42 @@ ...@@ -21,34 +21,42 @@
// sample models are under ../../test/models // sample models are under ../../test/models
#include <fstream> #include <fstream>
#include <ngraph/file_util.hpp>
#include <ngraph/runtime/backend.hpp> #include <ngraph/runtime/backend.hpp>
#include <ngraph/runtime/call_frame.hpp> #include <ngraph/runtime/call_frame.hpp>
#include <ngraph/runtime/manager.hpp> #include <ngraph/runtime/manager.hpp>
#include <ngraph/util.hpp>
#include "util/benchmark.hpp" #include "util/benchmark.hpp"
#include "util/test_tools.hpp" #include "util/test_tools.hpp"
using namespace std; using namespace std;
using namespace ngraph;
int main(int argc, char** argv) int main(int argc, char** argv)
{ {
string model = "model.json"; string model;
string backend = "INTERPRETER"; string backend = "CPU";
int iter = 10; int iterations = 10;
bool failed = false; bool failed = false;
bool statistics = false;
bool timing_detail = false;
for (size_t i = 1; i < argc; i++) for (size_t i = 1; i < argc; i++)
{ {
if (string(argv[i]) == "-f") string arg = argv[i];
if (arg == "-f" || arg == "--file")
{ {
model = argv[++i]; model = argv[++i];
} }
else if (string(argv[i]) == "-b") else if (arg == "-b" || arg == "--backend")
{ {
backend = argv[++i]; backend = argv[++i];
} }
else if (string(argv[i]) == "-i") else if (arg == "-i" || arg == "--iterations")
{ {
try try
{ {
iter = stoi(argv[++i]); iterations = stoi(argv[++i]);
} }
catch (...) catch (...)
{ {
...@@ -56,6 +64,19 @@ int main(int argc, char** argv) ...@@ -56,6 +64,19 @@ int main(int argc, char** argv)
failed = true; failed = true;
} }
} }
else if (arg == "-s" || arg == "--statistics")
{
statistics = true;
}
else if (arg == "--timing_detail")
{
timing_detail = true;
}
else
{
cout << "Unknown option: " << arg << endl;
failed = true;
}
} }
if (!static_cast<bool>(ifstream(model))) if (!static_cast<bool>(ifstream(model)))
{ {
...@@ -73,12 +94,58 @@ SYNOPSIS ...@@ -73,12 +94,58 @@ SYNOPSIS
nbench [-f <filename>] [-b <backend>] [-i <iterations>] nbench [-f <filename>] [-b <backend>] [-i <iterations>]
OPTIONS OPTIONS
-f model json file to use (default: model.json) -f|--file Serialized model file
-b Backend to use (default: INTERPRETER) -b|--backend Backend to use (default: CPU)
-i Iterations (default: 10) -i|--iterations Iterations (default: 10)
-s|--statistics Display op stastics
--timing_detail Gather detailed timing
)###"; )###";
return 1; return 1;
} }
cout << "Benchmarking " << model << ", " << backend << " backend, " << iter << " iterations.\n";
run_benchmark(model, backend, iter); const string json_string = file_util::read_file_to_string(model);
stringstream ss(json_string);
shared_ptr<Function> f = deserialize(ss);
if (statistics)
{
cout << "statistics:" << endl;
cout << "total nodes: " << f->get_ops().size() << endl;
size_t total_constant_bytes = 0;
unordered_map<string, size_t> op_list;
for (shared_ptr<Node> node : f->get_ordered_ops())
{
string name = node->get_name();
string op_name = name.substr(0, name.find('_'));
string shape_name = "{" + join(node->get_outputs()[0].get_shape()) + "}";
op_list[op_name + shape_name]++;
if (op_name == "Constant")
{
const Shape& shape = node->get_outputs()[0].get_shape();
size_t const_size = node->get_outputs()[0].get_element_type().size();
if (shape.size() == 0)
{
total_constant_bytes += const_size;
}
else
{
total_constant_bytes +=
(const_size * shape_size(node->get_outputs()[0].get_shape()));
}
}
}
cout << "Total Constant size: " << total_constant_bytes << " bytes\n";
for (const pair<string, size_t>& op_info : op_list)
{
cout << op_info.first << ": " << op_info.second << " ops" << endl;
}
}
else if (iterations > 0)
{
cout << "Benchmarking " << model << ", " << backend << " backend, " << iterations
<< " iterations.\n";
run_benchmark(f, backend, iterations, timing_detail);
}
return 0;
} }
...@@ -5370,7 +5370,6 @@ TEST(${BACKEND_NAME}, numeric_double_inf) ...@@ -5370,7 +5370,6 @@ TEST(${BACKEND_NAME}, numeric_double_inf)
TEST(${BACKEND_NAME}, abc_tbb) TEST(${BACKEND_NAME}, abc_tbb)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
ONLY_ENABLE_TEST_FOR("CPU", "${BACKEND_NAME}"); ONLY_ENABLE_TEST_FOR("CPU", "${BACKEND_NAME}");
// Force TBB flow graph generation in the CPU backend // Force TBB flow graph generation in the CPU backend
......
...@@ -109,14 +109,10 @@ TEST(cpu_fusion, gemm_cpu_broadcast_row) ...@@ -109,14 +109,10 @@ TEST(cpu_fusion, gemm_cpu_broadcast_row)
auto A = make_shared<op::Parameter>(element::f32, shapeA); auto A = make_shared<op::Parameter>(element::f32, shapeA);
auto B = make_shared<op::Parameter>(element::f32, shapeB); auto B = make_shared<op::Parameter>(element::f32, shapeB);
auto reshape_w = make_shared<op::Reshape>(A, AxisVector{1, 0}, Shape{2, 3}); auto bias = op::Constant::create<float>(element::f32, Shape{2}, std::vector<float>{2.0f, 3.0f});
auto reshape_x = make_shared<op::Reshape>(B, AxisVector{1, 0}, Shape{3, 2});
auto one = op::Constant::create<float>(element::f32, Shape{2}, std::vector<float>{1.0f, 1.0f});
auto broadcast = make_shared<op::Broadcast>(one, shapeC, AxisSet{0});
auto cg = make_shared<op::MatmulBias>( auto cg = make_shared<op::MatmulBias>(
A, B, one, A->get_shape(), B->get_shape(), true, true, AxisSet{0}); A, B, bias, A->get_shape(), B->get_shape(), true, true, AxisSet{0});
auto f = make_shared<Function>(cg, op::ParameterVector{A, B}); auto f = make_shared<Function>(cg, op::ParameterVector{A, B});
...@@ -136,8 +132,8 @@ TEST(cpu_fusion, gemm_cpu_broadcast_row) ...@@ -136,8 +132,8 @@ TEST(cpu_fusion, gemm_cpu_broadcast_row)
copy_data(b, dataB); copy_data(b, dataB);
cf->call({a, b}, {result}); cf->call({a, b}, {result});
vector<float> expected{10, 28, 37, 109}; vector<float> expected{11, 30, 38, 111};
ASSERT_TRUE(read_vector<float>(result) == expected); EXPECT_EQ(read_vector<float>(result), expected);
} }
TEST(cpu_fusion, gemm_cpu_broadcast_column) TEST(cpu_fusion, gemm_cpu_broadcast_column)
...@@ -148,14 +144,10 @@ TEST(cpu_fusion, gemm_cpu_broadcast_column) ...@@ -148,14 +144,10 @@ TEST(cpu_fusion, gemm_cpu_broadcast_column)
auto A = make_shared<op::Parameter>(element::f32, shapeA); auto A = make_shared<op::Parameter>(element::f32, shapeA);
auto B = make_shared<op::Parameter>(element::f32, shapeB); auto B = make_shared<op::Parameter>(element::f32, shapeB);
auto reshape_w = make_shared<op::Reshape>(A, AxisVector{1, 0}, Shape{2, 3}); auto bias = op::Constant::create<float>(element::f32, Shape{2}, std::vector<float>{2.0f, 3.0f});
auto reshape_x = make_shared<op::Reshape>(B, AxisVector{1, 0}, Shape{3, 2});
auto one = op::Constant::create<float>(element::f32, Shape{2}, std::vector<float>{1.0f, 1.0f});
auto broadcast = make_shared<op::Broadcast>(one, shapeC, AxisSet{1});
auto cg = make_shared<op::MatmulBias>( auto cg = make_shared<op::MatmulBias>(
A, B, one, A->get_shape(), B->get_shape(), true, true, AxisSet{1}); A, B, bias, A->get_shape(), B->get_shape(), true, true, AxisSet{1});
auto f = make_shared<Function>(cg, op::ParameterVector{A, B}); auto f = make_shared<Function>(cg, op::ParameterVector{A, B});
...@@ -175,8 +167,8 @@ TEST(cpu_fusion, gemm_cpu_broadcast_column) ...@@ -175,8 +167,8 @@ TEST(cpu_fusion, gemm_cpu_broadcast_column)
copy_data(b, dataB); copy_data(b, dataB);
cf->call({a, b}, {result}); cf->call({a, b}, {result});
vector<float> expected{10, 28, 37, 109}; vector<float> expected{11, 29, 39, 111};
ASSERT_TRUE(read_vector<float>(result) == expected); EXPECT_EQ(read_vector<float>(result), expected);
} }
TEST(cpu_fusion, gemm_cpu_broadcast_matrix) TEST(cpu_fusion, gemm_cpu_broadcast_matrix)
......
...@@ -17,56 +17,127 @@ ...@@ -17,56 +17,127 @@
#include <iomanip> #include <iomanip>
#include "benchmark.hpp" #include "benchmark.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/runtime/backend.hpp" #include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/call_frame.hpp" #include "ngraph/runtime/call_frame.hpp"
#include "ngraph/runtime/external_function.hpp"
#include "ngraph/runtime/manager.hpp" #include "ngraph/runtime/manager.hpp"
#include "ngraph/runtime/tensor_view.hpp" #include "ngraph/runtime/tensor_view.hpp"
#include "ngraph/serializer.hpp" #include "ngraph/serializer.hpp"
#include "ngraph/util.hpp" #include "ngraph/util.hpp"
#include "random.hpp" #include "random.hpp"
std::multimap<size_t, std::string> using namespace std;
aggregate_timing(const std::vector<ngraph::runtime::PerformanceCounter>& perf_data) using namespace ngraph;
shared_ptr<Node> find_node(const string& name, shared_ptr<Function> func)
{ {
std::unordered_map<std::string, size_t> timing; static unordered_map<string, shared_ptr<Node>> node_map;
for (const ngraph::runtime::PerformanceCounter& p : perf_data) if (node_map.empty())
{ {
std::string op = p.name().substr(0, p.name().find('_')); vector<shared_ptr<Function>> fs;
timing[op] += p.microseconds(); traverse_functions(func, [&](shared_ptr<Function> f) { fs.push_back(f); });
for (shared_ptr<Function> f : fs)
{
for (shared_ptr<Node> node : f->get_ops())
{
node_map.insert({node->get_name(), node});
}
} }
}
return node_map[name];
}
std::multimap<size_t, std::string> rc; multimap<size_t, string>
for (const std::pair<std::string, size_t>& t : timing) aggregate_timing_details(const vector<runtime::PerformanceCounter>& perf_data,
shared_ptr<Function> f)
{
unordered_map<string, size_t> timing;
for (const runtime::PerformanceCounter& p : perf_data)
{
shared_ptr<Node> node = find_node(p.name(), f);
string op = p.name().substr(0, p.name().find('_'));
string shape_name = "{" + join(node->get_outputs()[0].get_shape()) + "}";
timing[op + shape_name] += p.microseconds();
}
multimap<size_t, string> rc;
for (const pair<string, size_t>& t : timing)
{ {
rc.insert({t.second, t.first}); rc.insert({t.second, t.first});
} }
return rc; return rc;
} }
void run_benchmark(const std::string& json_path, const std::string& backend_name, size_t iterations) multimap<size_t, string> aggregate_timing(const vector<runtime::PerformanceCounter>& perf_data)
{ {
using namespace std; unordered_map<string, size_t> timing;
using namespace ngraph; for (const runtime::PerformanceCounter& p : perf_data)
string env_var_name = "NGRAPH_" + backend_name + "_EMIT_TIMING";
bool emit_timing = (std::getenv(env_var_name.c_str()) != nullptr);
if (!emit_timing)
{ {
cout << "To get per-op timing set the environment variable " << env_var_name << "\n"; string op = p.name().substr(0, p.name().find('_'));
timing[op] += p.microseconds();
} }
ngraph::test::Uniform<float> rng{-1, 1, 0}; multimap<size_t, string> rc;
for (const pair<string, size_t>& t : timing)
{
rc.insert({t.second, t.first});
}
return rc;
}
void run_benchmark(const string& json_path,
const string& backend_name,
size_t iterations,
bool timing_detail)
{
stopwatch timer;
timer.start();
const string json_string = file_util::read_file_to_string(json_path); const string json_string = file_util::read_file_to_string(json_path);
stringstream ss(json_string); stringstream ss(json_string);
shared_ptr<Function> f = deserialize(ss); shared_ptr<Function> f = deserialize(ss);
timer.stop();
cout << "deserialize time: " << timer.get_milliseconds() << "ms" << endl;
run_benchmark(f, backend_name, iterations, timing_detail);
}
/// \brief Print one line per timing entry to std::cout, largest time first.
///
/// Each line holds the entry's label left-aligned, then its time in
/// microseconds right-aligned and suffixed with "us". Both columns are sized
/// to the widest entry in \p timing. Nothing is printed for an empty map.
///
/// \param timing Map from elapsed microseconds to a label.
void print_times(const std::multimap<size_t, std::string>& timing)
{
    // Measure numbers with the locale std::cout will actually format them with,
    // so digit-grouping separators are counted exactly when they are printed.
    // This also avoids constructing locale("") here, which can throw when the
    // environment names an unsupported locale.
    std::stringstream formatted;
    formatted.imbue(std::cout.getloc());

    // set the column widths
    int name_width = 0;
    int time_width = 0;
    // const auto& binds to the map's pair<const size_t, string> directly; a
    // pair<size_t, string> reference would copy every entry (string included).
    for (const auto& entry : timing)
    {
        name_width = std::max(name_width, static_cast<int>(entry.second.size()));
        formatted.str("");
        formatted << entry.first;
        time_width = std::max(time_width, static_cast<int>(formatted.str().size()));
    }

    // The multimap is ordered by time, so reverse iteration prints slowest first.
    for (auto it = timing.rbegin(); it != timing.rend(); ++it)
    {
        std::cout << std::setw(name_width + 2) << std::left << it->second << " "
                  << std::setw(time_width + 2) << std::right << it->first << "us\n";
    }
}
stopwatch build_time; void run_benchmark(shared_ptr<Function> f,
build_time.start(); const string& backend_name,
size_t iterations,
bool timing_detail)
{
test::Uniform<float> rng{-1, 1, 0};
stopwatch timer;
timer.start();
auto manager = runtime::Manager::get(backend_name); auto manager = runtime::Manager::get(backend_name);
auto external = manager->compile(f); auto external = manager->compile(f);
external->set_emit_timing(timing_detail);
auto backend = manager->allocate_backend(); auto backend = manager->allocate_backend();
auto cf = backend->make_call_frame(external); auto cf = backend->make_call_frame(external);
build_time.stop(); timer.stop();
cout << "build_time " << build_time.get_milliseconds() << "ms" << endl; cout.imbue(locale(""));
cout << "compile time: " << timer.get_milliseconds() << "ms" << endl;
vector<shared_ptr<runtime::TensorView>> args; vector<shared_ptr<runtime::TensorView>> args;
for (shared_ptr<op::Parameter> param : f->get_parameters()) for (shared_ptr<op::Parameter> param : f->get_parameters())
...@@ -100,9 +171,11 @@ void run_benchmark(const std::string& json_path, const std::string& backend_name ...@@ -100,9 +171,11 @@ void run_benchmark(const std::string& json_path, const std::string& backend_name
return p1.total_microseconds() > p2.total_microseconds(); return p1.total_microseconds() > p2.total_microseconds();
}); });
multimap<size_t, string> timing = aggregate_timing(perf_data); multimap<size_t, string> timing = aggregate_timing(perf_data);
for (auto it = timing.rbegin(); it != timing.rend(); it++) multimap<size_t, string> timing_details = aggregate_timing_details(perf_data, f);
{
cout.imbue(locale("")); cout << "\n---- Aggregate times per op type ----\n";
cout << setw(15) << left << it->second << " " << setw(10) << right << it->first << "us\n"; print_times(timing);
}
cout << "\n---- Aggregate times per op type/shape ----\n";
print_times(timing_details);
} }
...@@ -18,13 +18,21 @@ ...@@ -18,13 +18,21 @@
#include <map> #include <map>
#include <ngraph/function.hpp>
#include <ngraph/runtime/call_frame.hpp> #include <ngraph/runtime/call_frame.hpp>
#include "test_tools.hpp" #include "test_tools.hpp"
/// performance test utilities /// performance test utilities
std::multimap<size_t, std::string> std::multimap<size_t, std::string>
aggregate_timing(const std::vector<ngraph::runtime::PerformanceCounter>& perf_data); aggregate_timing(const std::vector<ngraph::runtime::PerformanceCounter>& perf_data);
void run_benchmark(std::shared_ptr<ngraph::Function> f,
const std::string& backend_name,
size_t iterations,
bool timing_detail);
void run_benchmark(const std::string& json_path, void run_benchmark(const std::string& json_path,
const std::string& backend_name, const std::string& backend_name,
size_t iterations); size_t iterations,
bool timing_detail = false);
...@@ -36,11 +36,8 @@ ...@@ -36,11 +36,8 @@
#define ONLY_ENABLE_TEST_FOR(backend_to_enable, current_backend) \ #define ONLY_ENABLE_TEST_FOR(backend_to_enable, current_backend) \
if (backend_to_enable != current_backend) \ if (backend_to_enable != current_backend) \
{ \ { \
NGRAPH_INFO << "Skipped test for " << current_backend; \
return; \ return; \
} \
else \
{ \
NGRAPH_INFO << "Enabled test for " << current_backend; \
} }
namespace ngraph namespace ngraph
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment