Unverified commit d34fb157 authored by Robert Kimball, committed by GitHub

Merge pull request #3284 from NervanaSystems/bob/nbench_db

Add nbench support for double buffering
parents a509de7b 47626835
@@ -463,6 +463,8 @@ set (SRC
runtime/backend.hpp
runtime/backend_manager.cpp
runtime/backend_manager.hpp
runtime/chrome_trace.cpp
runtime/chrome_trace.hpp
runtime/executable.cpp
runtime/executable.hpp
runtime/host_tensor.cpp
......
//*****************************************************************************
// Copyright 2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <cstdlib>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include "chrome_trace.hpp"
#include "ngraph/log.hpp"
using namespace std;
using namespace ngraph;
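// Tracing is enabled once at startup if the NGRAPH_ENABLE_TRACING environment variable is set;
// Manager::enable_event_tracing()/disable_event_tracing() can toggle it afterwards.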
static bool read_tracing_env_var()
{
static const bool is_enabled = (getenv("NGRAPH_ENABLE_TRACING") != nullptr);
return is_enabled;
}
mutex runtime::event::Manager::s_file_mutex;
bool runtime::event::Manager::s_tracing_enabled = read_tracing_env_var();
runtime::event::Duration::Duration(const string& name, const string& category, const string& args)
{
if (Manager::is_tracing_enabled())
{
m_start = Manager::get_current_microseconds();
m_stop = 0;
m_name = name;
m_category = category;
m_args = args;
}
}
void runtime::event::Duration::stop()
{
if (Manager::is_tracing_enabled())
{
m_stop = Manager::get_current_microseconds();
}
}
void runtime::event::Duration::write()
{
if (Manager::is_tracing_enabled())
{
size_t stop_time = (m_stop != 0 ? m_stop : Manager::get_current_microseconds());
lock_guard<mutex> lock(Manager::get_mutex());
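// The trace file is opened lazily on the first event; subsequent records are prefixed with a
// comma so the file forms a single JSON array, closed by Manager::close().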
ofstream& out = runtime::event::Manager::get_output_stream();
if (out.is_open() == false)
{
runtime::event::Manager::open();
}
else
{
Manager::get_output_stream() << ",\n";
}
Manager::get_output_stream() <<
R"({"name":")" << m_name << R"(","cat":")" << m_category << R"(","ph":"X","pid":)"
<< Manager::get_process_id() << R"(,"tid":)"
<< Manager::get_thread_id() <<
R"(,"ts":)" << m_start << R"(,"dur":)" << (stop_time - m_start);
if (!m_args.empty())
{
out <<
R"(,"args":)" << m_args;
}
out << "}";
}
}
runtime::event::Object::Object(const string& name, const string& args)
: m_name{name}
, m_id{static_cast<size_t>(chrono::high_resolution_clock::now().time_since_epoch().count())}
{
if (Manager::is_tracing_enabled())
{
lock_guard<mutex> lock(Manager::get_mutex());
ofstream& out = runtime::event::Manager::get_output_stream();
if (out.is_open() == false)
{
runtime::event::Manager::open();
}
else
{
Manager::get_output_stream() << ",\n";
}
out << R"({"name":")" << m_name << R"(","ph":"N","id":")" << m_id <<
R"(","ts":)" << Manager::get_current_microseconds() <<
R"(,"pid":)" << Manager::get_process_id() << R"(,"tid":)" << Manager::get_thread_id();
if (!args.empty())
{
out <<
R"(,"args":)" << args;
}
out << "}";
write_snapshot(out, args);
}
}
void runtime::event::Object::snapshot(const string& args)
{
if (Manager::is_tracing_enabled())
{
lock_guard<mutex> lock(Manager::get_mutex());
ofstream& out = runtime::event::Manager::get_output_stream();
if (out.is_open() == false)
{
runtime::event::Manager::open();
}
else
{
Manager::get_output_stream() << ",\n";
}
write_snapshot(out, args);
}
}
void runtime::event::Object::write_snapshot(ostream& out, const string& args)
{
out << R"({"name":")" << m_name << R"(","ph":"O","id":")" << m_id <<
R"(","ts":)" << Manager::get_current_microseconds() <<
R"(,"pid":)" << Manager::get_process_id() << R"(,"tid":)" << Manager::get_thread_id();
if (!args.empty())
{
out <<
R"(,"args":)" << args;
}
out << "}";
}
void runtime::event::Object::destroy()
{
if (Manager::is_tracing_enabled())
{
lock_guard<mutex> lock(Manager::get_mutex());
ofstream& out = runtime::event::Manager::get_output_stream();
if (out.is_open() == false)
{
runtime::event::Manager::open();
}
else
{
Manager::get_output_stream() << ",\n";
}
out << R"({"name":")" << m_name << R"(","ph":"D","id":")" << m_id <<
R"(","ts":)" << Manager::get_current_microseconds() <<
R"(,"pid":)" << Manager::get_process_id() << R"(,"tid":)" << Manager::get_thread_id()
<< "}";
}
}
void runtime::event::Manager::open(const string& path)
{
ofstream& out = get_output_stream();
if (out.is_open() == false)
{
out.open(path, ios_base::trunc);
out << "[\n";
}
}
void runtime::event::Manager::close()
{
ofstream& out = get_output_stream();
if (out.is_open())
{
out << "\n]\n";
out.close();
}
}
ofstream& runtime::event::Manager::get_output_stream()
{
static ofstream s_event_log;
return s_event_log;
}
const string& runtime::event::Manager::get_process_id()
{
static const string s_pid = to_string(getpid());
return s_pid;
}
void runtime::event::Manager::enable_event_tracing()
{
s_tracing_enabled = true;
}
void runtime::event::Manager::disable_event_tracing()
{
s_tracing_enabled = false;
}
bool runtime::event::Manager::is_event_tracing_enabled()
{
return s_tracing_enabled;
}
string runtime::event::Manager::get_thread_id()
{
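// Cache the string form of each thread id (already quoted) so it can be streamed directly
// into the JSON record.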
thread::id tid = this_thread::get_id();
static map<thread::id, string> tid_map;
auto it = tid_map.find(tid);
string rc;
if (it == tid_map.end())
{
stringstream ss;
ss << "\"" << tid << "\"";
rc = ss.str();
tid_map.insert({tid, rc});
}
else
{
rc = it->second;
}
return rc;
}
//*****************************************************************************
// Copyright 2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <chrono>
#include <fstream>
#include <iostream>
#include <mutex>
#include <string>
#include <thread>
#ifdef _WIN32
#include <windows.h>
// windows.h must be included before processthreadsapi.h; this comment keeps automatic
// include reordering from swapping them
#include <processthreadsapi.h>
#define getpid() GetCurrentProcessId()
#else
#include <unistd.h>
#endif
namespace ngraph
{
namespace runtime
{
namespace event
{
class Duration;
class Object;
class Manager;
}
}
}
//
// This class records timestamps for a given user-defined event and
// produces output in the Chrome tracing format, which can be used to view
// the events of a running program.
//
// Following is the format of a trace event
//
// {
// "name": "myName",
// "cat": "category,list",
// "ph": "B",
// "ts": 12345,
// "pid": 123,
// "tid": 456,
// "args": {
// "someArg": 1,
// "anotherArg": {
// "value": "my value"
// }
// }
// }
//
// The trace file format is defined here:
// https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview
//
// The trace file can be viewed in the Chrome browser by navigating to
// chrome://tracing/
//
// More information about this is at:
// http://dev.chromium.org/developers/how-tos/trace-event-profiling-tool
class ngraph::runtime::event::Manager
{
friend class Duration;
friend class Object;
public:
static void open(const std::string& path = "runtime_event_trace.json");
static void close();
static bool is_tracing_enabled() { return s_tracing_enabled; }
static void enable_event_tracing();
static void disable_event_tracing();
static bool is_event_tracing_enabled();
private:
static std::ofstream& get_output_stream();
static const std::string& get_process_id();
static size_t get_current_microseconds()
{
return std::chrono::high_resolution_clock::now().time_since_epoch().count() / 1000;
}
static std::string get_thread_id();
static std::mutex& get_mutex() { return s_file_mutex; }
static std::ostream s_ostream;
static std::mutex s_file_mutex;
static bool s_tracing_enabled;
};
class ngraph::runtime::event::Duration
{
public:
explicit Duration(const std::string& name,
const std::string& category,
const std::string& args = "");
~Duration() { write(); }
/// \brief Stop the timer without writing the data to the log file. To write the data,
/// call the `write` method.
/// Calls to stop() are optional.
void stop();
/// \brief Write the log data to the log file for this event.
/// This function implicitly calls stop() if stop() has not been previously called.
void write();
Duration(const Duration&) = delete;
Duration& operator=(Duration const&) = delete;
private:
std::string to_json() const;
size_t m_start;
size_t m_stop;
std::string m_name;
std::string m_category;
std::string m_args;
};
class ngraph::runtime::event::Object
{
public:
Object(const std::string& name, const std::string& args);
void snapshot(const std::string& args);
void destroy();
private:
void write_snapshot(std::ostream& out, const std::string& args);
const std::string m_name;
size_t m_id;
};
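// Usage sketch (illustrative only, not part of this change; the function and argument
// strings below are made up). A Duration emits a complete ("ph":"X") event covering a
// scope, and an Object tracks a long-lived entity with created/snapshot/destroyed events:
//
//     void do_work()
//     {
//         ngraph::runtime::event::Duration timer("do_work", "Example");
//         ngraph::runtime::event::Object obj("scratch_buffer", R"({"bytes":1024})");
//         // ... work ...
//         obj.snapshot(R"({"state":"in_use"})");
//         obj.destroy();
//     } // `timer` writes its event here; call Manager::close() at exit to finalize the file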
@@ -18,6 +18,7 @@
#include <memory>
#include "ngraph/descriptor/layout/dense_tensor_layout.hpp"
#include "ngraph/runtime/chrome_trace.hpp"
#include "ngraph/runtime/host_tensor.hpp"
#include "ngraph/util.hpp"
@@ -96,6 +97,8 @@ const char* runtime::HostTensor::get_data_ptr() const
void runtime::HostTensor::write(const void* source, size_t n)
{
runtime::event::Duration d1("write", "HostTensor");
if (n > m_buffer_size)
{
throw out_of_range("write access past end of tensor");
@@ -106,6 +109,7 @@ void runtime::HostTensor::write(const void* source, size_t n)
void runtime::HostTensor::read(void* target, size_t n) const
{
runtime::event::Duration d1("read", "HostTensor");
if (n > m_buffer_size)
{
throw out_of_range("read access past end of tensor");
......
@@ -30,6 +30,7 @@
#include "ngraph/pass/manager.hpp"
#include "ngraph/pass/memory_layout.hpp"
#include "ngraph/runtime/backend_manager.hpp"
#include "ngraph/runtime/chrome_trace.hpp"
#include "ngraph/serializer.hpp"
#include "ngraph/util.hpp"
@@ -74,6 +75,8 @@ runtime::interpreter::INTExecutable::INTExecutable(const std::string& model_stri
bool runtime::interpreter::INTExecutable::call(const vector<shared_ptr<runtime::Tensor>>& outputs,
const vector<shared_ptr<runtime::Tensor>>& inputs)
{
runtime::event::Duration d1("call", "Interpreter");
// convert inputs to HostTensor
vector<shared_ptr<HostTensor>> func_inputs;
for (auto tensor : inputs)
@@ -122,6 +125,7 @@ bool runtime::interpreter::INTExecutable::call(const vector<shared_ptr<runtime::
for (const NodeWrapper& wrapped : m_wrapped_nodes)
{
auto op = wrapped.get_node();
runtime::event::Duration d2(op->description(), "Interpreter");
auto type_id = wrapped.get_typeid();
if (type_id == OP_TYPEID::Parameter)
{
@@ -304,3 +308,74 @@ void runtime::interpreter::INTExecutable::save(ostream& out)
string model = serialize(m_function, 0);
writer.write("model", model.data(), model.size());
}
shared_ptr<ngraph::op::Parameter>
runtime::interpreter::INTExecutable::get_parameter(size_t index) const
{
const ParameterVector& parameters = get_parameters();
NGRAPH_CHECK(index < parameters.size(), "create_tensor for input out of bounds");
return parameters[index];
}
shared_ptr<ngraph::op::Result> runtime::interpreter::INTExecutable::get_result(size_t index) const
{
const ResultVector& results = get_results();
NGRAPH_CHECK(index < results.size(), "create_tensor for output out of bounds");
return results[index];
}
shared_ptr<runtime::Tensor>
runtime::interpreter::INTExecutable::create_input_tensor(size_t input_index)
{
shared_ptr<op::Parameter> parameter = get_parameter(input_index);
return make_shared<runtime::HostTensor>(parameter->get_element_type(), parameter->get_shape());
}
shared_ptr<runtime::Tensor>
runtime::interpreter::INTExecutable::create_output_tensor(size_t output_index)
{
shared_ptr<op::Result> result = get_result(output_index);
return make_shared<runtime::HostTensor>(result->get_element_type(), result->get_shape());
}
vector<shared_ptr<runtime::Tensor>>
runtime::interpreter::INTExecutable::create_input_tensor(size_t input_index,
size_t pipeline_depth)
{
vector<shared_ptr<runtime::HostTensor>> tensors;
shared_ptr<op::Parameter> parameter = get_parameter(input_index);
for (size_t i = 0; i < pipeline_depth; i++)
{
auto tensor =
make_shared<runtime::HostTensor>(parameter->get_element_type(), parameter->get_shape());
tensors.push_back(tensor);
}
vector<shared_ptr<runtime::Tensor>> result_tensors;
for (const shared_ptr<runtime::HostTensor>& tensor : tensors)
{
result_tensors.push_back(tensor);
}
return result_tensors;
}
vector<shared_ptr<runtime::Tensor>>
runtime::interpreter::INTExecutable::create_output_tensor(size_t output_index,
size_t pipeline_depth)
{
vector<shared_ptr<runtime::HostTensor>> tensors;
shared_ptr<op::Result> result = get_result(output_index);
for (size_t i = 0; i < pipeline_depth; i++)
{
auto tensor = make_shared<runtime::HostTensor>(result->get_element_type(), result->get_shape());
tensors.push_back(tensor);
}
vector<shared_ptr<runtime::Tensor>> result_tensors;
for (const shared_ptr<runtime::HostTensor>& tensor : tensors)
{
result_tensors.push_back(tensor);
}
return result_tensors;
}
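// Note: for pipelined (double-buffered) execution, callers request pipeline_depth tensors per
// input/output; each pipeline stage then owns one tensor from each set (see
// run_benchmark_pipelined in nbench).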
@@ -186,9 +186,21 @@ public:
std::vector<PerformanceCounter> get_performance_data() const override;
std::shared_ptr<runtime::Tensor> create_input_tensor(size_t input_index) override;
std::shared_ptr<runtime::Tensor> create_output_tensor(size_t output_index) override;
std::vector<std::shared_ptr<runtime::Tensor>>
create_input_tensor(size_t input_index, size_t pipeline_depth) override;
std::vector<std::shared_ptr<runtime::Tensor>>
create_output_tensor(size_t output_index, size_t pipeline_depth) override;
private:
INTExecutable(const std::string& model_string);
std::shared_ptr<ngraph::op::Parameter> get_parameter(size_t index) const;
std::shared_ptr<ngraph::op::Result> get_result(size_t index) const;
int get_alignment() const { return 64; }
bool m_is_compiled = false;
bool m_nan_check_enabled = false;
......
@@ -17,6 +17,8 @@
set (SRC
nbench.cpp
benchmark.cpp
benchmark_pipelined.cpp
benchmark_utils.cpp
)
add_executable(nbench ${SRC})
......
@@ -14,12 +14,8 @@
// limitations under the License.
//*****************************************************************************
#include <random>
#if defined(__x86_64__) || defined(__amd64__)
#include <xmmintrin.h>
#endif
#include "benchmark.hpp"
#include "benchmark_utils.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/host_tensor.hpp"
@@ -30,106 +26,6 @@
using namespace std;
using namespace ngraph;
static default_random_engine s_random_engine;
void set_denormals_flush_to_zero()
{
#if defined(__x86_64__) || defined(__amd64__)
// Avoids perf impact from denormals while benchmarking with random data
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
#endif
}
template <typename T>
void init_int_tv(shared_ptr<runtime::Tensor> tv, T min, T max)
{
size_t size = tv->get_element_count();
uniform_int_distribution<T> dist(min, max);
vector<T> vec(size);
for (T& element : vec)
{
element = dist(s_random_engine);
}
tv->write(vec.data(), vec.size() * sizeof(T));
}
template <>
void init_int_tv<char>(shared_ptr<runtime::Tensor> tv, char min, char max)
{
size_t size = tv->get_element_count();
uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
vector<char> vec(size);
for (char& element : vec)
{
element = static_cast<char>(dist(s_random_engine));
}
tv->write(vec.data(), vec.size() * sizeof(char));
}
template <>
void init_int_tv<int8_t>(shared_ptr<runtime::Tensor> tv, int8_t min, int8_t max)
{
size_t size = tv->get_element_count();
uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
vector<int8_t> vec(size);
for (int8_t& element : vec)
{
element = static_cast<int8_t>(dist(s_random_engine));
}
tv->write(vec.data(), vec.size() * sizeof(int8_t));
}
template <>
void init_int_tv<uint8_t>(shared_ptr<runtime::Tensor> tv, uint8_t min, uint8_t max)
{
size_t size = tv->get_element_count();
uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
vector<uint8_t> vec(size);
for (uint8_t& element : vec)
{
element = static_cast<uint8_t>(dist(s_random_engine));
}
tv->write(vec.data(), vec.size() * sizeof(uint8_t));
}
template <typename T>
void init_real_tv(shared_ptr<runtime::Tensor> tv, T min, T max)
{
size_t size = tv->get_element_count();
uniform_real_distribution<T> dist(min, max);
vector<T> vec(size);
for (T& element : vec)
{
element = dist(s_random_engine);
}
tv->write(vec.data(), vec.size() * sizeof(T));
}
static void random_init(shared_ptr<runtime::Tensor> tv)
{
element::Type et = tv->get_element_type();
switch (et.get_type_enum())
{
case element::Type_t::boolean: init_int_tv<char>(tv, 0, 1); break;
case element::Type_t::f32: init_real_tv<float>(tv, -1, 1); break;
case element::Type_t::f64: init_real_tv<double>(tv, -1, 1); break;
case element::Type_t::i8: init_int_tv<int8_t>(tv, -1, 1); break;
case element::Type_t::i16: init_int_tv<int16_t>(tv, -1, 1); break;
case element::Type_t::i32: init_int_tv<int32_t>(tv, 0, 1); break;
case element::Type_t::i64: init_int_tv<int64_t>(tv, 0, 1); break;
case element::Type_t::u8: init_int_tv<uint8_t>(tv, 0, 1); break;
case element::Type_t::u16: init_int_tv<uint16_t>(tv, 0, 1); break;
case element::Type_t::u32: init_int_tv<uint32_t>(tv, 0, 1); break;
case element::Type_t::u64: init_int_tv<uint64_t>(tv, 0, 1); break;
case element::Type_t::undefined:
case element::Type_t::dynamic:
case element::Type_t::bf16:
case element::Type_t::f16:
default: throw runtime_error("unsupported type");
}
}
vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
const string& backend_name,
size_t iterations,
@@ -140,7 +36,7 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
stopwatch timer;
timer.start();
auto backend = runtime::Backend::create(backend_name);
auto exec = backend->compile(f, timing_detail);
timer.stop();
cout.imbue(locale(""));
cout << "compile time: " << timer.get_milliseconds() << "ms" << endl;
@@ -201,7 +97,7 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
}
}
}
exec->call(results, args);
if (copy_data)
{
for (size_t result_index = 0; result_index < results.size(); result_index++)
@@ -217,6 +113,6 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
float time = t1.get_milliseconds();
cout << time / iterations << "ms per iteration" << endl;
vector<runtime::PerformanceCounter> perf_data = exec->get_performance_data();
return perf_data;
}
@@ -24,10 +24,6 @@
#include "ngraph/function.hpp"
#include "ngraph/runtime/performance_counter.hpp"
/// performance test utilities
std::multimap<size_t, std::string>
aggregate_timing(const std::vector<ngraph::runtime::PerformanceCounter>& perf_data);
std::vector<ngraph::runtime::PerformanceCounter> run_benchmark(std::shared_ptr<ngraph::Function> f,
const std::string& backend_name,
size_t iterations,
......
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <array>
#include <condition_variable>
#include <mutex>
#include <thread>
#include "benchmark.hpp"
#include "benchmark_utils.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/host_tensor.hpp"
#include "ngraph/runtime/tensor.hpp"
#include "ngraph/serializer.hpp"
#include "ngraph/util.hpp"
using namespace std;
using namespace ngraph;
class TensorCollection
{
public:
vector<shared_ptr<runtime::HostTensor>> parameter_data;
vector<shared_ptr<runtime::HostTensor>> result_data;
vector<shared_ptr<runtime::Tensor>> input_tensors;
vector<shared_ptr<runtime::Tensor>> output_tensors;
private:
};
static mutex s_mutex;
static condition_variable s_condition;
static size_t current_iteration = 0;
static size_t s_iterations;
static size_t s_warmup_iterations;
static stopwatch s_timer;
static void
thread_entry(runtime::Executable* exec, const TensorCollection& tensors, size_t pipeline_stage)
{
bool data_written = false;
const vector<shared_ptr<runtime::Tensor>>& args = tensors.input_tensors;
const vector<shared_ptr<runtime::Tensor>>& results = tensors.output_tensors;
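// Each thread owns one pipeline stage. The shared iteration counter decides whose turn it is:
// even iterations run on stage 0, odd iterations on stage 1. While one stage executes, the
// other refreshes its stale input tensors and then waits on the condition variable for its turn.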
while (current_iteration < s_iterations + s_warmup_iterations)
{
if (!data_written)
{
for (size_t arg_index = 0; arg_index < args.size(); arg_index++)
{
const shared_ptr<runtime::Tensor>& arg = args[arg_index];
if (arg->get_stale())
{
const shared_ptr<runtime::HostTensor>& data = tensors.parameter_data[arg_index];
arg->write(data->get_data_ptr(),
data->get_element_count() * data->get_element_type().size());
}
}
data_written = true;
}
unique_lock<mutex> lock(s_mutex);
if ((current_iteration & 1) != pipeline_stage)
{
s_condition.wait(lock);
}
else
{
if (current_iteration == s_warmup_iterations)
{
s_timer.start();
}
// our turn to run
exec->call(results, args);
current_iteration++;
data_written = false;
s_condition.notify_all();
lock.unlock();
for (size_t result_index = 0; result_index < results.size(); result_index++)
{
const shared_ptr<runtime::HostTensor>& data = tensors.result_data[result_index];
const shared_ptr<runtime::Tensor>& result = results[result_index];
result->read(data->get_data_ptr(),
data->get_element_count() * data->get_element_type().size());
}
}
}
}
vector<runtime::PerformanceCounter> run_benchmark_pipelined(shared_ptr<Function> f,
const string& backend_name,
size_t iterations,
bool timing_detail,
int warmup_iterations,
bool copy_data)
{
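// Double buffering: two pipeline stages, each with its own set of input/output tensors, run on
// two threads and alternate iterations so that data transfer for one iteration overlaps with
// execution of the other.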
constexpr size_t pipeline_depth = 2;
s_iterations = iterations;
s_warmup_iterations = warmup_iterations;
array<TensorCollection, pipeline_depth> tensor_collections;
stopwatch timer;
timer.start();
auto backend = runtime::Backend::create(backend_name);
auto exec = backend->compile(f, timing_detail);
timer.stop();
cout.imbue(locale(""));
cout << "compile time: " << timer.get_milliseconds() << "ms" << endl;
set_denormals_flush_to_zero();
// Create random input data for all input tensors
for (size_t i = 0; i < pipeline_depth; i++)
{
for (shared_ptr<op::Parameter> param : f->get_parameters())
{
auto tensor_data =
make_shared<runtime::HostTensor>(param->get_element_type(), param->get_shape());
random_init(tensor_data);
tensor_collections[i].parameter_data.push_back(tensor_data);
}
}
// Create output tensors for all outputs
for (size_t i = 0; i < pipeline_depth; i++)
{
for (shared_ptr<Node> result : f->get_results())
{
auto tensor_data =
make_shared<runtime::HostTensor>(result->get_element_type(), result->get_shape());
tensor_collections[i].result_data.push_back(tensor_data);
}
}
// Create input tensors for all Parameters
array<vector<shared_ptr<runtime::Tensor>>, pipeline_depth> input_tensors_array;
size_t input_index = 0;
for (shared_ptr<op::Parameter> param : f->get_parameters())
{
auto input_tensors = exec->create_input_tensor(input_index++, pipeline_depth);
for (size_t i = 0; i < pipeline_depth; i++)
{
tensor_collections[i].input_tensors.push_back(input_tensors[i]);
}
}
// Create output tensors for all Results
array<vector<shared_ptr<runtime::Tensor>>, pipeline_depth> output_tensors_array;
size_t output_index = 0;
for (shared_ptr<Node> result : f->get_results())
{
auto output_tensors = exec->create_output_tensor(output_index++, pipeline_depth);
for (size_t i = 0; i < pipeline_depth; i++)
{
tensor_collections[i].output_tensors.push_back(output_tensors[i]);
}
}
thread threads[pipeline_depth];
for (size_t i = 0; i < pipeline_depth; i++)
{
threads[i] = thread(thread_entry, exec.get(), tensor_collections[i], i);
}
for (size_t i = 0; i < pipeline_depth; i++)
{
threads[i].join();
}
s_timer.stop();
float time = s_timer.get_milliseconds();
cout << time / iterations << "ms per iteration" << endl;
vector<runtime::PerformanceCounter> perf_data = exec->get_performance_data();
return perf_data;
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "ngraph/function.hpp"
#include "ngraph/runtime/performance_counter.hpp"
std::vector<ngraph::runtime::PerformanceCounter>
run_benchmark_pipelined(std::shared_ptr<ngraph::Function> f,
const std::string& backend_name,
size_t iterations,
bool timing_detail,
int warmup_iterations,
bool copy_data);
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#if defined(__x86_64__) || defined(__amd64__)
#include <xmmintrin.h>
#endif
#include "benchmark_utils.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/host_tensor.hpp"
#include "ngraph/runtime/tensor.hpp"
#include "ngraph/serializer.hpp"
#include "ngraph/util.hpp"
using namespace std;
using namespace ngraph;
template <>
void init_int_tensor<char>(shared_ptr<runtime::Tensor> tensor, char min, char max)
{
size_t size = tensor->get_element_count();
uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
vector<char> vec(size);
for (char& element : vec)
{
element = static_cast<char>(dist(get_random_engine()));
}
tensor->write(vec.data(), vec.size() * sizeof(char));
}
template <>
void init_int_tensor<int8_t>(shared_ptr<runtime::Tensor> tensor, int8_t min, int8_t max)
{
size_t size = tensor->get_element_count();
uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
vector<int8_t> vec(size);
for (int8_t& element : vec)
{
element = static_cast<int8_t>(dist(get_random_engine()));
}
tensor->write(vec.data(), vec.size() * sizeof(int8_t));
}
template <>
void init_int_tensor<uint8_t>(shared_ptr<runtime::Tensor> tensor, uint8_t min, uint8_t max)
{
size_t size = tensor->get_element_count();
uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
vector<uint8_t> vec(size);
for (uint8_t& element : vec)
{
element = static_cast<uint8_t>(dist(get_random_engine()));
}
tensor->write(vec.data(), vec.size() * sizeof(uint8_t));
}
void set_denormals_flush_to_zero()
{
#if defined(__x86_64__) || defined(__amd64__)
// Avoids perf impact from denormals while benchmarking with random data
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
#endif
}
void random_init(shared_ptr<runtime::Tensor> tensor)
{
element::Type et = tensor->get_element_type();
#if !(defined(__GNUC__) && (__GNUC__ == 4 && __GNUC_MINOR__ == 8))
#pragma GCC diagnostic push
#pragma GCC diagnostic error "-Wswitch"
#pragma GCC diagnostic error "-Wswitch-enum"
#endif
switch (et.get_type_enum())
{
case element::Type_t::boolean: init_int_tensor<char>(tensor, 0, 1); break;
case element::Type_t::f32: init_real_tensor<float>(tensor, -1, 1); break;
case element::Type_t::f64: init_real_tensor<double>(tensor, -1, 1); break;
case element::Type_t::i8: init_int_tensor<int8_t>(tensor, -1, 1); break;
case element::Type_t::i16: init_int_tensor<int16_t>(tensor, -1, 1); break;
case element::Type_t::i32: init_int_tensor<int32_t>(tensor, 0, 1); break;
case element::Type_t::i64: init_int_tensor<int64_t>(tensor, 0, 1); break;
case element::Type_t::u8: init_int_tensor<uint8_t>(tensor, 0, 1); break;
case element::Type_t::u16: init_int_tensor<uint16_t>(tensor, 0, 1); break;
case element::Type_t::u32: init_int_tensor<uint32_t>(tensor, 0, 1); break;
case element::Type_t::u64: init_int_tensor<uint64_t>(tensor, 0, 1); break;
case element::Type_t::undefined:
case element::Type_t::dynamic:
case element::Type_t::bf16:
case element::Type_t::f16:
default: throw runtime_error("unsupported type");
}
#if !(defined(__GNUC__) && (__GNUC__ == 4 && __GNUC_MINOR__ == 8))
#pragma GCC diagnostic pop
#endif
}
default_random_engine& get_random_engine()
{
static std::default_random_engine s_random_engine;
return s_random_engine;
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <random>
#include "benchmark.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/host_tensor.hpp"
#include "ngraph/runtime/tensor.hpp"
#include "ngraph/serializer.hpp"
#include "ngraph/util.hpp"
void set_denormals_flush_to_zero();
void random_init(std::shared_ptr<ngraph::runtime::Tensor> tensor);
std::default_random_engine& get_random_engine();
template <typename T>
void init_int_tensor(std::shared_ptr<ngraph::runtime::Tensor> tensor, T min, T max)
{
size_t size = tensor->get_element_count();
std::uniform_int_distribution<T> dist(min, max);
std::vector<T> vec(size);
for (T& element : vec)
{
element = dist(get_random_engine());
}
tensor->write(vec.data(), vec.size() * sizeof(T));
}
template <typename T>
void init_real_tensor(std::shared_ptr<ngraph::runtime::Tensor> tensor, T min, T max)
{
size_t size = tensor->get_element_count();
std::uniform_real_distribution<T> dist(min, max);
std::vector<T> vec(size);
for (T& element : vec)
{
element = dist(get_random_engine());
}
tensor->write(vec.data(), vec.size() * sizeof(T));
}
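// Usage sketch (illustrative only; the shape and element type below are arbitrary): create a
// host tensor and fill it with random data before a benchmark run.
//
//     set_denormals_flush_to_zero();
//     auto tensor = std::make_shared<ngraph::runtime::HostTensor>(ngraph::element::f32,
//                                                                 ngraph::Shape{2, 3});
//     random_init(tensor);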
@@ -24,6 +24,7 @@
#include <iomanip>
#include "benchmark.hpp"
#include "benchmark_pipelined.hpp"
#include "ngraph/distributed.hpp"
#include "ngraph/except.hpp"
#include "ngraph/file_util.hpp"
@@ -181,6 +182,7 @@ int main(int argc, char** argv)
int warmup_iterations = 1;
bool copy_data = true;
bool dot_file = false;
bool double_buffer = false;
for (size_t i = 1; i < argc; i++)
{
@@ -229,6 +231,10 @@
{
directory = argv[++i];
}
else if (arg == "--double_buffer")
{
double_buffer = true;
}
else if (arg == "-w" || arg == "--warmup_iterations")
{
try
@@ -283,6 +289,7 @@ OPTIONS
-w|--warmup_iterations Number of warm-up iterations
--no_copy_data Disable copy of input/result data every iteration
--dot Generate Graphviz dot file
--double_buffer Double buffer inputs and outputs
)###";
return 1;
}
@@ -420,8 +427,17 @@ OPTIONS
{
cout << "\n---- Benchmark ----\n";
shared_ptr<Function> f = deserialize(model);
vector<runtime::PerformanceCounter> perf_data;
if (double_buffer)
{
perf_data = run_benchmark_pipelined(
f, backend, iterations, timing_detail, warmup_iterations, copy_data);
}
else
{
perf_data = run_benchmark(
f, backend, iterations, timing_detail, warmup_iterations, copy_data);
}
auto perf_shape = to_perf_shape(f, perf_data);
aggregate_perf_data.insert(
aggregate_perf_data.end(), perf_shape.begin(), perf_shape.end());
......