Commit 567bc822 authored by Robert Kimball

nbench async option

parent 4b84262c
......@@ -196,23 +196,30 @@ void runtime::Backend::async_thread_stop()
}
}
// Runs the executable attached to an async event and then signals the event's result.
// Calls event->get_executable()->call() with the event's output and input tensors,
// in that order, before signalling.
// The event is taken by value, so this call holds its own shared_ptr reference to the
// event until it returns.
static void local_thread_entry(shared_ptr<runtime::Backend::AsyncEvent> event)
{
    event->get_executable()->call(event->get_outputs(), event->get_inputs());
    event->signal_result();
}
// Handles a single AsyncEvent according to its type:
//   READ    - reads get_size_in_bytes() bytes from the event's tensor into get_data()
//   WRITE   - writes get_size_in_bytes() bytes from get_data() into the event's tensor
//   EXECUTE - runs the event's executable on its outputs/inputs
// NOTE(review): this listing shows two forms of every operation back to back (read/write
// with and without a 0 middle argument; EXECUTE both inline and via a detached thread
// running local_thread_entry). That looks like the before and after lines of one change
// displayed together -- confirm which lines are live before relying on this text.
void runtime::Backend::async_thread_process(const shared_ptr<AsyncEvent>& event)
{
switch (event->get_type())
{
case AsyncEvent::Type::READ:
event->get_tensor()->read(event->get_data(), 0, event->get_size_in_bytes());
event->get_tensor()->read(event->get_data(), event->get_size_in_bytes());
event->signal_result();
break;
case AsyncEvent::Type::WRITE:
event->get_tensor()->write(event->get_data(), 0, event->get_size_in_bytes());
event->get_tensor()->write(event->get_data(), event->get_size_in_bytes());
event->signal_result();
break;
case AsyncEvent::Type::EXECUTE:
event->get_executable()->call(event->get_outputs(), event->get_inputs());
event->signal_result();
{
// The thread is detached; local_thread_entry receives its own copy of the event pointer.
std::thread(local_thread_entry, event).detach();
break;
}
}
}
void runtime::Backend::async_thread_entry()
......
......@@ -42,88 +42,100 @@ void set_denormals_flush_to_zero()
}
// Fills a tensor with uniformly distributed random integers in the closed range
// [min, max]. One value of type T is drawn from s_random_engine per tensor element
// and the whole buffer is written to the tensor in a single write() call.
// NOTE(review): the signature and every tensor access appear twice in this listing,
// once as init_int_tv/tv and once as init_int_tensor/tensor -- apparently the before
// and after of a rename shown together. Confirm which spelling is current.
template <typename T>
void init_int_tv(shared_ptr<runtime::Tensor> tv, T min, T max)
void init_int_tensor(shared_ptr<runtime::Tensor> tensor, T min, T max)
{
size_t size = tv->get_element_count();
size_t size = tensor->get_element_count();
uniform_int_distribution<T> dist(min, max);
vector<T> vec(size);
for (T& element : vec)
{
element = dist(s_random_engine);
}
tv->write(vec.data(), vec.size() * sizeof(T));
tensor->write(vec.data(), vec.size() * sizeof(T));
}
// char specialization: fills a tensor with random values in [min, max].
// std::uniform_int_distribution is not specified for character types, so values are
// drawn from an int16_t distribution and narrowed back to char.
// NOTE(review): the signature and tensor accesses appear twice (tv / tensor spellings),
// apparently the before and after of a rename -- confirm which is current.
template <>
void init_int_tv<char>(shared_ptr<runtime::Tensor> tv, char min, char max)
void init_int_tensor<char>(shared_ptr<runtime::Tensor> tensor, char min, char max)
{
size_t size = tv->get_element_count();
size_t size = tensor->get_element_count();
uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
vector<char> vec(size);
for (char& element : vec)
{
element = static_cast<char>(dist(s_random_engine));
}
tv->write(vec.data(), vec.size() * sizeof(char));
tensor->write(vec.data(), vec.size() * sizeof(char));
}
// int8_t specialization: fills a tensor with random values in [min, max].
// std::uniform_int_distribution is not specified for 8-bit character types, so values
// are drawn from an int16_t distribution and narrowed back to int8_t.
// NOTE(review): the signature and tensor accesses appear twice (tv / tensor spellings),
// apparently the before and after of a rename -- confirm which is current.
template <>
void init_int_tv<int8_t>(shared_ptr<runtime::Tensor> tv, int8_t min, int8_t max)
void init_int_tensor<int8_t>(shared_ptr<runtime::Tensor> tensor, int8_t min, int8_t max)
{
size_t size = tv->get_element_count();
size_t size = tensor->get_element_count();
uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
vector<int8_t> vec(size);
for (int8_t& element : vec)
{
element = static_cast<int8_t>(dist(s_random_engine));
}
tv->write(vec.data(), vec.size() * sizeof(int8_t));
tensor->write(vec.data(), vec.size() * sizeof(int8_t));
}
// uint8_t specialization: fills a tensor with random values in [min, max].
// std::uniform_int_distribution is not specified for 8-bit character types, so values
// are drawn from an int16_t distribution and narrowed back to uint8_t.
// NOTE(review): the signature and tensor accesses appear twice (tv / tensor spellings),
// apparently the before and after of a rename -- confirm which is current.
template <>
void init_int_tv<uint8_t>(shared_ptr<runtime::Tensor> tv, uint8_t min, uint8_t max)
void init_int_tensor<uint8_t>(shared_ptr<runtime::Tensor> tensor, uint8_t min, uint8_t max)
{
size_t size = tv->get_element_count();
size_t size = tensor->get_element_count();
uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
vector<uint8_t> vec(size);
for (uint8_t& element : vec)
{
element = static_cast<uint8_t>(dist(s_random_engine));
}
tv->write(vec.data(), vec.size() * sizeof(uint8_t));
tensor->write(vec.data(), vec.size() * sizeof(uint8_t));
}
// Fills a tensor with random floating-point values of type T drawn from
// uniform_real_distribution<T>(min, max) using s_random_engine, one per tensor element,
// then writes the whole buffer to the tensor in a single write() call.
// NOTE(review): the signature and tensor accesses appear twice (init_real_tv/tv and
// init_real_tensor/tensor), apparently the before and after of a rename -- confirm
// which is current.
template <typename T>
void init_real_tv(shared_ptr<runtime::Tensor> tv, T min, T max)
void init_real_tensor(shared_ptr<runtime::Tensor> tensor, T min, T max)
{
size_t size = tv->get_element_count();
size_t size = tensor->get_element_count();
uniform_real_distribution<T> dist(min, max);
vector<T> vec(size);
for (T& element : vec)
{
element = dist(s_random_engine);
}
tv->write(vec.data(), vec.size() * sizeof(T));
tensor->write(vec.data(), vec.size() * sizeof(T));
}
// Fills a tensor with random data chosen by its element type:
//   boolean, i32, i64, u8, u16, u32, u64 -> integers in [0, 1]
//   i8, i16                              -> integers in [-1, 1]
//   f32, f64                             -> reals from the (-1, 1) distribution bounds
// Throws runtime_error("unsupported type") for undefined, dynamic, bf16, f16 and any
// other value.
// NOTE(review): the signature, the element-type lookup and every case label appear twice
// in this listing (init_*_tv/tv and init_*_tensor/tensor spellings), apparently the
// before and after of a rename shown together -- confirm which set is current.
static void random_init(shared_ptr<runtime::Tensor> tv)
static void random_init(shared_ptr<runtime::Tensor> tensor)
{
element::Type et = tv->get_element_type();
element::Type et = tensor->get_element_type();
// Promote -Wswitch / -Wswitch-enum to errors for this switch, except on GCC 4.8.
#if !(defined(__GNUC__) && (__GNUC__ == 4 && __GNUC_MINOR__ == 8))
#pragma GCC diagnostic push
#pragma GCC diagnostic error "-Wswitch"
#pragma GCC diagnostic error "-Wswitch-enum"
#endif
switch (et.get_type_enum())
{
case element::Type_t::boolean: init_int_tv<char>(tv, 0, 1); break;
case element::Type_t::f32: init_real_tv<float>(tv, -1, 1); break;
case element::Type_t::f64: init_real_tv<double>(tv, -1, 1); break;
case element::Type_t::i8: init_int_tv<int8_t>(tv, -1, 1); break;
case element::Type_t::i16: init_int_tv<int16_t>(tv, -1, 1); break;
case element::Type_t::i32: init_int_tv<int32_t>(tv, 0, 1); break;
case element::Type_t::i64: init_int_tv<int64_t>(tv, 0, 1); break;
case element::Type_t::u8: init_int_tv<uint8_t>(tv, 0, 1); break;
case element::Type_t::u16: init_int_tv<uint16_t>(tv, 0, 1); break;
case element::Type_t::u32: init_int_tv<uint32_t>(tv, 0, 1); break;
case element::Type_t::u64: init_int_tv<uint64_t>(tv, 0, 1); break;
case element::Type_t::boolean: init_int_tensor<char>(tensor, 0, 1); break;
case element::Type_t::f32: init_real_tensor<float>(tensor, -1, 1); break;
case element::Type_t::f64: init_real_tensor<double>(tensor, -1, 1); break;
case element::Type_t::i8: init_int_tensor<int8_t>(tensor, -1, 1); break;
case element::Type_t::i16: init_int_tensor<int16_t>(tensor, -1, 1); break;
case element::Type_t::i32: init_int_tensor<int32_t>(tensor, 0, 1); break;
case element::Type_t::i64: init_int_tensor<int64_t>(tensor, 0, 1); break;
case element::Type_t::u8: init_int_tensor<uint8_t>(tensor, 0, 1); break;
case element::Type_t::u16: init_int_tensor<uint16_t>(tensor, 0, 1); break;
case element::Type_t::u32: init_int_tensor<uint32_t>(tensor, 0, 1); break;
case element::Type_t::u64: init_int_tensor<uint64_t>(tensor, 0, 1); break;
// Types with no initializer here are listed explicitly and share the throwing default.
case element::Type_t::undefined:
case element::Type_t::dynamic:
case element::Type_t::bf16:
case element::Type_t::f16:
default: throw runtime_error("unsupported type");
}
#if !(defined(__GNUC__) && (__GNUC__ == 4 && __GNUC_MINOR__ == 8))
#pragma GCC diagnostic pop
#endif
}
vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
......@@ -216,3 +228,117 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
vector<runtime::PerformanceCounter> perf_data = compiled_func->get_performance_data();
return perf_data;
}
// Benchmarks `f` on the backend named `backend_name` through the asynchronous tensor
// API: each iteration starts begin_execute, starts begin_read on the results (from the
// second iteration on), flips a buffer number between 0 and 1, starts begin_write on the
// inputs, and then waits for the execute future.
//
// Parameters:
//   f                  function to compile and run
//   backend_name       name passed to runtime::Backend::create
//   iterations         number of timed iterations (divisor of the reported average)
//   timing_detail      forwarded to Backend::compile
//   warmup_iterations  iterations run before the stopwatch is started
//   copy_data          not consulted by this function
//
// Prints the compile time and the average milliseconds per timed iteration to cout.
// Returns the performance data reported by the compiled function.
vector<runtime::PerformanceCounter> run_benchmark_double_buffered(shared_ptr<Function> f,
                                                                  const string& backend_name,
                                                                  size_t iterations,
                                                                  bool timing_detail,
                                                                  int warmup_iterations,
                                                                  bool copy_data)
{
    stopwatch timer;
    timer.start();
    auto backend = runtime::Backend::create(backend_name);
    auto compiled_func = backend->compile(f, timing_detail);
    timer.stop();
    cout.imbue(locale(""));
    cout << "compile time: " << timer.get_milliseconds() << "ms" << endl;
    set_denormals_flush_to_zero();

    // Payload size of a host tensor in bytes.
    auto byte_size = [](const shared_ptr<runtime::HostTensor>& host) {
        return host->get_element_count() * host->get_element_type().size();
    };

    // Two complete sets of backend tensors plus matching host-side staging tensors.
    array<vector<shared_ptr<runtime::HostTensor>>, 2> args_data_set;
    array<vector<shared_ptr<runtime::Tensor>>, 2> args_set;
    array<vector<shared_ptr<runtime::HostTensor>>, 2> results_data_set;
    array<vector<shared_ptr<runtime::Tensor>>, 2> results_set;
    for (size_t i = 0; i < 2; i++)
    {
        for (const shared_ptr<op::Parameter>& param : f->get_parameters())
        {
            auto tensor = backend->create_tensor(param->get_element_type(), param->get_shape());
            auto tensor_data =
                make_shared<runtime::HostTensor>(param->get_element_type(), param->get_shape());
            random_init(tensor_data);
            tensor->write(tensor_data->get_data_ptr(), byte_size(tensor_data));
            args_set[i].push_back(tensor);
            args_data_set[i].push_back(tensor_data);
        }
        for (const shared_ptr<Node>& out : f->get_results())
        {
            auto result = backend->create_tensor(out->get_element_type(), out->get_shape());
            auto result_data =
                make_shared<runtime::HostTensor>(out->get_element_type(), out->get_shape());
            results_set[i].push_back(result);
            results_data_set[i].push_back(result_data);
        }
    }

    stopwatch t1;

    // NOTE(review): the tensors of set 0 are used for the whole run; only the
    // buffer_number argument handed to begin_write/begin_read alternates. The second
    // set built above is not referenced again -- confirm this is intended.
    size_t buffer_number = 0;
    const vector<shared_ptr<runtime::Tensor>>& args = args_set[buffer_number];
    const vector<shared_ptr<runtime::HostTensor>>& args_data = args_data_set[buffer_number];
    const vector<shared_ptr<runtime::Tensor>>& results = results_set[buffer_number];
    const vector<shared_ptr<runtime::HostTensor>>& results_data = results_data_set[buffer_number];

    // NOTE(review): whatever begin_write/begin_read return is discarded, so nothing in
    // this function waits for those transfers -- confirm that is acceptable.
    auto start_writes = [&](size_t buffer) {
        for (size_t k = 0; k < args.size(); k++)
        {
            args[k]->begin_write(args_data[k]->get_data_ptr(), byte_size(args_data[k]), buffer);
        }
    };
    auto start_reads = [&](size_t buffer) {
        for (size_t k = 0; k < results.size(); k++)
        {
            results[k]->begin_read(
                results_data[k]->get_data_ptr(), byte_size(results_data[k]), buffer);
        }
    };

    // Before we start we write the first iteration's data
    start_writes(buffer_number);

    // The signed warm-up count is converted once, explicitly; this is the same value the
    // implicit int -> size_t conversion produced in the loop bound and comparison.
    const size_t warmup_count = static_cast<size_t>(warmup_iterations);
    const size_t total_iterations = iterations + warmup_count;
    for (size_t i = 0; i < total_iterations; i++)
    {
        if (i == warmup_count)
        {
            t1.start();
        }
        future<void> exec_future = compiled_func->begin_execute(results, args);
        if (i > 0)
        {
            // (buffer_number - 1) & 1 is the other buffer index (unsigned wrap, then mask).
            start_reads((buffer_number - 1) & 1);
        }
        buffer_number = (buffer_number + 1) & 1;
        start_writes(buffer_number);
        exec_future.get();
    }

    // Start the read for the last iteration's results.
    start_reads((buffer_number - 1) & 1);
    t1.stop();

    float time = t1.get_milliseconds();
    cout << time / iterations << "ms per iteration" << endl;

    vector<runtime::PerformanceCounter> perf_data = compiled_func->get_performance_data();
    return perf_data;
}
......@@ -34,3 +34,11 @@ std::vector<ngraph::runtime::PerformanceCounter> run_benchmark(std::shared_ptr<n
bool timing_detail,
int warmup_iterations,
bool copy_data);
// Variant of run_benchmark that drives the compiled function through the asynchronous
// begin_write / begin_execute / begin_read calls, alternating a buffer number between
// iterations. Takes the same argument list as run_benchmark and returns the compiled
// function's performance counters.
std::vector<ngraph::runtime::PerformanceCounter>
run_benchmark_double_buffered(std::shared_ptr<ngraph::Function> f,
const std::string& backend_name,
size_t iterations,
bool timing_detail,
int warmup_iterations,
bool copy_data);
......@@ -181,6 +181,7 @@ int main(int argc, char** argv)
int warmup_iterations = 1;
bool copy_data = true;
bool dot_file = false;
bool double_buffer = false;
for (size_t i = 1; i < argc; i++)
{
......@@ -229,6 +230,10 @@ int main(int argc, char** argv)
{
directory = argv[++i];
}
else if (arg == "--double_buffer")
{
double_buffer = true;
}
else if (arg == "-w" || arg == "--warmup_iterations")
{
try
......@@ -283,6 +288,7 @@ OPTIONS
-w|--warmup_iterations Number of warm-up iterations
--no_copy_data Disable copy of input/result data every iteration
--dot Generate Graphviz dot file
--double_buffer Double buffer inputs and outputs
)###";
return 1;
}
......@@ -420,8 +426,17 @@ OPTIONS
{
cout << "\n---- Benchmark ----\n";
shared_ptr<Function> f = deserialize(model);
auto perf_data = run_benchmark(
f, backend, iterations, timing_detail, warmup_iterations, copy_data);
vector<runtime::PerformanceCounter> perf_data;
if (double_buffer)
{
perf_data = run_benchmark_double_buffered(
f, backend, iterations, timing_detail, warmup_iterations, copy_data);
}
else
{
perf_data = run_benchmark(
f, backend, iterations, timing_detail, warmup_iterations, copy_data);
}
auto perf_shape = to_perf_shape(f, perf_data);
aggregate_perf_data.insert(
aggregate_perf_data.end(), perf_shape.begin(), perf_shape.end());
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment