Commit da7a15f8 authored by Robert Kimball

separate benchmark and pipelined benchmark

parent 8b768fee
......@@ -17,6 +17,8 @@
set (SRC
nbench.cpp
benchmark.cpp
benchmark_pipelined.cpp
benchmark_utils.cpp
)
add_executable(nbench ${SRC})
......
......@@ -24,21 +24,9 @@
#include "ngraph/function.hpp"
#include "ngraph/runtime/performance_counter.hpp"
/// performance test utilities
std::multimap<size_t, std::string>
aggregate_timing(const std::vector<ngraph::runtime::PerformanceCounter>& perf_data);
std::vector<ngraph::runtime::PerformanceCounter> run_benchmark(std::shared_ptr<ngraph::Function> f,
const std::string& backend_name,
size_t iterations,
bool timing_detail,
int warmup_iterations,
bool copy_data);
std::vector<ngraph::runtime::PerformanceCounter>
run_benchmark_double_buffered(std::shared_ptr<ngraph::Function> f,
const std::string& backend_name,
size_t iterations,
bool timing_detail,
int warmup_iterations,
bool copy_data);
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "benchmark.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/host_tensor.hpp"
#include "ngraph/runtime/tensor.hpp"
#include "ngraph/serializer.hpp"
#include "ngraph/util.hpp"
#include "benchmark_utils.hpp"
using namespace std;
using namespace ngraph;
vector<runtime::PerformanceCounter> run_benchmark_pipelined(shared_ptr<Function> f,
const string& backend_name,
size_t iterations,
bool timing_detail,
int warmup_iterations,
bool copy_data)
{
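// A pipeline depth of 2 gives double buffering: while the backend executes on one set of
// tensors, the host can be writing inputs for (or reading results from) the other set.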
constexpr size_t pipeline_depth = 2;
stopwatch timer;
timer.start();
auto backend = runtime::Backend::create(backend_name);
auto exec = backend->compile(f, timing_detail);
timer.stop();
cout.imbue(locale(""));
cout << "compile time: " << timer.get_milliseconds() << "ms" << endl;
set_denormals_flush_to_zero();
// Create random input data for all input tensors
array<vector<shared_ptr<runtime::HostTensor>>, pipeline_depth> parameters_data_set;
array<vector<shared_ptr<runtime::HostTensor>>, pipeline_depth> results_data_set;
for (size_t i = 0; i < pipeline_depth; i++)
{
vector<shared_ptr<runtime::HostTensor>> parameters_data;
for (shared_ptr<op::Parameter> param : f->get_parameters())
{
auto tensor_data =
make_shared<runtime::HostTensor>(param->get_element_type(), param->get_shape());
random_init(tensor_data);
parameters_data.push_back(tensor_data);
}
parameters_data_set[i] = parameters_data;
}
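// Each pipeline stage gets its own host-side copy of the input data so that one stage can be
// staged while the other stage is still executing.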
// Create input tensors for all Parameters
array<vector<shared_ptr<runtime::Tensor>>, pipeline_depth> input_tensors_array;
size_t input_index = 0;
for (shared_ptr<op::Parameter> param : f->get_parameters())
{
auto input_tensors = exec->create_input_tensor(input_index++, pipeline_depth);
for (size_t i = 0; i < pipeline_depth; i++)
{
input_tensors_array[i].push_back(input_tensors[i]);
}
}
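// create_input_tensor(index, pipeline_depth) returns pipeline_depth tensors for one input;
// the loop above transposes them so that input_tensors_array[i] holds one tensor per
// Parameter for pipeline stage i.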
// // Create output tensors for all Results
// array<vector<shared_ptr<runtime::Tensor>>, pipeline_depth> output_tensors_array;
// for (shared_ptr<Node> out : f->get_results())
// {
//     for (size_t i = 0; i < pipeline_depth; i++)
//     {
//         output_tensors_array[i].push_back(
//             backend->create_tensor(out->get_element_type(), out->get_shape()));
//     }
// }
stopwatch t1;
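// The commented-out loop below sketches the intended schedule: write the inputs for one
// buffer, begin execution, and read back the results of the previous buffer, alternating
// between the two pipeline stages on every iteration.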
// // Before we start we write the first iteration's data
// size_t buffer_number = 0;
// auto args = input_tensors_array[buffer_number];
// auto args_data = parameters_data_set[buffer_number];
// for (size_t arg_index = 0; arg_index < args.size(); arg_index++)
// {
// const shared_ptr<runtime::Tensor>& arg = args[arg_index];
// const shared_ptr<runtime::HostTensor>& data = args_data[arg_index];
// arg->begin_write(data->get_data_ptr(),
// data->get_element_count() * data->get_element_type().size(),
// buffer_number);
// }
// const vector<shared_ptr<runtime::Tensor>>& results = output_tensors_array[buffer_number];
// const vector<shared_ptr<runtime::HostTensor>>& results_data = results_data_set[buffer_number];
// for (size_t i = 0; i < iterations + warmup_iterations; i++)
// {
// if (i == warmup_iterations)
// {
// t1.start();
// }
// future<void> exec_future = exec->begin_execute(results, args);
// if (i > 0)
// {
// for (size_t result_index = 0; result_index < results.size(); result_index++)
// {
// const shared_ptr<runtime::HostTensor>& data = results_data[result_index];
// const shared_ptr<runtime::Tensor>& result = results[result_index];
// result->begin_read(data->get_data_ptr(),
// data->get_element_count() * data->get_element_type().size(),
// (buffer_number - 1) & 1);
// }
// }
// buffer_number = (buffer_number + 1) & 1;
// for (size_t arg_index = 0; arg_index < args.size(); arg_index++)
// {
// const shared_ptr<runtime::Tensor>& arg = args[arg_index];
// const shared_ptr<runtime::HostTensor>& data = args_data[arg_index];
// arg->begin_write(data->get_data_ptr(),
// data->get_element_count() * data->get_element_type().size(),
// buffer_number);
// }
// exec_future.get();
// }
// for (size_t result_index = 0; result_index < results.size(); result_index++)
// {
// const shared_ptr<runtime::HostTensor>& data = results_data[result_index];
// const shared_ptr<runtime::Tensor>& result = results[result_index];
// result->begin_read(data->get_data_ptr(),
// data->get_element_count() * data->get_element_type().size(),
// (buffer_number - 1) & 1);
// }
// t1.stop();
// float time = t1.get_milliseconds();
// cout << time / iterations << "ms per iteration" << endl;
vector<runtime::PerformanceCounter> perf_data = exec->get_performance_data();
return perf_data;
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "ngraph/function.hpp"
#include "ngraph/runtime/performance_counter.hpp"
std::vector<ngraph::runtime::PerformanceCounter>
run_benchmark_pipelined(std::shared_ptr<ngraph::Function> f,
const std::string& backend_name,
size_t iterations,
bool timing_detail,
int warmup_iterations,
bool copy_data);
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#if defined(__x86_64__) || defined(__amd64__)
#include <xmmintrin.h>
#endif
#include "benchmark_utils.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/host_tensor.hpp"
#include "ngraph/runtime/tensor.hpp"
#include "ngraph/serializer.hpp"
#include "ngraph/util.hpp"
using namespace std;
using namespace ngraph;
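// Explicit specializations for byte-sized types: std::uniform_int_distribution is not
// required to support char, int8_t, or uint8_t, so values are drawn as int16_t and narrowed.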
template <>
void init_int_tensor<char>(shared_ptr<runtime::Tensor> tensor, char min, char max)
{
size_t size = tensor->get_element_count();
uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
vector<char> vec(size);
for (char& element : vec)
{
element = static_cast<char>(dist(get_random_engine()));
}
tensor->write(vec.data(), vec.size() * sizeof(char));
}
template <>
void init_int_tensor<int8_t>(shared_ptr<runtime::Tensor> tensor, int8_t min, int8_t max)
{
size_t size = tensor->get_element_count();
uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
vector<int8_t> vec(size);
for (int8_t& element : vec)
{
element = static_cast<int8_t>(dist(get_random_engine()));
}
tensor->write(vec.data(), vec.size() * sizeof(int8_t));
}
template <>
void init_int_tensor<uint8_t>(shared_ptr<runtime::Tensor> tensor, uint8_t min, uint8_t max)
{
size_t size = tensor->get_element_count();
uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
vector<uint8_t> vec(size);
for (uint8_t& element : vec)
{
element = static_cast<uint8_t>(dist(get_random_engine()));
}
tensor->write(vec.data(), vec.size() * sizeof(uint8_t));
}
void set_denormals_flush_to_zero()
{
#if defined(__x86_64__) || defined(__amd64__)
// Avoids perf impact from denormals while benchmarking with random data
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
#endif
}
void random_init(shared_ptr<runtime::Tensor> tensor)
{
element::Type et = tensor->get_element_type();
#if !(defined(__GNUC__) && (__GNUC__ == 4 && __GNUC_MINOR__ == 8))
#pragma GCC diagnostic push
#pragma GCC diagnostic error "-Wswitch"
#pragma GCC diagnostic error "-Wswitch-enum"
#endif
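// Promoting -Wswitch and -Wswitch-enum to errors makes any newly added element::Type_t value
// a compile-time failure here rather than a silently unhandled case (the #if above skips the
// pragmas on GCC 4.8).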
switch (et.get_type_enum())
{
case element::Type_t::boolean: init_int_tensor<char>(tensor, 0, 1); break;
case element::Type_t::f32: init_real_tensor<float>(tensor, -1, 1); break;
case element::Type_t::f64: init_real_tensor<double>(tensor, -1, 1); break;
case element::Type_t::i8: init_int_tensor<int8_t>(tensor, -1, 1); break;
case element::Type_t::i16: init_int_tensor<int16_t>(tensor, -1, 1); break;
case element::Type_t::i32: init_int_tensor<int32_t>(tensor, 0, 1); break;
case element::Type_t::i64: init_int_tensor<int64_t>(tensor, 0, 1); break;
case element::Type_t::u8: init_int_tensor<uint8_t>(tensor, 0, 1); break;
case element::Type_t::u16: init_int_tensor<uint16_t>(tensor, 0, 1); break;
case element::Type_t::u32: init_int_tensor<uint32_t>(tensor, 0, 1); break;
case element::Type_t::u64: init_int_tensor<uint64_t>(tensor, 0, 1); break;
case element::Type_t::undefined:
case element::Type_t::dynamic:
case element::Type_t::bf16:
case element::Type_t::f16:
default: throw runtime_error("unsupported type");
}
#if !(defined(__GNUC__) && (__GNUC__ == 4 && __GNUC_MINOR__ == 8))
#pragma GCC diagnostic pop
#endif
}
default_random_engine& get_random_engine()
{
static std::default_random_engine s_random_engine;
return s_random_engine;
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once

#include <random>
#include "benchmark.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/host_tensor.hpp"
#include "ngraph/runtime/tensor.hpp"
#include "ngraph/serializer.hpp"
#include "ngraph/util.hpp"
using namespace std;
using namespace ngraph;
void set_denormals_flush_to_zero();
void random_init(shared_ptr<runtime::Tensor> tensor);
std::default_random_engine& get_random_engine();
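// Fill a tensor with uniformly distributed random integers in [min, max].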
template <typename T>
void init_int_tensor(shared_ptr<runtime::Tensor> tensor, T min, T max)
{
size_t size = tensor->get_element_count();
uniform_int_distribution<T> dist(min, max);
vector<T> vec(size);
for (T& element : vec)
{
element = dist(get_random_engine());
}
tensor->write(vec.data(), vec.size() * sizeof(T));
}
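// Fill a tensor with uniformly distributed random real values in [min, max).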
template <typename T>
void init_real_tensor(shared_ptr<runtime::Tensor> tensor, T min, T max)
{
size_t size = tensor->get_element_count();
uniform_real_distribution<T> dist(min, max);
vector<T> vec(size);
for (T& element : vec)
{
element = dist(get_random_engine());
}
tensor->write(vec.data(), vec.size() * sizeof(T));
}
......@@ -24,6 +24,7 @@
#include <iomanip>
#include "benchmark.hpp"
#include "benchmark_pipelined.hpp"
#include "ngraph/distributed.hpp"
#include "ngraph/except.hpp"
#include "ngraph/file_util.hpp"
......@@ -429,7 +430,7 @@ OPTIONS
vector<runtime::PerformanceCounter> perf_data;
if (double_buffer)
{
perf_data = run_benchmark_double_buffered(
perf_data = run_benchmark_pipelined(
f, backend, iterations, timing_detail, warmup_iterations, copy_data);
}
else
......