separate benchmark and pipelined benchmark

da7a15f8 · Robert Kimball · 8b768fee · da7a15f8 · da7a15f8 · da7a15f8
Commit da7a15f8 authored Jul 16, 2019 by Robert Kimball
8 changed files
--- a/src/tools/nbench/CMakeLists.txt
+++ b/src/tools/nbench/CMakeLists.txt
@@ -17,6 +17,8 @@
 set (SRC
    nbench.cpp
    benchmark.cpp
+    benchmark_pipelined.cpp
+    benchmark_utils.cpp
 )
 add_executable(nbench ${SRC})

--- a/src/tools/nbench/benchmark.cpp
+++ b/src/tools/nbench/benchmark.cpp
@@ -14,11 +14,6 @@
 // limitations under the License.
 //*****************************************************************************
-#include <random>
-#if defined(__x86_64__) || defined(__amd64__)
-#include <xmmintrin.h>
-#endif
 #include "benchmark.hpp"
 #include "ngraph/file_util.hpp"
 #include "ngraph/runtime/backend.hpp"
@@ -26,118 +21,11 @@
 #include "ngraph/runtime/tensor.hpp"
 #include "ngraph/serializer.hpp"
 #include "ngraph/util.hpp"
+#include "benchmark_utils.hpp"
 using namespace std;
 using namespace ngraph;
-static default_random_engine s_random_engine;
-void set_denormals_flush_to_zero()
-{
-#if defined(__x86_64__) || defined(__amd64__)
-    // Avoids perf impact from denormals while benchmarking with random data
-    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
-    _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
-#endif
-}
-template <typename T>
-void init_int_tensor(shared_ptr<runtime::Tensor> tensor, T min, T max)
-{
-    size_t size = tensor->get_element_count();
-    uniform_int_distribution<T> dist(min, max);
-    vector<T> vec(size);
-    for (T& element : vec)
-    {
-        element = dist(s_random_engine);
-    }
-    tensor->write(vec.data(), vec.size() * sizeof(T));
-}
-template <>
-void init_int_tensor<char>(shared_ptr<runtime::Tensor> tensor, char min, char max)
-{
-    size_t size = tensor->get_element_count();
-    uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
-    vector<char> vec(size);
-    for (char& element : vec)
-    {
-        element = static_cast<char>(dist(s_random_engine));
-    }
-    tensor->write(vec.data(), vec.size() * sizeof(char));
-}
-template <>
-void init_int_tensor<int8_t>(shared_ptr<runtime::Tensor> tensor, int8_t min, int8_t max)
-{
-    size_t size = tensor->get_element_count();
-    uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
-    vector<int8_t> vec(size);
-    for (int8_t& element : vec)
-    {
-        element = static_cast<int8_t>(dist(s_random_engine));
-    }
-    tensor->write(vec.data(), vec.size() * sizeof(int8_t));
-}
-template <>
-void init_int_tensor<uint8_t>(shared_ptr<runtime::Tensor> tensor, uint8_t min, uint8_t max)
-{
-    size_t size = tensor->get_element_count();
-    uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
-    vector<uint8_t> vec(size);
-    for (uint8_t& element : vec)
-    {
-        element = static_cast<uint8_t>(dist(s_random_engine));
-    }
-    tensor->write(vec.data(), vec.size() * sizeof(uint8_t));
-}
-template <typename T>
-void init_real_tensor(shared_ptr<runtime::Tensor> tensor, T min, T max)
-{
-    size_t size = tensor->get_element_count();
-    uniform_real_distribution<T> dist(min, max);
-    vector<T> vec(size);
-    for (T& element : vec)
-    {
-        element = dist(s_random_engine);
-    }
-    tensor->write(vec.data(), vec.size() * sizeof(T));
-}
-static void random_init(shared_ptr<runtime::Tensor> tensor)
-{
-    element::Type et = tensor->get_element_type();
-#if !(defined(__GNUC__) && (__GNUC__ == 4 && __GNUC_MINOR__ == 8))
-#pragma GCC diagnostic push
-#pragma GCC diagnostic error "-Wswitch"
-#pragma GCC diagnostic error "-Wswitch-enum"
-#endif
-    switch (et.get_type_enum())
-    {
-    case element::Type_t::boolean: init_int_tensor<char>(tensor, 0, 1); break;
-    case element::Type_t::f32: init_real_tensor<float>(tensor, -1, 1); break;
-    case element::Type_t::f64: init_real_tensor<double>(tensor, -1, 1); break;
-    case element::Type_t::i8: init_int_tensor<int8_t>(tensor, -1, 1); break;
-    case element::Type_t::i16: init_int_tensor<int16_t>(tensor, -1, 1); break;
-    case element::Type_t::i32: init_int_tensor<int32_t>(tensor, 0, 1); break;
-    case element::Type_t::i64: init_int_tensor<int64_t>(tensor, 0, 1); break;
-    case element::Type_t::u8: init_int_tensor<uint8_t>(tensor, 0, 1); break;
-    case element::Type_t::u16: init_int_tensor<uint16_t>(tensor, 0, 1); break;
-    case element::Type_t::u32: init_int_tensor<uint32_t>(tensor, 0, 1); break;
-    case element::Type_t::u64: init_int_tensor<uint64_t>(tensor, 0, 1); break;
-    case element::Type_t::undefined:
-    case element::Type_t::dynamic:
-    case element::Type_t::bf16:
-    case element::Type_t::f16:
-    default: throw runtime_error("unsupported type");
-    }
-#if !(defined(__GNUC__) && (__GNUC__ == 4 && __GNUC_MINOR__ == 8))
-#pragma GCC diagnostic pop
-#endif
-}
 vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
                                                  const string& backend_name,
                                                  size_t iterations,
@@ -148,7 +36,7 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
    stopwatch timer;
    timer.start();
    auto backend = runtime::Backend::create(backend_name);
-    auto compiled_func = backend->compile(f, timing_detail);
+    auto exec = backend->compile(f, timing_detail);
    timer.stop();
    cout.imbue(locale(""));
    cout << "compile time: " << timer.get_milliseconds() << "ms" << endl;
@@ -209,7 +97,7 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
                }
            }
        }
-        compiled_func->call(results, args);
+        exec->call(results, args);
        if (copy_data)
        {
            for (size_t result_index = 0; result_index < results.size(); result_index++)
@@ -225,120 +113,6 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
    float time = t1.get_milliseconds();
    cout << time / iterations << "ms per iteration" << endl;
-    vector<runtime::PerformanceCounter> perf_data = compiled_func->get_performance_data();
+    vector<runtime::PerformanceCounter> perf_data = exec->get_performance_data();
-    return perf_data;
-}
-vector<runtime::PerformanceCounter> run_benchmark_double_buffered(shared_ptr<Function> f,
-                                                                  const string& backend_name,
-                                                                  size_t iterations,
-                                                                  bool timing_detail,
-                                                                  int warmup_iterations,
-                                                                  bool copy_data)
-{
-    stopwatch timer;
-    timer.start();
-    auto backend = runtime::Backend::create(backend_name);
-    auto compiled_func = backend->compile(f, timing_detail);
-    timer.stop();
-    cout.imbue(locale(""));
-    cout << "compile time: " << timer.get_milliseconds() << "ms" << endl;
-    set_denormals_flush_to_zero();
-    array<vector<shared_ptr<runtime::HostTensor>>, 2> args_data_set;
-    array<vector<shared_ptr<runtime::Tensor>>, 2> args_set;
-    array<vector<shared_ptr<runtime::HostTensor>>, 2> results_data_set;
-    array<vector<shared_ptr<runtime::Tensor>>, 2> results_set;
-    for (size_t i = 0; i < 2; i++)
-    {
-        vector<shared_ptr<runtime::HostTensor>> args_data;
-        vector<shared_ptr<runtime::Tensor>> args;
-        for (shared_ptr<op::Parameter> param : f->get_parameters())
-        {
-            auto tensor = backend->create_tensor(param->get_element_type(), param->get_shape());
-            auto tensor_data =
-                make_shared<runtime::HostTensor>(param->get_element_type(), param->get_shape());
-            random_init(tensor_data);
-            tensor->write(tensor_data->get_data_ptr(),
-                          tensor_data->get_element_count() *
-                              tensor_data->get_element_type().size());
-            args.push_back(tensor);
-            args_data.push_back(tensor_data);
-        }
-        args_set[i] = args;
-        args_data_set[i] = args_data;
-        vector<shared_ptr<runtime::Tensor>> results;
-        vector<shared_ptr<runtime::HostTensor>> results_data;
-        for (shared_ptr<Node> out : f->get_results())
-        {
-            auto result = backend->create_tensor(out->get_element_type(), out->get_shape());
-            auto result_data =
-                make_shared<runtime::HostTensor>(out->get_element_type(), out->get_shape());
-            results.push_back(result);
-            results_data.push_back(result_data);
-        }
-        results_set[i] = results;
-        results_data_set[i] = results_data;
-    }
-    stopwatch t1;
-    // Before we start we write the first iteration's data
-    size_t buffer_number = 0;
-    auto args = args_set[buffer_number];
-    auto args_data = args_data_set[buffer_number];
-    for (size_t arg_index = 0; arg_index < args.size(); arg_index++)
-    {
-        const shared_ptr<runtime::Tensor>& arg = args[arg_index];
-        const shared_ptr<runtime::HostTensor>& data = args_data[arg_index];
-        arg->begin_write(data->get_data_ptr(),
-                         data->get_element_count() * data->get_element_type().size(),
-                         buffer_number);
-    }
-    const vector<shared_ptr<runtime::Tensor>>& results = results_set[buffer_number];
-    const vector<shared_ptr<runtime::HostTensor>>& results_data = results_data_set[buffer_number];
-    for (size_t i = 0; i < iterations + warmup_iterations; i++)
-    {
-        if (i == warmup_iterations)
-        {
-            t1.start();
-        }
-        future<void> exec_future = compiled_func->begin_execute(results, args);
-        if (i > 0)
-        {
-            for (size_t result_index = 0; result_index < results.size(); result_index++)
-            {
-                const shared_ptr<runtime::HostTensor>& data = results_data[result_index];
-                const shared_ptr<runtime::Tensor>& result = results[result_index];
-                result->begin_read(data->get_data_ptr(),
-                                   data->get_element_count() * data->get_element_type().size(),
-                                   (buffer_number - 1) & 1);
-            }
-        }
-        buffer_number = (buffer_number + 1) & 1;
-        for (size_t arg_index = 0; arg_index < args.size(); arg_index++)
-        {
-            const shared_ptr<runtime::Tensor>& arg = args[arg_index];
-            const shared_ptr<runtime::HostTensor>& data = args_data[arg_index];
-            arg->begin_write(data->get_data_ptr(),
-                             data->get_element_count() * data->get_element_type().size(),
-                             buffer_number);
-        }
-        exec_future.get();
-    }
-    for (size_t result_index = 0; result_index < results.size(); result_index++)
-    {
-        const shared_ptr<runtime::HostTensor>& data = results_data[result_index];
-        const shared_ptr<runtime::Tensor>& result = results[result_index];
-        result->begin_read(data->get_data_ptr(),
-                           data->get_element_count() * data->get_element_type().size(),
-                           (buffer_number - 1) & 1);
-    }
-    t1.stop();
-    float time = t1.get_milliseconds();
-    cout << time / iterations << "ms per iteration" << endl;
-    vector<runtime::PerformanceCounter> perf_data = compiled_func->get_performance_data();
    return perf_data;
 }
--- a/src/tools/nbench/benchmark.hpp
+++ b/src/tools/nbench/benchmark.hpp
@@ -24,21 +24,9 @@
 #include "ngraph/function.hpp"
 #include "ngraph/runtime/performance_counter.hpp"
-/// performance test utilities
-std::multimap<size_t, std::string>
-    aggregate_timing(const std::vector<ngraph::runtime::PerformanceCounter>& perf_data);
 std::vector<ngraph::runtime::PerformanceCounter> run_benchmark(std::shared_ptr<ngraph::Function> f,
                                                               const std::string& backend_name,
                                                               size_t iterations,
                                                               bool timing_detail,
                                                               int warmup_iterations,
                                                               bool copy_data);
-std::vector<ngraph::runtime::PerformanceCounter>
-    run_benchmark_double_buffered(std::shared_ptr<ngraph::Function> f,
-                                  const std::string& backend_name,
-                                  size_t iterations,
-                                  bool timing_detail,
-                                  int warmup_iterations,
-                                  bool copy_data);
--- a/src/tools/nbench/benchmark_pipelined.cpp
+++ b/src/tools/nbench/benchmark_pipelined.cpp
+//*****************************************************************************
+// Copyright 2017-2019 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//*****************************************************************************
+#include "benchmark.hpp"
+#include "ngraph/file_util.hpp"
+#include "ngraph/runtime/backend.hpp"
+#include "ngraph/runtime/host_tensor.hpp"
+#include "ngraph/runtime/tensor.hpp"
+#include "ngraph/serializer.hpp"
+#include "ngraph/util.hpp"
+#include "benchmark_utils.hpp"
+using namespace std;
+using namespace ngraph;
+/// Pipelined (multi-buffered) counterpart of run_benchmark.
+///
+/// Compiles \p f on the backend named \p backend_name, prints the compile time, and
+/// prepares `pipeline_depth` sets of randomly initialised host data and backend input
+/// tensors.
+///
+/// NOTE: work in progress. The execute/read/write loop is still commented out below, so
+/// no iteration is actually run or timed; \p iterations, \p warmup_iterations and
+/// \p copy_data are not referenced by the active code yet.
+///
+/// \param f                 Function to compile.
+/// \param backend_name      Name handed to runtime::Backend::create.
+/// \param iterations        Number of timed iterations (currently unused).
+/// \param timing_detail     Forwarded to Backend::compile.
+/// \param warmup_iterations Number of untimed iterations (currently unused).
+/// \param copy_data         Currently unused.
+/// \return The executable's get_performance_data() result.
+vector<runtime::PerformanceCounter> run_benchmark_pipelined(shared_ptr<Function> f,
+                                                            const string& backend_name,
+                                                            size_t iterations,
+                                                            bool timing_detail,
+                                                            int warmup_iterations,
+                                                            bool copy_data)
+{
+    constexpr size_t pipeline_depth = 2;
+    stopwatch timer;
+    timer.start();
+    auto backend = runtime::Backend::create(backend_name);
+    auto exec = backend->compile(f, timing_detail);
+    timer.stop();
+    cout.imbue(locale(""));
+    cout << "compile time: " << timer.get_milliseconds() << "ms" << endl;
+    set_denormals_flush_to_zero();
+    const auto& parameters = f->get_parameters();
+    // Create random input data for all input tensors, one set per pipeline stage.
+    // Each stage's vector is filled in place instead of being built and then copied.
+    array<vector<shared_ptr<runtime::HostTensor>>, pipeline_depth> parameters_data_set;
+    // Only referenced by the disabled loop below.
+    array<vector<shared_ptr<runtime::HostTensor>>, pipeline_depth> results_data_set;
+    for (size_t i = 0; i < pipeline_depth; i++)
+    {
+        vector<shared_ptr<runtime::HostTensor>>& parameters_data = parameters_data_set[i];
+        for (const shared_ptr<op::Parameter>& param : parameters)
+        {
+            auto tensor_data =
+                make_shared<runtime::HostTensor>(param->get_element_type(), param->get_shape());
+            random_init(tensor_data);
+            parameters_data.push_back(tensor_data);
+        }
+    }
+    // Create input tensors for all Parameters. Only the parameter index is needed here,
+    // so iterate by index rather than copying an unused shared_ptr per parameter.
+    array<vector<shared_ptr<runtime::Tensor>>, pipeline_depth> input_tensors_array;
+    for (size_t input_index = 0; input_index < parameters.size(); input_index++)
+    {
+        auto input_tensors = exec->create_input_tensor(input_index, pipeline_depth);
+        for (size_t i = 0; i < pipeline_depth; i++)
+        {
+            input_tensors_array[i].push_back(input_tensors[i]);
+        }
+    }
+    // TODO: the remainder of the benchmark is disabled; it is kept verbatim as the plan
+    // for the pipelined execute loop.
+    // // Create output tensors for all Results
+    // array<vector<shared_ptr<runtime::Tensor>>, pipeline_depth> output_tensors_array;
+    // for (shared_ptr<Node> out : f->get_results())
+    // {
+    //     auto output_tensors = backend->create_tensor(out->get_element_type(), out->get_shape());
+    //     output_tensors_array[i] = output_tensors;
+    // }
+    // Only referenced by the disabled loop below.
+    stopwatch t1;
+    // // Before we start we write the first iteration's data
+    // size_t buffer_number = 0;
+    // auto args = input_tensors_array[buffer_number];
+    // auto args_data = parameters_data_set[buffer_number];
+    // for (size_t arg_index = 0; arg_index < args.size(); arg_index++)
+    // {
+    //     const shared_ptr<runtime::Tensor>& arg = args[arg_index];
+    //     const shared_ptr<runtime::HostTensor>& data = args_data[arg_index];
+    //     arg->begin_write(data->get_data_ptr(),
+    //                      data->get_element_count() * data->get_element_type().size(),
+    //                      buffer_number);
+    // }
+    // const vector<shared_ptr<runtime::Tensor>>& results = output_tensors[buffer_number];
+    // const vector<shared_ptr<runtime::HostTensor>>& results_data = results_data_set[buffer_number];
+    // for (size_t i = 0; i < iterations + warmup_iterations; i++)
+    // {
+    //     if (i == warmup_iterations)
+    //     {
+    //         t1.start();
+    //     }
+    //     future<void> exec_future = exec->begin_execute(results, args);
+    //     if (i > 0)
+    //     {
+    //         for (size_t result_index = 0; result_index < results.size(); result_index++)
+    //         {
+    //             const shared_ptr<runtime::HostTensor>& data = results_data[result_index];
+    //             const shared_ptr<runtime::Tensor>& result = results[result_index];
+    //             result->begin_read(data->get_data_ptr(),
+    //                                data->get_element_count() * data->get_element_type().size(),
+    //                                (buffer_number - 1) & 1);
+    //         }
+    //     }
+    //     buffer_number = (buffer_number + 1) & 1;
+    //     for (size_t arg_index = 0; arg_index < args.size(); arg_index++)
+    //     {
+    //         const shared_ptr<runtime::Tensor>& arg = args[arg_index];
+    //         const shared_ptr<runtime::HostTensor>& data = args_data[arg_index];
+    //         arg->begin_write(data->get_data_ptr(),
+    //                          data->get_element_count() * data->get_element_type().size(),
+    //                          buffer_number);
+    //     }
+    //     exec_future.get();
+    // }
+    // for (size_t result_index = 0; result_index < results.size(); result_index++)
+    // {
+    //     const shared_ptr<runtime::HostTensor>& data = results_data[result_index];
+    //     const shared_ptr<runtime::Tensor>& result = results[result_index];
+    //     result->begin_read(data->get_data_ptr(),
+    //                        data->get_element_count() * data->get_element_type().size(),
+    //                        (buffer_number - 1) & 1);
+    // }
+    // t1.stop();
+    // float time = t1.get_milliseconds();
+    // cout << time / iterations << "ms per iteration" << endl;
+    return exec->get_performance_data();
+}
--- a/src/tools/nbench/benchmark_pipelined.hpp
+++ b/src/tools/nbench/benchmark_pipelined.hpp
+//*****************************************************************************
+// Copyright 2017-2019 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//*****************************************************************************
+#pragma once
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+#include "ngraph/function.hpp"
+#include "ngraph/runtime/performance_counter.hpp"
+/// Pipelined variant of run_benchmark: compiles \p f on the backend named
+/// \p backend_name and returns the performance counters reported by the compiled
+/// executable. Takes the same arguments as run_benchmark.
+std::vector<ngraph::runtime::PerformanceCounter>
+    run_benchmark_pipelined(std::shared_ptr<ngraph::Function> f,
+                            const std::string& backend_name,
+                            size_t iterations,
+                            bool timing_detail,
+                            int warmup_iterations,
+                            bool copy_data);
--- a/src/tools/nbench/benchmark_utils.cpp
+++ b/src/tools/nbench/benchmark_utils.cpp
+//*****************************************************************************
+// Copyright 2017-2019 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//*****************************************************************************
+#if defined(__x86_64__) || defined(__amd64__)
+#include <xmmintrin.h>
+#endif
+#include "benchmark_utils.hpp"
+#include "ngraph/file_util.hpp"
+#include "ngraph/runtime/backend.hpp"
+#include "ngraph/runtime/host_tensor.hpp"
+#include "ngraph/runtime/tensor.hpp"
+#include "ngraph/serializer.hpp"
+#include "ngraph/util.hpp"
+using namespace std;
+using namespace ngraph;
+/// char specialisation of init_int_tensor: fills every element of \p tensor with a
+/// random value in [min, max].
+/// Values are drawn as int16_t and narrowed to char, because the standard only defines
+/// uniform_int_distribution for short and wider integer types.
+template <>
+void init_int_tensor<char>(shared_ptr<runtime::Tensor> tensor, char min, char max)
+{
+    const size_t element_count = tensor->get_element_count();
+    uniform_int_distribution<int16_t> distribution(static_cast<short>(min),
+                                                   static_cast<short>(max));
+    default_random_engine& engine = get_random_engine();
+    vector<char> values(element_count);
+    for (size_t i = 0; i < values.size(); ++i)
+    {
+        values[i] = static_cast<char>(distribution(engine));
+    }
+    tensor->write(values.data(), values.size() * sizeof(char));
+}
+/// int8_t specialisation of init_int_tensor: fills every element of \p tensor with a
+/// random value in [min, max].
+/// Values are drawn as int16_t and narrowed to int8_t, because the standard only defines
+/// uniform_int_distribution for short and wider integer types.
+template <>
+void init_int_tensor<int8_t>(shared_ptr<runtime::Tensor> tensor, int8_t min, int8_t max)
+{
+    const size_t element_count = tensor->get_element_count();
+    uniform_int_distribution<int16_t> distribution(static_cast<short>(min),
+                                                   static_cast<short>(max));
+    default_random_engine& engine = get_random_engine();
+    vector<int8_t> values(element_count);
+    for (size_t i = 0; i < values.size(); ++i)
+    {
+        values[i] = static_cast<int8_t>(distribution(engine));
+    }
+    tensor->write(values.data(), values.size() * sizeof(int8_t));
+}
+/// uint8_t specialisation of init_int_tensor: fills every element of \p tensor with a
+/// random value in [min, max].
+/// Values are drawn as int16_t and narrowed to uint8_t, because the standard only defines
+/// uniform_int_distribution for short and wider integer types.
+template <>
+void init_int_tensor<uint8_t>(shared_ptr<runtime::Tensor> tensor, uint8_t min, uint8_t max)
+{
+    const size_t element_count = tensor->get_element_count();
+    uniform_int_distribution<int16_t> distribution(static_cast<short>(min),
+                                                   static_cast<short>(max));
+    default_random_engine& engine = get_random_engine();
+    vector<uint8_t> values(element_count);
+    for (size_t i = 0; i < values.size(); ++i)
+    {
+        values[i] = static_cast<uint8_t>(distribution(engine));
+    }
+    tensor->write(values.data(), values.size() * sizeof(uint8_t));
+}
+/// Turns on denormals-are-zero and flush-to-zero for the calling thread on x86-64 builds;
+/// a no-op on every other target.
+/// Benchmarks feed random data, and denormal arithmetic would otherwise distort timings.
+void set_denormals_flush_to_zero()
+{
+#if defined(__x86_64__) || defined(__amd64__)
+    // NOTE(review): _MM_SET_DENORMALS_ZERO_MODE is documented under <pmmintrin.h>, while
+    // this file only includes <xmmintrin.h> -- confirm it reaches this TU transitively.
+    _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); // treat denormal inputs as zero
+    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);         // flush denormal results to zero
+#endif
+}
+/// Fills \p tensor with random values chosen according to its element type.
+///
+/// Real types are drawn from [-1, 1), i8/i16 from [-1, 1], and boolean plus the remaining
+/// integer types from [0, 1].
+/// \throws runtime_error("unsupported type") for undefined, dynamic, bf16 and f16 tensors
+///         and for any element type not listed in the switch.
+void random_init(shared_ptr<runtime::Tensor> tensor)
+{
+    element::Type element_type = tensor->get_element_type();
+// Promote switch-coverage warnings to errors so that a newly added element type cannot be
+// silently forgotten here (skipped for GCC 4.8).
+#if !(defined(__GNUC__) && (__GNUC__ == 4 && __GNUC_MINOR__ == 8))
+#pragma GCC diagnostic push
+#pragma GCC diagnostic error "-Wswitch"
+#pragma GCC diagnostic error "-Wswitch-enum"
+#endif
+    switch (element_type.get_type_enum())
+    {
+    case element::Type_t::boolean:
+        init_int_tensor<char>(tensor, 0, 1);
+        break;
+    case element::Type_t::f32:
+        init_real_tensor<float>(tensor, -1, 1);
+        break;
+    case element::Type_t::f64:
+        init_real_tensor<double>(tensor, -1, 1);
+        break;
+    case element::Type_t::i8:
+        init_int_tensor<int8_t>(tensor, -1, 1);
+        break;
+    case element::Type_t::i16:
+        init_int_tensor<int16_t>(tensor, -1, 1);
+        break;
+    case element::Type_t::i32:
+        init_int_tensor<int32_t>(tensor, 0, 1);
+        break;
+    case element::Type_t::i64:
+        init_int_tensor<int64_t>(tensor, 0, 1);
+        break;
+    case element::Type_t::u8:
+        init_int_tensor<uint8_t>(tensor, 0, 1);
+        break;
+    case element::Type_t::u16:
+        init_int_tensor<uint16_t>(tensor, 0, 1);
+        break;
+    case element::Type_t::u32:
+        init_int_tensor<uint32_t>(tensor, 0, 1);
+        break;
+    case element::Type_t::u64:
+        init_int_tensor<uint64_t>(tensor, 0, 1);
+        break;
+    // No random initialiser exists for these types.
+    case element::Type_t::undefined:
+    case element::Type_t::dynamic:
+    case element::Type_t::bf16:
+    case element::Type_t::f16:
+    default:
+        throw runtime_error("unsupported type");
+    }
+#if !(defined(__GNUC__) && (__GNUC__ == 4 && __GNUC_MINOR__ == 8))
+#pragma GCC diagnostic pop
+#endif
+}
+/// Returns the single random engine shared by all tensor initialisers.
+/// The engine is default-constructed on first use and never reseeded here.
+default_random_engine& get_random_engine()
+{
+    static default_random_engine engine;
+    return engine;
+}
--- a/src/tools/nbench/benchmark_utils.hpp
+++ b/src/tools/nbench/benchmark_utils.hpp
+//*****************************************************************************
+// Copyright 2017-2019 Intel Corporation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//*****************************************************************************
+#pragma once
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <random>
+#include <vector>
+#include "benchmark.hpp"
+#include "ngraph/file_util.hpp"
+#include "ngraph/runtime/backend.hpp"
+#include "ngraph/runtime/host_tensor.hpp"
+#include "ngraph/runtime/tensor.hpp"
+#include "ngraph/serializer.hpp"
+#include "ngraph/util.hpp"
+// No using-directives here: a header must not inject namespaces into every file that
+// includes it, so all names below are fully qualified.
+/// Enables flush-to-zero / denormals-are-zero on x86-64 builds; no-op elsewhere.
+void set_denormals_flush_to_zero();
+/// Fills \p tensor with random data chosen by element type; throws std::runtime_error
+/// for element types without an initialiser.
+void random_init(std::shared_ptr<ngraph::runtime::Tensor> tensor);
+/// Returns the random engine shared by the initialisers below.
+std::default_random_engine& get_random_engine();
+/// Fills every element of \p tensor with a uniformly distributed integer in [min, max].
+/// T must be a type std::uniform_int_distribution accepts (short or wider); the 8-bit
+/// types are handled by the explicit specialisations declared below.
+template <typename T>
+void init_int_tensor(std::shared_ptr<ngraph::runtime::Tensor> tensor, T min, T max)
+{
+    std::size_t size = tensor->get_element_count();
+    std::uniform_int_distribution<T> dist(min, max);
+    std::vector<T> vec(size);
+    for (T& element : vec)
+    {
+        element = dist(get_random_engine());
+    }
+    tensor->write(vec.data(), vec.size() * sizeof(T));
+}
+// Explicit specialisations defined in benchmark_utils.cpp. They must be declared here so
+// that no translation unit implicitly instantiates the primary template for these types.
+template <>
+void init_int_tensor<char>(std::shared_ptr<ngraph::runtime::Tensor> tensor, char min, char max);
+template <>
+void init_int_tensor<std::int8_t>(std::shared_ptr<ngraph::runtime::Tensor> tensor,
+                                  std::int8_t min,
+                                  std::int8_t max);
+template <>
+void init_int_tensor<std::uint8_t>(std::shared_ptr<ngraph::runtime::Tensor> tensor,
+                                   std::uint8_t min,
+                                   std::uint8_t max);
+/// Fills every element of \p tensor with a uniformly distributed real value in [min, max).
+template <typename T>
+void init_real_tensor(std::shared_ptr<ngraph::runtime::Tensor> tensor, T min, T max)
+{
+    std::size_t size = tensor->get_element_count();
+    std::uniform_real_distribution<T> dist(min, max);
+    std::vector<T> vec(size);
+    for (T& element : vec)
+    {
+        element = dist(get_random_engine());
+    }
+    tensor->write(vec.data(), vec.size() * sizeof(T));
+}
--- a/src/tools/nbench/nbench.cpp
+++ b/src/tools/nbench/nbench.cpp
@@ -24,6 +24,7 @@
 #include <iomanip>
 #include "benchmark.hpp"
+#include "benchmark_pipelined.hpp"
 #include "ngraph/distributed.hpp"
 #include "ngraph/except.hpp"
 #include "ngraph/file_util.hpp"
@@ -429,7 +430,7 @@ OPTIONS
                vector<runtime::PerformanceCounter> perf_data;
                if (double_buffer)
                {
-                    perf_data = run_benchmark_double_buffered(
+                    perf_data = run_benchmark_pipelined(
                        f, backend, iterations, timing_detail, warmup_iterations, copy_data);
                }
                else