Commit d34fb157 (unverified)
Authored Jul 24, 2019 by Robert Kimball; committed by GitHub on Jul 24, 2019
Merge pull request #3284 from NervanaSystems/bob/nbench_db

Add nbench support for double buffering

Parents: a509de7b, 47626835

Showing 14 changed files with 893 additions and 114 deletions
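In outline: the change adds a Chrome-tracing event recorder (src/ngraph/runtime/chrome_trace.cpp/.hpp), instruments HostTensor reads/writes and the INTERPRETER executable with trace events, gives INTExecutable pipelined create_input_tensor/create_output_tensor overloads, and adds a --double_buffer mode to nbench backed by a new run_benchmark_pipelined driver.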
src/ngraph/CMakeLists.txt                             +2   -0
src/ngraph/runtime/chrome_trace.cpp                 +240   -0
src/ngraph/runtime/chrome_trace.hpp                 +144   -0
src/ngraph/runtime/host_tensor.cpp                    +4   -0
src/ngraph/runtime/interpreter/int_executable.cpp    +75   -0
src/ngraph/runtime/interpreter/int_executable.hpp    +12   -0
src/tools/nbench/CMakeLists.txt                       +2   -0
src/tools/nbench/benchmark.cpp                        +4 -108
src/tools/nbench/benchmark.hpp                        +0   -4
src/tools/nbench/benchmark_pipelined.cpp            +186   -0
src/tools/nbench/benchmark_pipelined.hpp             +33   -0
src/tools/nbench/benchmark_utils.cpp                +116   -0
src/tools/nbench/benchmark_utils.hpp                 +57   -0
src/tools/nbench/nbench.cpp                          +18   -2
src/ngraph/CMakeLists.txt

@@ -463,6 +463,8 @@ set (SRC
     runtime/backend.hpp
     runtime/backend_manager.cpp
     runtime/backend_manager.hpp
+    runtime/chrome_trace.cpp
+    runtime/chrome_trace.hpp
     runtime/executable.cpp
     runtime/executable.hpp
     runtime/host_tensor.cpp
src/ngraph/runtime/chrome_trace.cpp  (new file, 0 → 100644)

//*****************************************************************************
// Copyright 2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include "chrome_trace.hpp"
#include "ngraph/log.hpp"
using namespace std;
using namespace ngraph;

static bool read_tracing_env_var()
{
    static const bool is_enabled = (getenv("NGRAPH_ENABLE_TRACING") != nullptr);
    return is_enabled;
}

mutex runtime::event::Manager::s_file_mutex;
bool runtime::event::Manager::s_tracing_enabled = read_tracing_env_var();

runtime::event::Duration::Duration(const string& name, const string& category, const string& args)
{
    if (Manager::is_tracing_enabled())
    {
        m_start = Manager::get_current_microseconds();
        m_stop = 0;
        m_name = name;
        m_category = category;
        m_args = args;
    }
}

void runtime::event::Duration::stop()
{
    if (Manager::is_tracing_enabled())
    {
        m_stop = Manager::get_current_microseconds();
    }
}

void runtime::event::Duration::write()
{
    if (Manager::is_tracing_enabled())
    {
        size_t stop_time = (m_stop != 0 ? m_stop : Manager::get_current_microseconds());
        lock_guard<mutex> lock(Manager::get_mutex());
        ofstream& out = runtime::event::Manager::get_output_stream();
        if (out.is_open() == false)
        {
            runtime::event::Manager::open();
        }
        else
        {
            Manager::get_output_stream() << ",\n";
        }
        Manager::get_output_stream()
            << R"({"name":")" << m_name << R"(","cat":")" << m_category
            << R"(","ph":"X","pid":)" << Manager::get_process_id() << R"(,"tid":)"
            << Manager::get_thread_id() << R"(,"ts":)" << m_start << R"(,"dur":)"
            << (stop_time - m_start);
        if (!m_args.empty())
        {
            out << R"(,"args":)" << m_args;
        }
        out << "}";
    }
}

runtime::event::Object::Object(const string& name, const string& args)
    : m_name{name}
    , m_id{static_cast<size_t>(chrono::high_resolution_clock::now().time_since_epoch().count())}
{
    if (Manager::is_tracing_enabled())
    {
        lock_guard<mutex> lock(Manager::get_mutex());
        ofstream& out = runtime::event::Manager::get_output_stream();
        if (out.is_open() == false)
        {
            runtime::event::Manager::open();
        }
        else
        {
            Manager::get_output_stream() << ",\n";
        }
        out << R"({"name":")" << m_name << R"(","ph":"N","id":")" << m_id << R"(","ts":)"
            << Manager::get_current_microseconds() << R"(,"pid":)" << Manager::get_process_id()
            << R"(,"tid":)" << Manager::get_thread_id();
        if (!args.empty())
        {
            out << R"(,"args":)" << args;
        }
        out << "}";
        write_snapshot(out, args);
    }
}

void runtime::event::Object::snapshot(const string& args)
{
    if (Manager::is_tracing_enabled())
    {
        lock_guard<mutex> lock(Manager::get_mutex());
        ofstream& out = runtime::event::Manager::get_output_stream();
        if (out.is_open() == false)
        {
            runtime::event::Manager::open();
        }
        else
        {
            Manager::get_output_stream() << ",\n";
        }
        write_snapshot(out, args);
    }
}

void runtime::event::Object::write_snapshot(ostream& out, const string& args)
{
    out << R"({"name":")" << m_name << R"(","ph":"O","id":")" << m_id << R"(","ts":)"
        << Manager::get_current_microseconds() << R"(,"pid":)" << Manager::get_process_id()
        << R"(,"tid":)" << Manager::get_thread_id();
    if (!args.empty())
    {
        out << R"(,"args":)" << args;
    }
    out << "}";
}

void runtime::event::Object::destroy()
{
    if (Manager::is_tracing_enabled())
    {
        lock_guard<mutex> lock(Manager::get_mutex());
        ofstream& out = runtime::event::Manager::get_output_stream();
        if (out.is_open() == false)
        {
            runtime::event::Manager::open();
        }
        else
        {
            Manager::get_output_stream() << ",\n";
        }
        out << R"({"name":")" << m_name << R"(","ph":"D","id":")" << m_id << R"(","ts":)"
            << Manager::get_current_microseconds() << R"(,"pid":)" << Manager::get_process_id()
            << R"(,"tid":)" << Manager::get_thread_id() << "}";
    }
}

void runtime::event::Manager::open(const string& path)
{
    ofstream& out = get_output_stream();
    if (out.is_open() == false)
    {
        out.open(path, ios_base::trunc);
        out << "[\n";
    }
}

void runtime::event::Manager::close()
{
    ofstream& out = get_output_stream();
    if (out.is_open())
    {
        out << "\n]\n";
        out.close();
    }
}

ofstream& runtime::event::Manager::get_output_stream()
{
    static ofstream s_event_log;
    return s_event_log;
}

const string& runtime::event::Manager::get_process_id()
{
    static const string s_pid = to_string(getpid());
    return s_pid;
}

void runtime::event::Manager::enable_event_tracing()
{
    s_tracing_enabled = true;
}

void runtime::event::Manager::disable_event_tracing()
{
    s_tracing_enabled = false;
}

bool runtime::event::Manager::is_event_tracing_enabled()
{
    return s_tracing_enabled;
}

string runtime::event::Manager::get_thread_id()
{
    thread::id tid = this_thread::get_id();
    static map<thread::id, string> tid_map;
    auto it = tid_map.find(tid);
    string rc;
    if (it == tid_map.end())
    {
        stringstream ss;
        ss << "\"" << tid << "\"";
        rc = ss.str();
        tid_map.insert({tid, rc});
    }
    else
    {
        rc = it->second;
    }
    return rc;
}
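For orientation, Duration::write() above emits a Chrome-trace "complete" event (ph "X"). With purely illustrative timestamps and IDs, a single record in the trace file would look like:

{"name":"write","cat":"HostTensor","ph":"X","pid":12345,"tid":"140123456","ts":1050,"dur":42}

Note that the pid is written bare while the tid is quoted, matching get_process_id() and get_thread_id() above.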
src/ngraph/runtime/chrome_trace.hpp  (new file, 0 → 100644)

//*****************************************************************************
// Copyright 2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <chrono>
#include <fstream>
#include <iostream>
#include <mutex>
#include <string>
#include <thread>
#ifdef _WIN32
#include <windows.h>
// windows.h must be before processthreadsapi.h so we need this comment
#include <processthreadsapi.h>
#define getpid() GetCurrentProcessId()
#else
#include <unistd.h>
#endif
namespace ngraph
{
    namespace runtime
    {
        namespace event
        {
            class Duration;
            class Object;
            class Manager;
        }
    }
}
//
// This class records timestamps for a given user defined event and
// produces output in the chrome tracing format that can be used to view
// the events of a running program
//
// Following is the format of a trace event
//
// {
// "name": "myName",
// "cat": "category,list",
// "ph": "B",
// "ts": 12345,
// "pid": 123,
// "tid": 456,
// "args": {
// "someArg": 1,
// "anotherArg": {
// "value": "my value"
// }
// }
// }
//
// The trace file format is defined here:
// https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview
//
// The trace file can be viewed by Chrome browser using the
// URL: chrome://tracing/
//
// More information about this is at:
// http://dev.chromium.org/developers/how-tos/trace-event-profiling-tool
class ngraph::runtime::event::Manager
{
    friend class Duration;
    friend class Object;

public:
    static void open(const std::string& path = "runtime_event_trace.json");
    static void close();
    static bool is_tracing_enabled() { return s_tracing_enabled; }
    static void enable_event_tracing();
    static void disable_event_tracing();
    static bool is_event_tracing_enabled();

private:
    static std::ofstream& get_output_stream();
    static const std::string& get_process_id();
    static size_t get_current_microseconds()
    {
        return std::chrono::high_resolution_clock::now().time_since_epoch().count() / 1000;
    }
    static std::string get_thread_id();
    static std::mutex& get_mutex() { return s_file_mutex; }

    static std::ostream s_ostream;
    static std::mutex s_file_mutex;
    static bool s_tracing_enabled;
};
class ngraph::runtime::event::Duration
{
public:
    explicit Duration(const std::string& name,
                      const std::string& category,
                      const std::string& args = "");

    ~Duration() { write(); }

    /// \brief Stop the timer without writing the data to the log file. To write the data,
    ///     call the `write` method. Calls to stop() are optional.
    void stop();

    /// \brief Write the log data to the log file for this event.
    ///     This function has an implicit stop() if stop() has not been previously called.
    void write();

    Duration(const Duration&) = delete;
    Duration& operator=(Duration const&) = delete;

private:
    std::string to_json() const;

    size_t m_start;
    size_t m_stop;
    std::string m_name;
    std::string m_category;
    std::string m_args;
};
class ngraph::runtime::event::Object
{
public:
    Object(const std::string& name, const std::string& args);
    void snapshot(const std::string& args);
    void destroy();

private:
    void write_snapshot(std::ostream& out, const std::string& args);

    const std::string m_name;
    size_t m_id;
};
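To make the new API concrete, here is a minimal usage sketch (not part of the diff; do_work and main are illustrative). Duration is an RAII timer whose destructor writes the event, and Manager controls the trace file:

#include "ngraph/runtime/chrome_trace.hpp"

using namespace ngraph;

void do_work() // hypothetical function being profiled
{
    runtime::event::Duration timer("do_work", "Example");
    // ... the code being measured; ~Duration() writes the event ...
}

int main()
{
    // Tracing can also be enabled by setting NGRAPH_ENABLE_TRACING in the environment.
    runtime::event::Manager::enable_event_tracing();
    do_work();
    runtime::event::Manager::close(); // terminates the JSON array in runtime_event_trace.json
}

The resulting file loads directly into chrome://tracing, as the header comment above notes.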
src/ngraph/runtime/host_tensor.cpp

@@ -18,6 +18,7 @@
 #include <memory>
 #include "ngraph/descriptor/layout/dense_tensor_layout.hpp"
+#include "ngraph/runtime/chrome_trace.hpp"
 #include "ngraph/runtime/host_tensor.hpp"
 #include "ngraph/util.hpp"
@@ -96,6 +97,8 @@ const char* runtime::HostTensor::get_data_ptr() const
 void runtime::HostTensor::write(const void* source, size_t n)
 {
+    runtime::event::Duration d1("write", "HostTensor");
     if (n > m_buffer_size)
     {
         throw out_of_range("write access past end of tensor");
@@ -106,6 +109,7 @@ void runtime::HostTensor::write(const void* source, size_t n)
 void runtime::HostTensor::read(void* target, size_t n) const
 {
+    runtime::event::Duration d1("read", "HostTensor");
     if (n > m_buffer_size)
     {
         throw out_of_range("read access past end of tensor");
src/ngraph/runtime/interpreter/int_executable.cpp

@@ -30,6 +30,7 @@
 #include "ngraph/pass/manager.hpp"
 #include "ngraph/pass/memory_layout.hpp"
 #include "ngraph/runtime/backend_manager.hpp"
+#include "ngraph/runtime/chrome_trace.hpp"
 #include "ngraph/serializer.hpp"
 #include "ngraph/util.hpp"
@@ -74,6 +75,8 @@ runtime::interpreter::INTExecutable::INTExecutable(const std::string& model_stri
 bool runtime::interpreter::INTExecutable::call(const vector<shared_ptr<runtime::Tensor>>& outputs,
                                                const vector<shared_ptr<runtime::Tensor>>& inputs)
 {
+    runtime::event::Duration d1("call", "Interpreter");
     // convert inputs to HostTensor
     vector<shared_ptr<HostTensor>> func_inputs;
     for (auto tensor : inputs)
@@ -122,6 +125,7 @@ bool runtime::interpreter::INTExecutable::call(const vector<shared_ptr<runtime::
     for (const NodeWrapper& wrapped : m_wrapped_nodes)
     {
         auto op = wrapped.get_node();
+        runtime::event::Duration d2(op->description(), "Interpreter");
         auto type_id = wrapped.get_typeid();
         if (type_id == OP_TYPEID::Parameter)
         {
@@ -304,3 +308,74 @@ void runtime::interpreter::INTExecutable::save(ostream& out)
     string model = serialize(m_function, 0);
     writer.write("model", model.data(), model.size());
 }
+
+shared_ptr<ngraph::op::Parameter>
+    runtime::interpreter::INTExecutable::get_parameter(size_t index) const
+{
+    const ParameterVector& parameters = get_parameters();
+    NGRAPH_CHECK(index < parameters.size(), "create_tensor for input out of bounds");
+    return parameters[index];
+}
+
+shared_ptr<ngraph::op::Result> runtime::interpreter::INTExecutable::get_result(size_t index) const
+{
+    const ResultVector& results = get_results();
+    NGRAPH_CHECK(index < results.size(), "create_tensor for input out of bounds");
+    return results[index];
+}
+
+shared_ptr<runtime::Tensor>
+    runtime::interpreter::INTExecutable::create_input_tensor(size_t input_index)
+{
+    shared_ptr<op::Parameter> parameter = get_parameter(input_index);
+    return make_shared<runtime::HostTensor>(parameter->get_element_type(), parameter->get_shape());
+}
+
+shared_ptr<runtime::Tensor>
+    runtime::interpreter::INTExecutable::create_output_tensor(size_t output_index)
+{
+    shared_ptr<op::Result> result = get_result(output_index);
+    return make_shared<runtime::HostTensor>(result->get_element_type(), result->get_shape());
+}
+
+vector<shared_ptr<runtime::Tensor>>
+    runtime::interpreter::INTExecutable::create_input_tensor(size_t input_index,
+                                                             size_t pipeline_depth)
+{
+    vector<shared_ptr<runtime::HostTensor>> tensors;
+    shared_ptr<op::Parameter> parameter = get_parameter(input_index);
+    for (size_t i = 0; i < pipeline_depth; i++)
+    {
+        shared_ptr<runtime::HostTensor> tensor;
+        auto t = make_shared<runtime::HostTensor>(parameter->get_element_type(),
+                                                  parameter->get_shape());
+        tensor = static_pointer_cast<runtime::HostTensor>(t);
+        tensors.push_back(tensor);
+    }
+    vector<shared_ptr<runtime::Tensor>> result_tensors;
+    for (const shared_ptr<runtime::HostTensor>& tensor : tensors)
+    {
+        result_tensors.push_back(tensor);
+    }
+    return result_tensors;
+}
+
+vector<shared_ptr<runtime::Tensor>>
+    runtime::interpreter::INTExecutable::create_output_tensor(size_t output_index,
+                                                              size_t pipeline_depth)
+{
+    vector<shared_ptr<runtime::HostTensor>> tensors;
+    shared_ptr<op::Result> result = get_result(output_index);
+    for (size_t i = 0; i < pipeline_depth; i++)
+    {
+        shared_ptr<runtime::HostTensor> tensor;
+        auto t = make_shared<runtime::HostTensor>(result->get_element_type(), result->get_shape());
+        tensor = static_pointer_cast<runtime::HostTensor>(t);
+        tensors.push_back(tensor);
+    }
+    vector<shared_ptr<runtime::Tensor>> result_tensors;
+    for (const shared_ptr<runtime::HostTensor>& tensor : tensors)
+    {
+        result_tensors.push_back(tensor);
+    }
+    return result_tensors;
+}
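The pipelined overloads above are what the double-buffered nbench path consumes. A condensed sketch of the calling pattern (the backend name and indices are illustrative, not part of the diff):

auto backend = runtime::Backend::create("INTERPRETER");
auto exec = backend->compile(f, false /* timing_detail */); // f: shared_ptr<Function>
// Two tensors for input 0: element [0] serves even iterations,
// element [1] odd ones (see benchmark_pipelined.cpp below).
vector<shared_ptr<runtime::Tensor>> staged = exec->create_input_tensor(0, 2);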
src/ngraph/runtime/interpreter/int_executable.hpp

@@ -186,9 +186,21 @@ public:
     std::vector<PerformanceCounter> get_performance_data() const override;
+
+    std::shared_ptr<runtime::Tensor> create_input_tensor(size_t input_index) override;
+
+    std::shared_ptr<runtime::Tensor> create_output_tensor(size_t output_index) override;
+
+    std::vector<std::shared_ptr<runtime::Tensor>>
+        create_input_tensor(size_t input_index, size_t pipeline_depth) override;
+
+    std::vector<std::shared_ptr<runtime::Tensor>>
+        create_output_tensor(size_t output_index, size_t pipeline_depth) override;

 private:
     INTExecutable(const std::string& model_string);
+    std::shared_ptr<ngraph::op::Parameter> get_parameter(size_t index) const;
+    std::shared_ptr<ngraph::op::Result> get_result(size_t index) const;
     int get_alignment() const { return 64; }
     bool m_is_compiled = false;
     bool m_nan_check_enabled = false;
src/tools/nbench/CMakeLists.txt

@@ -17,6 +17,8 @@
 set (SRC
     nbench.cpp
     benchmark.cpp
+    benchmark_pipelined.cpp
+    benchmark_utils.cpp
 )

 add_executable(nbench ${SRC})
src/tools/nbench/benchmark.cpp

@@ -14,12 +14,8 @@
 // limitations under the License.
 //*****************************************************************************

-#include <random>
-#if defined(__x86_64__) || defined(__amd64__)
-#include <xmmintrin.h>
-#endif
 #include "benchmark.hpp"
+#include "benchmark_utils.hpp"
 #include "ngraph/file_util.hpp"
 #include "ngraph/runtime/backend.hpp"
 #include "ngraph/runtime/host_tensor.hpp"
@@ -30,106 +26,6 @@
 using namespace std;
 using namespace ngraph;

-static default_random_engine s_random_engine;
-
-void set_denormals_flush_to_zero()
-{
-#if defined(__x86_64__) || defined(__amd64__)
-    // Avoids perf impact from denormals while benchmarking with random data
-    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
-    _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
-#endif
-}
-
-template <typename T>
-void init_int_tv(shared_ptr<runtime::Tensor> tv, T min, T max)
-{
-    size_t size = tv->get_element_count();
-    uniform_int_distribution<T> dist(min, max);
-    vector<T> vec(size);
-    for (T& element : vec)
-    {
-        element = dist(s_random_engine);
-    }
-    tv->write(vec.data(), vec.size() * sizeof(T));
-}
-
-template <>
-void init_int_tv<char>(shared_ptr<runtime::Tensor> tv, char min, char max)
-{
-    size_t size = tv->get_element_count();
-    uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
-    vector<char> vec(size);
-    for (char& element : vec)
-    {
-        element = static_cast<char>(dist(s_random_engine));
-    }
-    tv->write(vec.data(), vec.size() * sizeof(char));
-}
-
-template <>
-void init_int_tv<int8_t>(shared_ptr<runtime::Tensor> tv, int8_t min, int8_t max)
-{
-    size_t size = tv->get_element_count();
-    uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
-    vector<int8_t> vec(size);
-    for (int8_t& element : vec)
-    {
-        element = static_cast<int8_t>(dist(s_random_engine));
-    }
-    tv->write(vec.data(), vec.size() * sizeof(int8_t));
-}
-
-template <>
-void init_int_tv<uint8_t>(shared_ptr<runtime::Tensor> tv, uint8_t min, uint8_t max)
-{
-    size_t size = tv->get_element_count();
-    uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
-    vector<uint8_t> vec(size);
-    for (uint8_t& element : vec)
-    {
-        element = static_cast<uint8_t>(dist(s_random_engine));
-    }
-    tv->write(vec.data(), vec.size() * sizeof(uint8_t));
-}
-
-template <typename T>
-void init_real_tv(shared_ptr<runtime::Tensor> tv, T min, T max)
-{
-    size_t size = tv->get_element_count();
-    uniform_real_distribution<T> dist(min, max);
-    vector<T> vec(size);
-    for (T& element : vec)
-    {
-        element = dist(s_random_engine);
-    }
-    tv->write(vec.data(), vec.size() * sizeof(T));
-}
-
-static void random_init(shared_ptr<runtime::Tensor> tv)
-{
-    element::Type et = tv->get_element_type();
-    switch (et.get_type_enum())
-    {
-    case element::Type_t::boolean: init_int_tv<char>(tv, 0, 1); break;
-    case element::Type_t::f32: init_real_tv<float>(tv, -1, 1); break;
-    case element::Type_t::f64: init_real_tv<double>(tv, -1, 1); break;
-    case element::Type_t::i8: init_int_tv<int8_t>(tv, -1, 1); break;
-    case element::Type_t::i16: init_int_tv<int16_t>(tv, -1, 1); break;
-    case element::Type_t::i32: init_int_tv<int32_t>(tv, 0, 1); break;
-    case element::Type_t::i64: init_int_tv<int64_t>(tv, 0, 1); break;
-    case element::Type_t::u8: init_int_tv<uint8_t>(tv, 0, 1); break;
-    case element::Type_t::u16: init_int_tv<uint16_t>(tv, 0, 1); break;
-    case element::Type_t::u32: init_int_tv<uint32_t>(tv, 0, 1); break;
-    case element::Type_t::u64: init_int_tv<uint64_t>(tv, 0, 1); break;
-    case element::Type_t::undefined:
-    case element::Type_t::dynamic:
-    case element::Type_t::bf16:
-    case element::Type_t::f16:
-    default: throw runtime_error("unsupported type");
-    }
-}

 vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
                                                   const string& backend_name,
                                                   size_t iterations,
@@ -140,7 +36,7 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
     stopwatch timer;
     timer.start();
     auto backend = runtime::Backend::create(backend_name);
-    auto compiled_func = backend->compile(f, timing_detail);
+    auto exec = backend->compile(f, timing_detail);
     timer.stop();
     cout.imbue(locale(""));
     cout << "compile time: " << timer.get_milliseconds() << "ms" << endl;
@@ -201,7 +97,7 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
             }
         }
     }
-    compiled_func->call(results, args);
+    exec->call(results, args);
     if (copy_data)
     {
         for (size_t result_index = 0; result_index < results.size(); result_index++)
@@ -217,6 +113,6 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
     float time = t1.get_milliseconds();
     cout << time / iterations << "ms per iteration" << endl;
-    vector<runtime::PerformanceCounter> perf_data = compiled_func->get_performance_data();
+    vector<runtime::PerformanceCounter> perf_data = exec->get_performance_data();
     return perf_data;
 }
src/tools/nbench/benchmark.hpp

@@ -24,10 +24,6 @@
 #include "ngraph/function.hpp"
 #include "ngraph/runtime/performance_counter.hpp"

 /// performance test utilities
 std::multimap<size_t, std::string>
     aggregate_timing(const std::vector<ngraph::runtime::PerformanceCounter>& perf_data);

 std::vector<ngraph::runtime::PerformanceCounter>
     run_benchmark(std::shared_ptr<ngraph::Function> f,
                   const std::string& backend_name,
                   size_t iterations,
src/tools/nbench/benchmark_pipelined.cpp  (new file, 0 → 100644)

//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <array>
#include <condition_variable>
#include <mutex>
#include <thread>
#include "benchmark.hpp"
#include "benchmark_utils.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/host_tensor.hpp"
#include "ngraph/runtime/tensor.hpp"
#include "ngraph/serializer.hpp"
#include "ngraph/util.hpp"
using namespace std;
using namespace ngraph;

class TensorCollection
{
public:
    vector<shared_ptr<runtime::HostTensor>> parameter_data;
    vector<shared_ptr<runtime::HostTensor>> result_data;
    vector<shared_ptr<runtime::Tensor>> input_tensors;
    vector<shared_ptr<runtime::Tensor>> output_tensors;

private:
};

static mutex s_mutex;
static condition_variable s_condition;
static size_t current_iteration = 0;
static size_t s_iterations;
static size_t s_warmup_iterations;
static stopwatch s_timer;

static void thread_entry(runtime::Executable* exec,
                         const TensorCollection& tensors,
                         size_t pipeline_stage)
{
    bool data_written = false;
    const vector<shared_ptr<runtime::Tensor>>& args = tensors.input_tensors;
    const vector<shared_ptr<runtime::Tensor>>& results = tensors.output_tensors;
    while (current_iteration < s_iterations + s_warmup_iterations)
    {
        if (!data_written)
        {
            for (size_t arg_index = 0; arg_index < args.size(); arg_index++)
            {
                const shared_ptr<runtime::Tensor>& arg = args[arg_index];
                if (arg->get_stale())
                {
                    const shared_ptr<runtime::HostTensor>& data =
                        tensors.parameter_data[arg_index];
                    arg->write(data->get_data_ptr(),
                               data->get_element_count() * data->get_element_type().size());
                }
            }
            data_written = true;
        }
        unique_lock<mutex> lock(s_mutex);
        if ((current_iteration & 1) != pipeline_stage)
        {
            s_condition.wait(lock);
        }
        else
        {
            if (current_iteration == s_warmup_iterations)
            {
                s_timer.start();
            }
            // our turn to run
            exec->call(results, args);
            current_iteration++;
            data_written = false;
            s_condition.notify_all();
            lock.unlock();
            for (size_t result_index = 0; result_index < results.size(); result_index++)
            {
                const shared_ptr<runtime::HostTensor>& data = tensors.result_data[result_index];
                const shared_ptr<runtime::Tensor>& result = results[result_index];
                result->read(data->get_data_ptr(),
                             data->get_element_count() * data->get_element_type().size());
            }
        }
    }
}
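// How the two stages interleave: each thread owns one TensorCollection and only
// calls exec->call() when (current_iteration & 1) matches its pipeline_stage.
// While one stage executes, the other refills its stale inputs and drains its
// outputs, overlapping host-side copies with execution -- the double buffering
// that the new --double_buffer flag exercises.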
vector<runtime::PerformanceCounter> run_benchmark_pipelined(shared_ptr<Function> f,
                                                            const string& backend_name,
                                                            size_t iterations,
                                                            bool timing_detail,
                                                            int warmup_iterations,
                                                            bool copy_data)
{
    constexpr size_t pipeline_depth = 2;
    s_iterations = iterations;
    s_warmup_iterations = warmup_iterations;
    array<TensorCollection, pipeline_depth> tensor_collections;
    stopwatch timer;
    timer.start();
    auto backend = runtime::Backend::create(backend_name);
    auto exec = backend->compile(f, timing_detail);
    timer.stop();
    cout.imbue(locale(""));
    cout << "compile time: " << timer.get_milliseconds() << "ms" << endl;

    set_denormals_flush_to_zero();

    // Create random input data for all input tensors
    for (size_t i = 0; i < pipeline_depth; i++)
    {
        for (shared_ptr<op::Parameter> param : f->get_parameters())
        {
            auto tensor_data =
                make_shared<runtime::HostTensor>(param->get_element_type(), param->get_shape());
            random_init(tensor_data);
            tensor_collections[i].parameter_data.push_back(tensor_data);
        }
    }

    // Create output tensors for all outputs
    for (size_t i = 0; i < pipeline_depth; i++)
    {
        for (shared_ptr<Node> result : f->get_results())
        {
            auto tensor_data =
                make_shared<runtime::HostTensor>(result->get_element_type(), result->get_shape());
            tensor_collections[i].result_data.push_back(tensor_data);
        }
    }

    // Create input tensors for all Parameters
    array<vector<shared_ptr<runtime::Tensor>>, pipeline_depth> input_tensors_array;
    size_t input_index = 0;
    for (shared_ptr<op::Parameter> param : f->get_parameters())
    {
        auto input_tensors = exec->create_input_tensor(input_index++, pipeline_depth);
        for (size_t i = 0; i < pipeline_depth; i++)
        {
            tensor_collections[i].input_tensors.push_back(input_tensors[i]);
        }
    }

    // Create output tensors for all Results
    array<vector<shared_ptr<runtime::Tensor>>, pipeline_depth> output_tensors_array;
    size_t output_index = 0;
    for (shared_ptr<Node> result : f->get_results())
    {
        auto output_tensors = exec->create_output_tensor(output_index++, pipeline_depth);
        for (size_t i = 0; i < pipeline_depth; i++)
        {
            tensor_collections[i].output_tensors.push_back(output_tensors[i]);
        }
    }

    thread threads[pipeline_depth];
    for (size_t i = 0; i < pipeline_depth; i++)
    {
        threads[i] = thread(thread_entry, exec.get(), tensor_collections[i], i);
    }

    for (size_t i = 0; i < pipeline_depth; i++)
    {
        threads[i].join();
    }
    s_timer.stop();

    float time = s_timer.get_milliseconds();
    cout << time / iterations << "ms per iteration" << endl;

    vector<runtime::PerformanceCounter> perf_data = exec->get_performance_data();
    return perf_data;
}
src/tools/nbench/benchmark_pipelined.hpp  (new file, 0 → 100644)

//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "ngraph/function.hpp"
#include "ngraph/runtime/performance_counter.hpp"
std::vector<ngraph::runtime::PerformanceCounter>
    run_benchmark_pipelined(std::shared_ptr<ngraph::Function> f,
                            const std::string& backend_name,
                            size_t iterations,
                            bool timing_detail,
                            int warmup_iterations,
                            bool copy_data);
src/tools/nbench/benchmark_utils.cpp  (new file, 0 → 100644)

//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#if defined(__x86_64__) || defined(__amd64__)
#include <xmmintrin.h>
#endif
#include "benchmark_utils.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/host_tensor.hpp"
#include "ngraph/runtime/tensor.hpp"
#include "ngraph/serializer.hpp"
#include "ngraph/util.hpp"
using namespace std;
using namespace ngraph;

template <>
void init_int_tensor<char>(shared_ptr<runtime::Tensor> tensor, char min, char max)
{
    size_t size = tensor->get_element_count();
    uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
    vector<char> vec(size);
    for (char& element : vec)
    {
        element = static_cast<char>(dist(get_random_engine()));
    }
    tensor->write(vec.data(), vec.size() * sizeof(char));
}

template <>
void init_int_tensor<int8_t>(shared_ptr<runtime::Tensor> tensor, int8_t min, int8_t max)
{
    size_t size = tensor->get_element_count();
    uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
    vector<int8_t> vec(size);
    for (int8_t& element : vec)
    {
        element = static_cast<int8_t>(dist(get_random_engine()));
    }
    tensor->write(vec.data(), vec.size() * sizeof(int8_t));
}

template <>
void init_int_tensor<uint8_t>(shared_ptr<runtime::Tensor> tensor, uint8_t min, uint8_t max)
{
    size_t size = tensor->get_element_count();
    uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
    vector<uint8_t> vec(size);
    for (uint8_t& element : vec)
    {
        element = static_cast<uint8_t>(dist(get_random_engine()));
    }
    tensor->write(vec.data(), vec.size() * sizeof(uint8_t));
}

void set_denormals_flush_to_zero()
{
#if defined(__x86_64__) || defined(__amd64__)
    // Avoids perf impact from denormals while benchmarking with random data
    _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
    _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
#endif
}

void random_init(shared_ptr<runtime::Tensor> tensor)
{
    element::Type et = tensor->get_element_type();
#if !(defined(__GNUC__) && (__GNUC__ == 4 && __GNUC_MINOR__ == 8))
#pragma GCC diagnostic push
#pragma GCC diagnostic error "-Wswitch"
#pragma GCC diagnostic error "-Wswitch-enum"
#endif
    switch (et.get_type_enum())
    {
    case element::Type_t::boolean: init_int_tensor<char>(tensor, 0, 1); break;
    case element::Type_t::f32: init_real_tensor<float>(tensor, -1, 1); break;
    case element::Type_t::f64: init_real_tensor<double>(tensor, -1, 1); break;
    case element::Type_t::i8: init_int_tensor<int8_t>(tensor, -1, 1); break;
    case element::Type_t::i16: init_int_tensor<int16_t>(tensor, -1, 1); break;
    case element::Type_t::i32: init_int_tensor<int32_t>(tensor, 0, 1); break;
    case element::Type_t::i64: init_int_tensor<int64_t>(tensor, 0, 1); break;
    case element::Type_t::u8: init_int_tensor<uint8_t>(tensor, 0, 1); break;
    case element::Type_t::u16: init_int_tensor<uint16_t>(tensor, 0, 1); break;
    case element::Type_t::u32: init_int_tensor<uint32_t>(tensor, 0, 1); break;
    case element::Type_t::u64: init_int_tensor<uint64_t>(tensor, 0, 1); break;
    case element::Type_t::undefined:
    case element::Type_t::dynamic:
    case element::Type_t::bf16:
    case element::Type_t::f16:
    default: throw runtime_error("unsupported type");
    }
#if !(defined(__GNUC__) && (__GNUC__ == 4 && __GNUC_MINOR__ == 8))
#pragma GCC diagnostic pop
#endif
}

default_random_engine& get_random_engine()
{
    static std::default_random_engine s_random_engine;
    return s_random_engine;
}
src/tools/nbench/benchmark_utils.hpp  (new file, 0 → 100644)

//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <random>
#include "benchmark.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/host_tensor.hpp"
#include "ngraph/runtime/tensor.hpp"
#include "ngraph/serializer.hpp"
#include "ngraph/util.hpp"
void set_denormals_flush_to_zero();

void random_init(std::shared_ptr<ngraph::runtime::Tensor> tensor);

std::default_random_engine& get_random_engine();

template <typename T>
void init_int_tensor(std::shared_ptr<ngraph::runtime::Tensor> tensor, T min, T max)
{
    size_t size = tensor->get_element_count();
    std::uniform_int_distribution<T> dist(min, max);
    std::vector<T> vec(size);
    for (T& element : vec)
    {
        element = dist(get_random_engine());
    }
    tensor->write(vec.data(), vec.size() * sizeof(T));
}

template <typename T>
void init_real_tensor(std::shared_ptr<ngraph::runtime::Tensor> tensor, T min, T max)
{
    size_t size = tensor->get_element_count();
    std::uniform_real_distribution<T> dist(min, max);
    std::vector<T> vec(size);
    for (T& element : vec)
    {
        element = dist(get_random_engine());
    }
    tensor->write(vec.data(), vec.size() * sizeof(T));
}
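As a quick illustration of these helpers (the shape and types here are chosen arbitrarily and are not part of the diff):

#include "benchmark_utils.hpp"
#include "ngraph/runtime/host_tensor.hpp"

int main()
{
    using namespace ngraph;
    // random_init dispatches on the tensor's element type.
    auto real_tensor = std::make_shared<runtime::HostTensor>(element::f32, Shape{2, 3});
    random_init(real_tensor);
    // Or pick an explicit integer range for a specific element type.
    auto int_tensor = std::make_shared<runtime::HostTensor>(element::i32, Shape{4});
    init_int_tensor<int32_t>(int_tensor, 0, 9);
}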
src/tools/nbench/nbench.cpp

@@ -24,6 +24,7 @@
 #include <iomanip>
 #include "benchmark.hpp"
+#include "benchmark_pipelined.hpp"
 #include "ngraph/distributed.hpp"
 #include "ngraph/except.hpp"
 #include "ngraph/file_util.hpp"
@@ -181,6 +182,7 @@ int main(int argc, char** argv)
     int warmup_iterations = 1;
     bool copy_data = true;
     bool dot_file = false;
+    bool double_buffer = false;
     for (size_t i = 1; i < argc; i++)
     {
@@ -229,6 +231,10 @@ int main(int argc, char** argv)
         {
             directory = argv[++i];
         }
+        else if (arg == "--double_buffer")
+        {
+            double_buffer = true;
+        }
         else if (arg == "-w" || arg == "--warmup_iterations")
         {
             try
@@ -283,6 +289,7 @@ OPTIONS
        -w|--warmup_iterations   Number of warm-up iterations
        --no_copy_data           Disable copy of input/result data every iteration
        --dot                    Generate Graphviz dot file
+       --double_buffer          Double buffer inputs and outputs
)###";
    return 1;
}
@@ -420,8 +427,17 @@ OPTIONS
     {
         cout << "\n---- Benchmark ----\n";
         shared_ptr<Function> f = deserialize(model);
-        auto perf_data =
-            run_benchmark(f, backend, iterations, timing_detail, warmup_iterations, copy_data);
+        vector<runtime::PerformanceCounter> perf_data;
+        if (double_buffer)
+        {
+            perf_data = run_benchmark_pipelined(
+                f, backend, iterations, timing_detail, warmup_iterations, copy_data);
+        }
+        else
+        {
+            perf_data =
+                run_benchmark(f, backend, iterations, timing_detail, warmup_iterations, copy_data);
+        }
         auto perf_shape = to_perf_shape(f, perf_data);
         aggregate_perf_data.insert(
             aggregate_perf_data.end(), perf_shape.begin(), perf_shape.end());
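With these pieces in place, an nbench invocation that passes --double_buffer routes through run_benchmark_pipelined, while all other runs keep the original run_benchmark path. A hypothetical invocation might look like `nbench -f model.json -b INTERPRETER -i 100 --double_buffer`, where only --double_buffer (and -w above) are confirmed by this diff; the model/backend/iteration flags are assumed from nbench's usual conventions.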