Unverified Commit b50c17bf authored by Robert Kimball, committed by GitHub

Merge branch 'master' into tsocha/improve-cmake-grama

parents b5549e0d 1eda1350
...@@ -32,7 +32,7 @@ int main() ...@@ -32,7 +32,7 @@ int main()
auto t1 = std::make_shared<op::Multiply>(t0, c); auto t1 = std::make_shared<op::Multiply>(t0, c);
// Make the function // Make the function
auto f = std::make_shared<Function>(NodeVector{t1}, auto f = std::make_shared<Function>(OutputVector{t1},
ParameterVector{a, b, c}); ParameterVector{a, b, c});
// Create the backend // Create the backend
......
...@@ -31,7 +31,7 @@ int main() ...@@ -31,7 +31,7 @@ int main()
auto t1 = (a + b) * c; auto t1 = (a + b) * c;
// Make the function // Make the function
auto f = std::make_shared<Function>(NodeVector{t1}, auto f = std::make_shared<Function>(OutputVector{t1},
ParameterVector{a, b, c}); ParameterVector{a, b, c});
// Get the backend // Get the backend
......
...@@ -175,8 +175,8 @@ int main(int argc, char* argv[]) ...@@ -175,8 +175,8 @@ int main(int argc, char* argv[])
auto delta = -learning_rate * loss; auto delta = -learning_rate * loss;
// Updates // Updates
ngraph::autodiff::Adjoints adjoints(NodeVector{loss}, ngraph::autodiff::Adjoints adjoints(OutputVector{loss},
NodeVector{delta}); OutputVector{delta});
auto grad_W0 = adjoints.backprop_node(W0); auto grad_W0 = adjoints.backprop_node(W0);
auto grad_b0 = adjoints.backprop_node(b0); auto grad_b0 = adjoints.backprop_node(b0);
auto grad_W1 = adjoints.backprop_node(W1); auto grad_W1 = adjoints.backprop_node(W1);
...@@ -231,7 +231,7 @@ int main(int argc, char* argv[]) ...@@ -231,7 +231,7 @@ int main(int argc, char* argv[])
NodeMap train_node_map; NodeMap train_node_map;
auto train_function = clone_function( auto train_function = clone_function(
Function( Function(
NodeVector{loss, softmax, W0_next, b0_next, W1_next, b1_next}, OutputVector{loss, softmax, W0_next, b0_next, W1_next, b1_next},
ParameterVector{X, Y, N, learning_rate, W0, b0, W1, b1}), ParameterVector{X, Y, N, learning_rate, W0, b0, W1, b1}),
train_node_map); train_node_map);
auto train_exec = backend->compile(train_function); auto train_exec = backend->compile(train_function);
...@@ -240,7 +240,7 @@ int main(int argc, char* argv[]) ...@@ -240,7 +240,7 @@ int main(int argc, char* argv[])
// X, W0, b0, W1, b1 -> softmax // X, W0, b0, W1, b1 -> softmax
NodeMap inference_node_map; NodeMap inference_node_map;
auto inference_function = clone_function( auto inference_function = clone_function(
Function(NodeVector{softmax}, ParameterVector{X, W0, b0, W1, b1}), Function(OutputVector{softmax}, ParameterVector{X, W0, b0, W1, b1}),
inference_node_map); inference_node_map);
auto inference_exec = backend->compile(inference_function); auto inference_exec = backend->compile(inference_function);
......
...@@ -172,8 +172,8 @@ int main(int argc, const char* argv[]) ...@@ -172,8 +172,8 @@ int main(int argc, const char* argv[])
auto delta = -learning_rate * loss; auto delta = -learning_rate * loss;
// Updates // Updates
ngraph::autodiff::Adjoints adjoints(NodeVector{loss}, ngraph::autodiff::Adjoints adjoints(OutputVector{loss},
NodeVector{delta}); OutputVector{delta});
auto W0_next = W0 + adjoints.backprop_node(W0); auto W0_next = W0 + adjoints.backprop_node(W0);
auto b0_next = b0 + adjoints.backprop_node(b0); auto b0_next = b0 + adjoints.backprop_node(b0);
auto W1_next = W1 + adjoints.backprop_node(W1); auto W1_next = W1 + adjoints.backprop_node(W1);
...@@ -218,7 +218,7 @@ int main(int argc, const char* argv[]) ...@@ -218,7 +218,7 @@ int main(int argc, const char* argv[])
NodeMap train_node_map; NodeMap train_node_map;
auto train_function = clone_function( auto train_function = clone_function(
Function( Function(
NodeVector{loss, softmax, W0_next, b0_next, W1_next, b1_next}, OutputVector{loss, softmax, W0_next, b0_next, W1_next, b1_next},
ParameterVector{X, Y, N, learning_rate, W0, b0, W1, b1}), ParameterVector{X, Y, N, learning_rate, W0, b0, W1, b1}),
train_node_map); train_node_map);
auto train_exec = backend->compile(train_function); auto train_exec = backend->compile(train_function);
...@@ -227,7 +227,7 @@ int main(int argc, const char* argv[]) ...@@ -227,7 +227,7 @@ int main(int argc, const char* argv[])
// X, W0, b0, W1, b1 -> softmax // X, W0, b0, W1, b1 -> softmax
NodeMap inference_node_map; NodeMap inference_node_map;
auto inference_function = clone_function( auto inference_function = clone_function(
Function(NodeVector{softmax}, ParameterVector{X, W0, b0, W1, b1}), Function(OutputVector{softmax}, ParameterVector{X, W0, b0, W1, b1}),
inference_node_map); inference_node_map);
auto inference_exe = backend->compile(inference_function); auto inference_exe = backend->compile(inference_function);
......
...@@ -5,7 +5,7 @@ Build and Test ...@@ -5,7 +5,7 @@ Build and Test
############### ###############
* :ref:`default_ngflags` * :ref:`default_ngflags`
* :ref:`ngraph_plaidml_backend` .. :ref:`ngraph_plaidml_backend`
There are a few common paths to take when manually building the |project| There are a few common paths to take when manually building the |project|
from source code. Today nGraph supports various developers working on all from source code. Today nGraph supports various developers working on all
...@@ -161,17 +161,17 @@ The process documented here will work on CentOS 7.4. ...@@ -161,17 +161,17 @@ The process documented here will work on CentOS 7.4.
$ make && sudo make install $ make && sudo make install
.. _ngraph_plaidml_backend: .. .. _ngraph_plaidml_backend: hide this until announcement is official
Building nGraph-PlaidML from source .. Building nGraph-PlaidML from source
=================================== .. ===================================
The following instructions will create the ``~/ngraph_plaidml_dist`` .. The following instructions will create the ``~/ngraph_plaidml_dist``
locally: .. locally:
#. Ensure you have installed the :ref:`prerequisites` for your OS. .. #. Ensure you have installed the :ref:`prerequisites` for your OS.
#. Install the prerequisites for the backend. Our hybrid ``NGRAPH_PLAIDML`` .. #. Install the prerequisites for the backend. Our hybrid ``NGRAPH_PLAIDML``
backend works best with Python3 versions. We recommend that you use a backend works best with Python3 versions. We recommend that you use a
virtual environment, due to some of the difficulties that users have virtual environment, due to some of the difficulties that users have
seen when trying to install outside of a venv. seen when trying to install outside of a venv.
...@@ -182,20 +182,20 @@ locally: ...@@ -182,20 +182,20 @@ locally:
$ pip install plaidml $ pip install plaidml
$ plaidml-setup $ plaidml-setup
#. Clone the source code, create and enter your build directory: .. #. Clone the source code, create and enter your build directory:
.. code-block:: console .. code-block:: console
$ git clone https://github.com/NervanaSystems/ngraph.git $ git clone https://github.com/NervanaSystems/ngraph.git
$ cd ngraph && mkdir build && cd build $ cd ngraph && mkdir build && cd build
#. Prepare the CMake files as follows: .. #. Prepare the CMake files as follows:
.. code-block:: console .. code-block:: console
$ cmake .. -DCMAKE_INSTALL_PREFIX=~/ngraph_plaidml_dist -DNGRAPH_CPU_ENABLE=OFF -DNGRAPH_PLAIDML_ENABLE=ON $ cmake .. -DCMAKE_INSTALL_PREFIX=~/ngraph_plaidml_dist -DNGRAPH_CPU_ENABLE=OFF -DNGRAPH_PLAIDML_ENABLE=ON
#. Run :command:`make` and ``make install``. Note that if you are building .. #. Run :command:`make` and ``make install``. Note that if you are building
outside a local or user path, you may need to run ``make install`` as the outside a local or user path, you may need to run ``make install`` as the
root user. root user.
...@@ -210,8 +210,8 @@ locally: ...@@ -210,8 +210,8 @@ locally:
tests can be run when PlaidML devices are available at the machine tests can be run when PlaidML devices are available at the machine
level. level.
For more about working with the PlaidML backend from nGraph, see our .. For more about working with the PlaidML backend from nGraph, see our
API documentation :doc:`backends/plaidml-ng-api/index`. .. API documentation :doc:`backends/plaidml-ng-api/index`.
macOS\* development macOS\* development
......
...@@ -99,8 +99,8 @@ Once the graph is built, we need to package it in a ``Function``: ...@@ -99,8 +99,8 @@ Once the graph is built, we need to package it in a ``Function``:
:lines: 35-36 :lines: 35-36
The first argument to the constructor specifies the nodes that the function will The first argument to the constructor specifies the nodes that the function will
return; in this case, the product. A ``NodeVector`` is a vector of shared return; in this case, the product. An ``OutputVector`` is a vector of references to
pointers of ``op::Node``. The second argument specifies the parameters of the outputs of ``op::Node``. The second argument specifies the parameters of the
function, in the order they are to be passed to the compiled function. A function, in the order they are to be passed to the compiled function. A
``ParameterVector`` is a vector of shared pointers to ``op::Parameter``. ``ParameterVector`` is a vector of shared pointers to ``op::Parameter``.
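For reference, a minimal sketch of this step (assuming the parameters ``a``, ``b``,
``c`` and the product node ``t1`` were built as shown earlier) would be:

.. code-block:: cpp

   // Package the graph: t1 is the returned output; a, b, c are the inputs.
   auto f = std::make_shared<Function>(OutputVector{t1},
                                       ParameterVector{a, b, c});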
......
...@@ -62,14 +62,14 @@ more detail and describe how nGraph addresses them. ...@@ -62,14 +62,14 @@ more detail and describe how nGraph addresses them.
Problem 1: Kernel libraries do not support graph-level optimizations Problem 1: Kernel libraries do not support graph-level optimizations
-------------------------------------------------------------------- --------------------------------------------------------------------
The example diagrams below shows how a deep learning framework, when integrated The example diagrams below show how a deep learning framework, when integrated
with a kernel library, can optimally run each operation in a computational with a kernel library, can optimally run each operation in a computational
graph, but the choice of operations in the graph may not be optimal. graph, but the choice of operations in the graph may not be optimal.
.. _figure-A: .. _figure-A:
.. figure:: ../graphics/kernel-problem-1.png .. figure:: ../graphics/kernel-problem-1.png
:width: 555px :width: 100%
:alt: :alt:
...@@ -95,7 +95,7 @@ diagram. ...@@ -95,7 +95,7 @@ diagram.
.. _figure-B: .. _figure-B:
.. figure:: ../graphics/kernel-problem-2.png .. figure:: ../graphics/kernel-problem-2.png
:width: 555px :width: 100%
:alt: :alt:
Each framework must be manually integrated with each hardware-specific kernel Each framework must be manually integrated with each hardware-specific kernel
...@@ -130,7 +130,7 @@ work for what will ultimately be a fragile setup that is costly to maintain. ...@@ -130,7 +130,7 @@ work for what will ultimately be a fragile setup that is costly to maintain.
.. _figure-C: .. _figure-C:
.. figure:: ../graphics/kernel-problem-3.png .. figure:: ../graphics/kernel-problem-3.png
:width: 555px :width: 100%
:alt: :alt:
......
...@@ -16,14 +16,19 @@ We are pleased to announce the release of version |version|-doc. ...@@ -16,14 +16,19 @@ We are pleased to announce the release of version |version|-doc.
Core updates for |version| Core updates for |version|
~~~~~~~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ Better PlaidML support
+ More ONNX ops
+ Elementwise divide defaults to Python semantics
+ GenerateMask seed optional
+ Graph visualization improvements
+ Preserve control dependencies in more places
+ GetOutputElement has single input
Latest doc updates
~~~~~~~~~~~~~~~~~~
+ Add instructions how to build ``NGRAPH_PLAIDML`` backend. .. Latest doc updates
.. ~~~~~~~~~~~~~~~~~~
.. + Add instructions how to build ``NGRAPH_PLAIDML`` backend.
.. important:: Pre-releases (``-rc-0.*``) have newer features, and are less stable. .. important:: Pre-releases (``-rc-0.*``) have newer features, and are less stable.
...@@ -35,7 +40,6 @@ Changelog on Previous Releases ...@@ -35,7 +40,6 @@ Changelog on Previous Releases
0.23 0.23
---- ----
+ PlaidML support
+ More ONNX ops + More ONNX ops
+ Elementwise divide defaults to Python semantics + Elementwise divide defaults to Python semantics
+ GenerateMask seed optional + GenerateMask seed optional
......
:orphan: :orphan:
.. toctree:: .. toctree::
:caption: Sitemap :includehidden:
:maxdepth: 1
frameworks/index
frameworks/index project/index
python_api/index python_api/index
inspection/index inspection/index
core/overview core/overview
backends/index backends/index
project/index project/extras/index
...@@ -23,6 +23,8 @@ ...@@ -23,6 +23,8 @@
namespace py = pybind11; namespace py = pybind11;
static const char* CAPSULE_NAME = "ngraph_function";
void regclass_pyngraph_Function(py::module m) void regclass_pyngraph_Function(py::module m)
{ {
py::class_<ngraph::Function, std::shared_ptr<ngraph::Function>> function(m, "Function"); py::class_<ngraph::Function, std::shared_ptr<ngraph::Function>> function(m, "Function");
...@@ -49,4 +51,41 @@ void regclass_pyngraph_Function(py::module m) ...@@ -49,4 +51,41 @@ void regclass_pyngraph_Function(py::module m)
py::cast(self.get_output_shape(0)).attr("__str__")().cast<std::string>(); py::cast(self.get_output_shape(0)).attr("__str__")().cast<std::string>();
return "<" + class_name + ": '" + self.get_friendly_name() + "' (" + shape + ")>"; return "<" + class_name + ": '" + self.get_friendly_name() + "' (" + shape + ")>";
}); });
function.def_static("from_capsule", [](py::object* capsule) {
// get the underlying PyObject* which is a PyCapsule pointer
auto* pybind_capsule_ptr = capsule->ptr();
// extract the pointer stored in the PyCapsule under the name CAPSULE_NAME
auto* capsule_ptr = PyCapsule_GetPointer(pybind_capsule_ptr, CAPSULE_NAME);
auto* ngraph_function = static_cast<std::shared_ptr<ngraph::Function>*>(capsule_ptr);
if (ngraph_function)
{
return *ngraph_function;
}
else
{
throw std::runtime_error("The provided capsule does not contain an ngraph::Function");
}
});
function.def_static("to_capsule", [](std::shared_ptr<ngraph::Function>& ngraph_function) {
// create a shared pointer on the heap before putting it in the capsule
// this secures the lifetime of the object transferred by the capsule
auto* sp_copy = new std::shared_ptr<ngraph::Function>(ngraph_function);
// a destructor callback that will delete the heap allocated shared_ptr
// when the capsule is destructed
auto sp_deleter = [](PyObject* capsule) {
auto* capsule_ptr = PyCapsule_GetPointer(capsule, CAPSULE_NAME);
auto* function_sp = static_cast<std::shared_ptr<ngraph::Function>*>(capsule_ptr);
if (function_sp)
{
delete function_sp;
}
};
// put the shared_ptr in a new capsule under the same name as in "from_capsule"
auto pybind_capsule = py::capsule(sp_copy, CAPSULE_NAME, sp_deleter);
return pybind_capsule;
});
} }
...@@ -30,6 +30,7 @@ ...@@ -30,6 +30,7 @@
#include "ngraph/op/divide.hpp" #include "ngraph/op/divide.hpp"
#include "ngraph/op/dot.hpp" #include "ngraph/op/dot.hpp"
#include "ngraph/op/experimental/compiled_kernel.hpp" #include "ngraph/op/experimental/compiled_kernel.hpp"
#include "ngraph/op/gather.hpp"
#include "ngraph/op/greater.hpp" #include "ngraph/op/greater.hpp"
#include "ngraph/op/less.hpp" #include "ngraph/op/less.hpp"
#include "ngraph/op/maximum.hpp" #include "ngraph/op/maximum.hpp"
...@@ -63,6 +64,7 @@ ...@@ -63,6 +64,7 @@
using llvm::SmallVector; using llvm::SmallVector;
using llvm::StringRef; using llvm::StringRef;
using llvm::make_unique; using llvm::make_unique;
using llvm::ArrayRef;
using namespace ngraph::runtime::ngmlir; using namespace ngraph::runtime::ngmlir;
...@@ -282,11 +284,20 @@ void MLIRCompiler::build_ng_dialect() ...@@ -282,11 +284,20 @@ void MLIRCompiler::build_ng_dialect()
throw unsupported_op{std::string{"The MLIR backend doesn't currently implement the '"} + throw unsupported_op{std::string{"The MLIR backend doesn't currently implement the '"} +
np->description() + "' operation"}; np->description() + "' operation"};
} }
mlir::Value* mlir_value = it->second(*this, np.get()); mlir::Operation* op = it->second(*this, np.get());
// builders that have multiple result values will update the value map, and set their ret values to null // This assumes simple 1:1 mapping between output edges and generated MLIR op results
if (mlir_value) // If the mapping is more complex, the create_op helper can return null operation
// and handles populating the value map itself
if (op)
{ {
update_tensor_value(np->get_output_tensor_ptr().get(), mlir_value); for (auto i = 0; i < op->getNumResults(); i++)
{
mlir::Value* result = op->getResult(i);
if (result)
{
update_tensor_value(np->get_output_tensor_ptr(i).get(), result);
}
}
} }
} }
create_return(); create_return();
...@@ -299,133 +310,125 @@ namespace ngraph ...@@ -299,133 +310,125 @@ namespace ngraph
namespace ngmlir namespace ngmlir
{ {
template <> template <>
mlir::Value* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Add) mlir::Operation* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Add)
{ {
return compiler.create_binary_op<mlir::NGAddOp>(ng_node); return compiler.create_generic_op<mlir::NGAddOp>(ng_node);
} }
template <> template <>
mlir::Value* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Subtract) mlir::Operation* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Subtract)
{ {
return compiler.create_binary_op<mlir::NGSubOp>(ng_node); return compiler.create_generic_op<mlir::NGSubOp>(ng_node);
} }
template <> template <>
mlir::Value* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Multiply) mlir::Operation* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Multiply)
{ {
return compiler.create_binary_op<mlir::NGMulOp>(ng_node); return compiler.create_generic_op<mlir::NGMulOp>(ng_node);
} }
template <> template <>
mlir::Value* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Divide) mlir::Operation* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Divide)
{ {
return compiler.create_binary_op<mlir::NGDivOp>(ng_node); return compiler.create_generic_op<mlir::NGDivOp>(ng_node);
} }
template <> template <>
mlir::Value* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Greater) mlir::Operation* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Greater)
{ {
return compiler.create_binary_op<mlir::NGGreaterOp>(ng_node); return compiler.create_generic_op<mlir::NGGreaterOp>(ng_node);
} }
template <> template <>
mlir::Value* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Less) mlir::Operation* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Less)
{ {
return compiler.create_binary_op<mlir::NGLessOp>(ng_node); return compiler.create_generic_op<mlir::NGLessOp>(ng_node);
} }
template <> template <>
mlir::Value* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Maximum) mlir::Operation* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Maximum)
{ {
return compiler.create_binary_op<mlir::NGMaxOp>(ng_node); return compiler.create_generic_op<mlir::NGMaxOp>(ng_node);
} }
template <> template <>
mlir::Value* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Minimum) mlir::Operation* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Minimum)
{ {
return compiler.create_binary_op<mlir::NGMinOp>(ng_node); return compiler.create_generic_op<mlir::NGMinOp>(ng_node);
} }
template <> template <>
mlir::Value* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::ArgMax) mlir::Operation* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::ArgMax)
{ {
return compiler.create_index_reduction<mlir::NGArgMaxRedOp>(ng_node); return compiler.create_index_reduction<mlir::NGArgMaxRedOp>(ng_node);
} }
template <> template <>
mlir::Value* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::ArgMin) mlir::Operation* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::ArgMin)
{ {
return compiler.create_index_reduction<mlir::NGArgMinRedOp>(ng_node); return compiler.create_index_reduction<mlir::NGArgMinRedOp>(ng_node);
} }
template <> template <>
mlir::Value* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Dot) mlir::Operation* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Dot)
{ {
return compiler.create_binary_op<mlir::NGDotOp>(ng_node); return compiler.create_generic_op<mlir::NGDotOp>(ng_node);
} }
template <> template <>
mlir::Value* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Concat) mlir::Operation* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Concat)
{ {
return compiler.create_concat(ng_node); auto ng_node_concat = static_cast<const ngraph::op::Concat*>(ng_node);
auto op = compiler.create_generic_op<mlir::NGConcatOp>(ng_node);
op->setAttr("concatenation_axis",
compiler.m_builder->getI64IntegerAttr(
ng_node_concat->get_concatenation_axis()));
return op;
} }
template <> template <>
mlir::Value* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Relu) mlir::Operation* MLIRCompiler::COMPILE_OP_DECL(ngraph::op::Gather)
{ {
return compiler.create_unary_op<mlir::NGReluOp>(ng_node); auto ng_node_gather = static_cast<const ngraph::op::Gather*>(ng_node);
auto op = compiler.create_generic_op<mlir::NGGatherOp>(ng_node);
op->setAttr("axis",
compiler.m_builder->getI64IntegerAttr(ng_node_gather->get_axis()));
return op;
} }
} }
} }
} }
const MLIRCompiler::MLIRCompOpMap MLIRCompiler::op_dispatcher{ template <typename Op>
#define MLIR_OP(OP) {TI(ngraph::op::OP), &MLIRCompiler::create_op<ngraph::op::OP>}, mlir::Operation* MLIRCompiler::create_generic_op(const ngraph::Node* ng_node)
#include "ops_supported.inc"
};
template <typename UnaryOp>
mlir::Value* MLIRCompiler::create_unary_op(const ngraph::Node* ng_node)
{
auto lhs = ng_node->get_argument(0)->get_output_tensor_ptr();
auto lhs_v = get_tensor_value(lhs.get()).m_value;
auto res_type = get_mlir_type(ng_node->get_output_tensor_ptr().get());
return m_builder->create<UnaryOp>(mlir::UnknownLoc::get(&m_context), res_type, lhs_v)
.getResult();
}
template <typename BinOp>
mlir::Value* MLIRCompiler::create_binary_op(const ngraph::Node* ng_node)
{
auto lhs = ng_node->get_argument(0)->get_output_tensor_ptr();
auto rhs = ng_node->get_argument(1)->get_output_tensor_ptr();
auto lhs_v = get_tensor_value(lhs.get()).m_value;
auto rhs_v = get_tensor_value(rhs.get()).m_value;
auto res_type = get_mlir_type(ng_node->get_output_tensor_ptr().get());
return m_builder->create<BinOp>(mlir::UnknownLoc::get(&m_context), res_type, lhs_v, rhs_v)
.getResult();
}
mlir::Value* MLIRCompiler::create_concat(const ngraph::Node* ng_node)
{ {
std::vector<mlir::Value*> arg_values; std::vector<mlir::Value*> arg_values;
auto ng_node_concat = static_cast<const ngraph::op::Concat*>(ng_node); std::vector<mlir::Type> res_types;
for (auto& arg : ng_node->get_arguments()) for (auto& arg : ng_node->get_arguments())
{ {
auto arg_tensor = arg->get_output_tensor_ptr(); auto arg_tensor = arg->get_output_tensor_ptr();
auto arg_v = get_tensor_value(arg_tensor.get()).m_value; auto arg_v = get_tensor_value(arg_tensor.get()).m_value;
arg_values.push_back(arg_v); arg_values.push_back(arg_v);
} }
auto res_type = get_mlir_type(ng_node->get_output_tensor_ptr().get());
return m_builder for (auto& output : ng_node->outputs())
->create<mlir::NGConcatOp>( {
mlir::UnknownLoc::get(&m_context), res_types.push_back(get_mlir_type(output.get_tensor_ptr().get()));
res_type, }
arg_values,
m_builder->getI64IntegerAttr(ng_node_concat->get_concatenation_axis())) return (m_builder->create<Op,
.getResult(); ArrayRef<mlir::Type>,
ArrayRef<mlir::Value*>,
ArrayRef<mlir::NamedAttribute>>(
mlir::UnknownLoc::get(&m_context), res_types, arg_values, {/* no attrs */}))
.getOperation();
} }
const MLIRCompiler::MLIRCompOpMap MLIRCompiler::op_dispatcher{
#define MLIR_OP(OP) {TI(ngraph::op::OP), &MLIRCompiler::create_op<ngraph::op::OP>},
#include "ops_supported.inc"
};
void MLIRCompiler::create_return() void MLIRCompiler::create_return()
{ {
std::vector<mlir::Value*> value_list; std::vector<mlir::Value*> value_list;
...@@ -437,21 +440,16 @@ void MLIRCompiler::create_return() ...@@ -437,21 +440,16 @@ void MLIRCompiler::create_return()
} }
template <typename RedOp> template <typename RedOp>
mlir::Value* MLIRCompiler::create_index_reduction(const ngraph::Node* ng_node) mlir::Operation* MLIRCompiler::create_index_reduction(const ngraph::Node* ng_node)
{ {
auto* idx_red = static_cast<const ngraph::op::util::IndexReduction*>(ng_node); auto* idx_red = static_cast<const ngraph::op::util::IndexReduction*>(ng_node);
auto op = create_generic_op<RedOp>(ng_node);
auto arg = idx_red->get_argument(0); mlir::ArrayAttr red_axes_attr =
size_t red_axis = idx_red->get_reduction_axis(); m_builder->getI64ArrayAttr({(int64_t)idx_red->get_reduction_axis()});
op->setAttr("axes", red_axes_attr);
mlir::Value* arg_val = get_tensor_value(arg->get_output_tensor_ptr().get()).m_value; return op;
mlir::ArrayAttr red_axes_attr = m_builder->getI64ArrayAttr({(int64_t)red_axis});
return m_builder
->create<RedOp>(
mlir::UnknownLoc::get(&m_context), get_mlir_type(ng_node), arg_val, red_axes_attr)
.getResult();
} }
// Binds MLIR function arguments to the proper values. This includes externally allocated tensors // Binds MLIR function arguments to the proper values. This includes externally allocated tensors
// helpers to be used inside the function. // helpers to be used inside the function.
void MLIRCompiler::bind_arguments() void MLIRCompiler::bind_arguments()
......
...@@ -98,25 +98,21 @@ namespace ngraph ...@@ -98,25 +98,21 @@ namespace ngraph
void build_ng_dialect(); void build_ng_dialect();
template <typename OP> template <typename Op>
static mlir::Value* create_op(MLIRCompiler& compiler, const ngraph::Node* ng_node) static mlir::Operation* create_op(MLIRCompiler& compiler,
const ngraph::Node* ng_node)
{ {
throw std::runtime_error("Unimplemented op '" + ng_node->description() + throw std::runtime_error("Unimplemented op '" + ng_node->description() +
"' in MLIR Compiler"); "' in MLIR Compiler");
} }
template <typename UnaryOp> // Generic op lowerer to ng dialect.
mlir::Value* create_unary_op(const ngraph::Node* ng_node); // Simply maps ngraph tensors to values and generate an OP. No op-specific logic.
template <typename Op>
template <typename BinOp> mlir::Operation* create_generic_op(const ngraph::Node* ng_node);
mlir::Value* create_binary_op(const ngraph::Node* ng_node);
// TODO(amprocte): Can we have a create_variadic_op that is able to handle the
// attributes?
mlir::Value* create_concat(const ngraph::Node* ng_node);
template <typename RedOp> template <typename RedOp>
mlir::Value* create_index_reduction(const ngraph::Node* ng_node); mlir::Operation* create_index_reduction(const ngraph::Node* ng_node);
void create_return(); void create_return();
...@@ -150,7 +146,7 @@ namespace ngraph ...@@ -150,7 +146,7 @@ namespace ngraph
using TensorToInfo = std::pair<descriptor::Tensor*, TensorInfo>; using TensorToInfo = std::pair<descriptor::Tensor*, TensorInfo>;
using TensorToInfoMap = std::unordered_map<descriptor::Tensor*, TensorInfo>; using TensorToInfoMap = std::unordered_map<descriptor::Tensor*, TensorInfo>;
using MLIRCompOpFunction = using MLIRCompOpFunction =
std::function<mlir::Value*(MLIRCompiler& compiler, const ngraph::Node*)>; std::function<mlir::Operation*(MLIRCompiler& compiler, const ngraph::Node*)>;
using MLIRCompOpMap = std::unordered_map<std::type_index, MLIRCompOpFunction>; using MLIRCompOpMap = std::unordered_map<std::type_index, MLIRCompOpFunction>;
// Maps tensor to the value it represents in the IR // Maps tensor to the value it represents in the IR
......
...@@ -168,6 +168,39 @@ static mlir::LogicalResult verifyCmpOp(T* op) ...@@ -168,6 +168,39 @@ static mlir::LogicalResult verifyCmpOp(T* op)
return mlir::success(); return mlir::success();
} }
template <>
mlir::LogicalResult verifyOp(NGGatherOp* op)
{
Type ty = op->params()->getType();
NGTensorType inputType = ty.cast<NGTensorType>();
ty = op->indices()->getType();
NGTensorType indicesType = ty.cast<NGTensorType>();
// ensure axis < params rank
if (op->axis().getSExtValue() >= inputType.getRank())
return op->emitOpError("Gather axis is larger than input rank");
ty = indicesType.getElementType();
// ensure indices are I32 or I64
if (!ty.isa<NGIntegerType>())
return op->emitOpError("Indices tensor is not of Integer type");
NGIntegerType indicesEltType = ty.cast<NGIntegerType>();
if (!indicesEltType.isInt32() && !indicesEltType.isInt64())
return op->emitOpError("Indices tensor is not of I32 or I64 type");
mlir::Type r0 = op->res()->getType();
NGTensorType resType = r0.cast<NGTensorType>();
// ensure result is compatible with input
if (resType.getRank() != inputType.getRank() + indicesType.getRank() - 1)
return op->emitOpError("Incompatible result shape and/or type");
return mlir::success();
}
namespace mlir namespace mlir
{ {
#define GET_OP_CLASSES #define GET_OP_CLASSES
......
...@@ -186,8 +186,8 @@ def NGDotOp : NG_Binary_Op<"dot"> ...@@ -186,8 +186,8 @@ def NGDotOp : NG_Binary_Op<"dot">
// class, but I'm not sure how to add concatenation_axis into the args if we // class, but I'm not sure how to add concatenation_axis into the args if we
// do that. // do that.
def NGConcatOp : def NGConcatOp :
NG_OneResult_Op<"concat", [NoSideEffect]>, NG_OneResult_Op<"concat", [NoSideEffect]>,
Arguments<(ins Variadic<NG_TensorType>:$args, I64Attr:$concatenation_axis)> Arguments<(ins Variadic<NG_TensorType>:$args, I64Attr:$concatenation_axis)>
{ {
let parser = [{ NGRAPH_CHECK(false, "No parser support"); return mlir::failure(); }]; let parser = [{ NGRAPH_CHECK(false, "No parser support"); return mlir::failure(); }];
...@@ -200,7 +200,7 @@ class NG_Axis_Reduction_Op<string mnemonic, list<OpTrait> traits = []> : ...@@ -200,7 +200,7 @@ class NG_Axis_Reduction_Op<string mnemonic, list<OpTrait> traits = []> :
{ {
let summary = "Base class for reduction operations that perform a reduction " let summary = "Base class for reduction operations that perform a reduction "
"across the axes of a single tensor."; "across the axes of a single tensor.";
let description = "Axes are represented as an array of I64 attributes."; let description = [{Axes are represented as an array of I64 attributes.}];
let parser = [{ NGRAPH_CHECK(false, "No parser support"); return mlir::failure(); }]; let parser = [{ NGRAPH_CHECK(false, "No parser support"); return mlir::failure(); }];
...@@ -257,6 +257,24 @@ def NGAnyRedOp : NG_Axis_Reduction_Op<"any.red"> ...@@ -257,6 +257,24 @@ def NGAnyRedOp : NG_Axis_Reduction_Op<"any.red">
let verifier = [{ return verifyLogicalReductionOp(this); }]; let verifier = [{ return verifyLogicalReductionOp(this); }];
} }
// Gather
def NGGatherOp :
NG_OneResult_Op<"gather", [NoSideEffect]>,
Arguments<(ins NG_TensorType:$params, NG_TensorType:$indices, I64Attr:$axis)>
{
let summary = "Gather slices from params along the specified axis according to indices";
let description = [{
Gather slices from axis of params according to indices
params The tensor from which slices are gathered
indices Index tensor. Data type must be `element::i32` or `element::i64`
axis Axis in params to gather
}];
let parser = [{ NGRAPH_CHECK(false, "No parser support"); return mlir::failure(); }];
let verifier = [{ return verifyOp(this); }];
}
// Terminator Ops // Terminator Ops
def NGReturnOp : NG_Terminator_Op<"return">; def NGReturnOp : NG_Terminator_Op<"return">;
......
...@@ -199,6 +199,7 @@ namespace mlir ...@@ -199,6 +199,7 @@ namespace mlir
} }
Shape getShape() const { return m_shape; } Shape getShape() const { return m_shape; }
int64_t getRank() const { return m_shape.size(); }
EltType getElementType() const { return m_eltType; } EltType getElementType() const { return m_eltType; }
private: private:
NGTensorTypeStorage(EltType eltType, Shape shape) NGTensorTypeStorage(EltType eltType, Shape shape)
......
...@@ -646,6 +646,123 @@ namespace ...@@ -646,6 +646,123 @@ namespace
return matchSuccess(); return matchSuccess();
} }
REWRITER(NGGatherOp)
{
auto gatherOp = cast<NGGatherOp>(op);
auto loc = gatherOp.getLoc();
ScopedContext scope(rewriter, loc);
// Get operands
Value* result = m_pass.buildOutputDefs(op, rewriter)[0];
NGRAPH_CHECK(result, "Unexpected null result in GatherOp");
auto resultTy = result->getType().cast<MemRefType>();
Value* params = operands[0];
Value* indices = operands[1];
auto axis = gatherOp.axis().getSExtValue();
// Create view to write into result.
MemRefView vRes(result), vParams(params), vIndices(indices);
// Indexed Values
IndexedValue iRes(result), iParams(params), iIndices(indices);
// Construct outer loop for params dims. Exclude the axis dim.
SmallVector<ValueHandle, 4> paramsLbs, paramsUbs;
SmallVector<IndexHandle, 4> paramsIVs;
SmallVector<int64_t, 4> paramsSteps;
SmallVector<ValueHandle*, 4> paramsIVPtrs;
for (auto i = 0; i < vParams.rank(); i++)
{
// skip gather axis
if (i == axis)
continue;
paramsLbs.push_back(IndexHandle(vParams.lb(i)));
paramsUbs.push_back(IndexHandle(vParams.ub(i)));
paramsSteps.push_back(vParams.step(i));
}
NGRAPH_CHECK(paramsLbs.size() == vParams.rank() - 1 &&
paramsUbs.size() == paramsLbs.size() &&
paramsSteps.size() == paramsLbs.size(),
"Incorrect loop nest bounds size for gather params");
paramsIVs = IndexHandle::makeIndexHandles(vParams.rank() - 1);
paramsIVPtrs = IndexHandle::makeIndexHandlePointers(paramsIVs);
auto indicesLbs = vIndices.getLbs();
auto indicesUbs = vIndices.getUbs();
auto indicesSteps = vIndices.getSteps();
auto indicesIVs = IndexHandle::makeIndexHandles(vIndices.rank());
auto indicesIVPtrs = IndexHandle::makeIndexHandlePointers(indicesIVs);
SmallVector<IndexHandle, 8> paramsIndices, resIndices;
// Make sure we are going to create loops
NGRAPH_CHECK(vParams.rank() > 0, "Expected params rank > 0 for gather loop generation");
// Let params rank : N
// Let indices rank : M
// Let axis be A
// Generate
// params loops
// for P_0: 0 -> params.dim[0]
// for P_1: 0 -> params.dim[1]
// for P_2: 0 -> params.dim[2]
// ...
// for P_(A-1):0 -> params.dim[A-1]
// for P_(A+1):0 -> params.dim[A+1]
// ...
// for P_(N-1):0 -> params.dim[N-1]
// indices loops
// for I_0:0 -> indices.dim[0]
// ...
// for I_(M-1):0 -> indices.dim[M-1]
// res[P_0, P_1, .. P_(A-1), I_0, .., I_(M-1), P_(A+1), ... P_(N-1)] =
// params[P_0, P_1, .. P_(A-1), indices[I_0, .., I_(M-1)], P_(A+1), ... P_(N-1)];
LoopNestBuilder(paramsIVPtrs, paramsLbs, paramsUbs, paramsSteps)([&] {
LoopNestBuilder(indicesIVPtrs, indicesLbs, indicesUbs, indicesSteps)([&] {
// Load axis value from indices array and cast it to Index Type
ValueHandle axisIdx = ValueHandle::create<IndexCastOp>(
(ValueHandle)iIndices(indicesIVs), rewriter.getIndexType());
// construct indices for param
// [P_0, P_1, .. P_axis-1, Indices[I0, I1, .. I_k-1], P_axis+1, P_axis+2, .. P_n-1]
for (auto i = 0, j = 0; i < vParams.rank(); i++)
{
if (i == axis)
{
paramsIndices.push_back(IndexHandle(axisIdx));
}
else
{
paramsIndices.push_back(paramsIVs[j++]);
}
}
// construct indices for result
// [P_0, P_1, .. P_axis-1, I0, I1, .. I_k-1, P_axis+1, P_axis+2, .. P_n-1]
for (auto i = 0, j = 0; i < vParams.rank() + vIndices.rank() - 1;)
{
if (i == axis && indicesIVs.size() > 0)
{
resIndices.append(indicesIVs.begin(), indicesIVs.end());
i += indicesIVs.size();
}
else
{
resIndices.push_back(paramsIVs[j++]);
i++;
}
}
// Store into result
iRes(resIndices) = iParams(paramsIndices);
});
});
rewriter.replaceOp(op, {result});
return matchSuccess();
}
REWRITER(NGReturnOp) REWRITER(NGReturnOp)
{ {
rewriter.replaceOpWithNewOp<ReturnOp>(op); rewriter.replaceOpWithNewOp<ReturnOp>(op);
...@@ -653,7 +770,7 @@ namespace ...@@ -653,7 +770,7 @@ namespace
} }
#undef REWRITER #undef REWRITER
/// End of pattern matchers
template <typename OP> template <typename OP>
void lower_binary_elementwise(Operation* op, void lower_binary_elementwise(Operation* op,
ArrayRef<Value*> operands, ArrayRef<Value*> operands,
......
...@@ -29,6 +29,7 @@ MLIR_OP(NGArgMinRedOp) ...@@ -29,6 +29,7 @@ MLIR_OP(NGArgMinRedOp)
MLIR_OP(NGConcatOp) MLIR_OP(NGConcatOp)
MLIR_OP(NGDivOp) MLIR_OP(NGDivOp)
MLIR_OP(NGDotOp) MLIR_OP(NGDotOp)
MLIR_OP(NGGatherOp)
MLIR_OP(NGGreaterOp) MLIR_OP(NGGreaterOp)
MLIR_OP(NGLessOp) MLIR_OP(NGLessOp)
MLIR_OP(NGMulOp) MLIR_OP(NGMulOp)
......
...@@ -9,6 +9,7 @@ MLIR_OP(ArgMax) ...@@ -9,6 +9,7 @@ MLIR_OP(ArgMax)
MLIR_OP(Divide) MLIR_OP(Divide)
MLIR_OP(Dot) MLIR_OP(Dot)
MLIR_OP(Concat) MLIR_OP(Concat)
MLIR_OP(Gather)
MLIR_OP(Greater) MLIR_OP(Greater)
MLIR_OP(Less) MLIR_OP(Less)
MLIR_OP(Maximum) MLIR_OP(Maximum)
......
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
#include "ngraph/op/divide.hpp" #include "ngraph/op/divide.hpp"
#include "ngraph/op/dot.hpp" #include "ngraph/op/dot.hpp"
#include "ngraph/op/experimental/compiled_kernel.hpp" #include "ngraph/op/experimental/compiled_kernel.hpp"
#include "ngraph/op/gather.hpp"
#include "ngraph/op/get_output_element.hpp" #include "ngraph/op/get_output_element.hpp"
#include "ngraph/op/greater.hpp" #include "ngraph/op/greater.hpp"
#include "ngraph/op/less.hpp" #include "ngraph/op/less.hpp"
......
...@@ -463,6 +463,8 @@ set (SRC ...@@ -463,6 +463,8 @@ set (SRC
runtime/backend.hpp runtime/backend.hpp
runtime/backend_manager.cpp runtime/backend_manager.cpp
runtime/backend_manager.hpp runtime/backend_manager.hpp
runtime/chrome_trace.cpp
runtime/chrome_trace.hpp
runtime/executable.cpp runtime/executable.cpp
runtime/executable.hpp runtime/executable.hpp
runtime/host_tensor.cpp runtime/host_tensor.cpp
......
...@@ -51,11 +51,6 @@ OutputVector make_zeros(std::shared_ptr<Node> x) ...@@ -51,11 +51,6 @@ OutputVector make_zeros(std::shared_ptr<Node> x)
return zeros; return zeros;
} }
autodiff::Adjoints::Adjoints(const NodeVector& ys, const NodeVector& cs)
: Adjoints(OutputVector(ys.begin(), ys.end()), OutputVector(cs.begin(), cs.end()))
{
}
autodiff::Adjoints::Adjoints(const OutputVector& ys, const OutputVector& cs) autodiff::Adjoints::Adjoints(const OutputVector& ys, const OutputVector& cs)
{ {
if (ys.size() != cs.size()) if (ys.size() != cs.size())
......
...@@ -46,8 +46,6 @@ namespace ngraph ...@@ -46,8 +46,6 @@ namespace ngraph
/// \param c An expression for where to evaluate the derivatives /// \param c An expression for where to evaluate the derivatives
Adjoints(const OutputVector& y, const OutputVector& c); Adjoints(const OutputVector& y, const OutputVector& c);
Adjoints(const NodeVector& y, const NodeVector& c);
Adjoints(const Adjoints& adjoints) = default; Adjoints(const Adjoints& adjoints) = default;
Adjoints& operator=(const Adjoints& adjoints) = default; Adjoints& operator=(const Adjoints& adjoints) = default;
Adjoints() = default; Adjoints() = default;
......
...@@ -41,6 +41,30 @@ Function::Function(const ResultVector& results, ...@@ -41,6 +41,30 @@ Function::Function(const ResultVector& results,
init(); init();
} }
Function::Function(const OutputVector& results,
const ParameterVector& parameters,
const std::string& name)
: m_results(results.size())
, m_parameters(parameters)
, m_temporary_pool_size(0)
, m_instance_id(m_next_instance_id.fetch_add(1))
, m_name(name)
, m_unique_name("Function_" + to_string(m_instance_id))
{
if (std::any_of(results.cbegin(), results.cend(), [](Output<Node> n) {
return std::dynamic_pointer_cast<op::Result>(n.get_node_shared_ptr());
}))
{
throw ngraph_error(
" Results already contain op::Results. Use a c-tor that takes a ResultVector");
}
std::transform(results.begin(), results.end(), m_results.begin(), [](Output<Node> n) {
return std::make_shared<op::Result>(n);
});
init();
}
Function::Function(const NodeVector& results, Function::Function(const NodeVector& results,
const ParameterVector& parameters, const ParameterVector& parameters,
const std::string& name) const std::string& name)
...@@ -208,6 +232,11 @@ shared_ptr<Node> Function::get_output_op(size_t i) const ...@@ -208,6 +232,11 @@ shared_ptr<Node> Function::get_output_op(size_t i) const
return m_results.at(i); return m_results.at(i);
} }
Output<Node> Function::output(size_t i) const
{
return m_results.at(i);
}
shared_ptr<Node> Function::get_result() const shared_ptr<Node> Function::get_result() const
{ {
if (m_results.size() != 1) if (m_results.size() != 1)
......
...@@ -37,6 +37,10 @@ namespace ngraph ...@@ -37,6 +37,10 @@ namespace ngraph
const ParameterVector& parameters, const ParameterVector& parameters,
const std::string& name = ""); const std::string& name = "");
Function(const OutputVector& results,
const ParameterVector& parameters,
const std::string& name = "");
Function(const std::shared_ptr<Node>& result, Function(const std::shared_ptr<Node>& result,
const ParameterVector& parameters, const ParameterVector& parameters,
const std::string& name = ""); const std::string& name = "");
...@@ -55,6 +59,8 @@ namespace ngraph ...@@ -55,6 +59,8 @@ namespace ngraph
/// Return the op that generates output i /// Return the op that generates output i
std::shared_ptr<Node> get_output_op(size_t i) const; std::shared_ptr<Node> get_output_op(size_t i) const;
Output<Node> output(size_t i) const;
/// Return the element type of output i /// Return the element type of output i
const element::Type& get_output_element_type(size_t i) const; const element::Type& get_output_element_type(size_t i) const;
......
//*****************************************************************************
// Copyright 2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include "chrome_trace.hpp"
#include "ngraph/log.hpp"
using namespace std;
using namespace ngraph;
static bool read_tracing_env_var()
{
static const bool is_enabled = (getenv("NGRAPH_ENABLE_TRACING") != nullptr);
return is_enabled;
}
mutex runtime::event::Manager::s_file_mutex;
bool runtime::event::Manager::s_tracing_enabled = read_tracing_env_var();
runtime::event::Duration::Duration(const string& name, const string& category, const string& args)
{
if (Manager::is_tracing_enabled())
{
m_start = Manager::get_current_microseconds();
m_stop = 0;
m_name = name;
m_category = category;
m_args = args;
}
}
void runtime::event::Duration::stop()
{
if (Manager::is_tracing_enabled())
{
m_stop = Manager::get_current_microseconds();
}
}
void runtime::event::Duration::write()
{
if (Manager::is_tracing_enabled())
{
size_t stop_time = (m_stop != 0 ? m_stop : Manager::get_current_microseconds());
lock_guard<mutex> lock(Manager::get_mutex());
ofstream& out = runtime::event::Manager::get_output_stream();
if (out.is_open() == false)
{
runtime::event::Manager::open();
}
else
{
Manager::get_output_stream() << ",\n";
}
Manager::get_output_stream() <<
R"({"name":")" << m_name << R"(","cat":")" << m_category << R"(","ph":"X","pid":)"
<< Manager::get_process_id() << R"(,"tid":)"
<< Manager::get_thread_id() <<
R"(,"ts":)" << m_start << R"(,"dur":)" << (stop_time - m_start);
if (!m_args.empty())
{
out <<
R"(,"args":)" << m_args;
}
out << "}";
}
}
runtime::event::Object::Object(const string& name, const string& args)
: m_name{name}
, m_id{static_cast<size_t>(chrono::high_resolution_clock::now().time_since_epoch().count())}
{
if (Manager::is_tracing_enabled())
{
lock_guard<mutex> lock(Manager::get_mutex());
ofstream& out = runtime::event::Manager::get_output_stream();
if (out.is_open() == false)
{
runtime::event::Manager::open();
}
else
{
Manager::get_output_stream() << ",\n";
}
out << R"({"name":")" << m_name << R"(","ph":"N","id":")" << m_id <<
R"(","ts":)" << Manager::get_current_microseconds() <<
R"(,"pid":)" << Manager::get_process_id() << R"(,"tid":)" << Manager::get_thread_id();
if (!args.empty())
{
out <<
R"(,"args":)" << args;
}
out << "}";
write_snapshot(out, args);
}
}
void runtime::event::Object::snapshot(const string& args)
{
if (Manager::is_tracing_enabled())
{
lock_guard<mutex> lock(Manager::get_mutex());
ofstream& out = runtime::event::Manager::get_output_stream();
if (out.is_open() == false)
{
runtime::event::Manager::open();
}
else
{
Manager::get_output_stream() << ",\n";
}
write_snapshot(out, args);
}
}
void runtime::event::Object::write_snapshot(ostream& out, const string& args)
{
out << R"({"name":")" << m_name << R"(","ph":"O","id":")" << m_id <<
R"(","ts":)" << Manager::get_current_microseconds() <<
R"(,"pid":)" << Manager::get_process_id() << R"(,"tid":)" << Manager::get_thread_id();
if (!args.empty())
{
out <<
R"(,"args":)" << args;
}
out << "}";
}
void runtime::event::Object::destroy()
{
if (Manager::is_tracing_enabled())
{
lock_guard<mutex> lock(Manager::get_mutex());
ofstream& out = runtime::event::Manager::get_output_stream();
if (out.is_open() == false)
{
runtime::event::Manager::open();
}
else
{
Manager::get_output_stream() << ",\n";
}
out << R"({"name":")" << m_name << R"(","ph":"D","id":")" << m_id <<
R"(","ts":)" << Manager::get_current_microseconds() <<
R"(,"pid":)" << Manager::get_process_id() << R"(,"tid":)" << Manager::get_thread_id()
<< "}";
}
}
void runtime::event::Manager::open(const string& path)
{
ofstream& out = get_output_stream();
if (out.is_open() == false)
{
out.open(path, ios_base::trunc);
out << "[\n";
}
}
void runtime::event::Manager::close()
{
ofstream& out = get_output_stream();
if (out.is_open())
{
out << "\n]\n";
out.close();
}
}
ofstream& runtime::event::Manager::get_output_stream()
{
static ofstream s_event_log;
return s_event_log;
}
const string& runtime::event::Manager::get_process_id()
{
static const string s_pid = to_string(getpid());
return s_pid;
}
void runtime::event::Manager::enable_event_tracing()
{
s_tracing_enabled = true;
}
void runtime::event::Manager::disable_event_tracing()
{
s_tracing_enabled = false;
}
bool runtime::event::Manager::is_event_tracing_enabled()
{
return s_tracing_enabled;
}
string runtime::event::Manager::get_thread_id()
{
thread::id tid = this_thread::get_id();
static map<thread::id, string> tid_map;
auto it = tid_map.find(tid);
string rc;
if (it == tid_map.end())
{
stringstream ss;
ss << "\"" << tid << "\"";
rc = ss.str();
tid_map.insert({tid, rc});
}
else
{
rc = it->second;
}
return rc;
}
//*****************************************************************************
// Copyright 2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <chrono>
#include <fstream>
#include <iostream>
#include <mutex>
#include <string>
#include <thread>
#ifdef _WIN32
#include <windows.h>
// windows.h must be before processthreadsapi.h so we need this comment
#include <processthreadsapi.h>
#define getpid() GetCurrentProcessId()
#else
#include <unistd.h>
#endif
namespace ngraph
{
namespace runtime
{
namespace event
{
class Duration;
class Object;
class Manager;
}
}
}
//
// This class records timestamps for a given user defined event and
// produces output in the chrome tracing format that can be used to view
// the events of a running program
//
// Following is the format of a trace event
//
// {
// "name": "myName",
// "cat": "category,list",
// "ph": "B",
// "ts": 12345,
// "pid": 123,
// "tid": 456,
// "args": {
// "someArg": 1,
// "anotherArg": {
// "value": "my value"
// }
// }
// }
//
// The trace file format is defined here:
// https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview
//
// The trace file can be viewed by Chrome browser using the
// URL: chrome://tracing/
//
// More information about this is at:
// http://dev.chromium.org/developers/how-tos/trace-event-profiling-tool
class ngraph::runtime::event::Manager
{
friend class Duration;
friend class Object;
public:
static void open(const std::string& path = "runtime_event_trace.json");
static void close();
static bool is_tracing_enabled() { return s_tracing_enabled; }
static void enable_event_tracing();
static void disable_event_tracing();
static bool is_event_tracing_enabled();
private:
static std::ofstream& get_output_stream();
static const std::string& get_process_id();
static size_t get_current_microseconds()
{
return std::chrono::high_resolution_clock::now().time_since_epoch().count() / 1000;
}
static std::string get_thread_id();
static std::mutex& get_mutex() { return s_file_mutex; }
static std::ostream s_ostream;
static std::mutex s_file_mutex;
static bool s_tracing_enabled;
};
class ngraph::runtime::event::Duration
{
public:
explicit Duration(const std::string& name,
const std::string& category,
const std::string& args = "");
~Duration() { write(); }
/// \brief stop the timer without writing the data to the log file. To write the data
/// call the `write` method
/// Calls to stop() are optional
void stop();
/// \brief write the log data to the log file for this event
/// This function has an implicit stop() if stop() has not been previously called
void write();
Duration(const Duration&) = delete;
Duration& operator=(Duration const&) = delete;
private:
std::string to_json() const;
size_t m_start;
size_t m_stop;
std::string m_name;
std::string m_category;
std::string m_args;
};
class ngraph::runtime::event::Object
{
public:
Object(const std::string& name, const std::string& args);
void snapshot(const std::string& args);
void destroy();
private:
void write_snapshot(std::ostream& out, const std::string& args);
const std::string m_name;
size_t m_id;
};
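A minimal usage sketch for these tracing classes (the wrapper function below is hypothetical and only illustrative): scope a Duration around the region to time. When it goes out of scope, write() emits one complete ("X") event, provided NGRAPH_ENABLE_TRACING was set when the process started; the resulting JSON can then be viewed in Chrome at chrome://tracing/.

#include "ngraph/runtime/chrome_trace.hpp"

// Hypothetical wrapper, not part of this change.
void run_timed_step()
{
    // Records one complete ("X") event spanning construction to destruction.
    ngraph::runtime::event::Duration d("run_timed_step", "example");
    // ... work being measured ...
}   // ~Duration() calls write(), appending the event to the trace file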
...@@ -18,6 +18,7 @@ ...@@ -18,6 +18,7 @@
#include <memory> #include <memory>
#include "ngraph/descriptor/layout/dense_tensor_layout.hpp" #include "ngraph/descriptor/layout/dense_tensor_layout.hpp"
#include "ngraph/runtime/chrome_trace.hpp"
#include "ngraph/runtime/host_tensor.hpp" #include "ngraph/runtime/host_tensor.hpp"
#include "ngraph/util.hpp" #include "ngraph/util.hpp"
...@@ -96,6 +97,8 @@ const char* runtime::HostTensor::get_data_ptr() const ...@@ -96,6 +97,8 @@ const char* runtime::HostTensor::get_data_ptr() const
void runtime::HostTensor::write(const void* source, size_t n) void runtime::HostTensor::write(const void* source, size_t n)
{ {
runtime::event::Duration d1("write", "HostTensor");
if (n > m_buffer_size) if (n > m_buffer_size)
{ {
throw out_of_range("write access past end of tensor"); throw out_of_range("write access past end of tensor");
...@@ -106,6 +109,7 @@ void runtime::HostTensor::write(const void* source, size_t n) ...@@ -106,6 +109,7 @@ void runtime::HostTensor::write(const void* source, size_t n)
void runtime::HostTensor::read(void* target, size_t n) const void runtime::HostTensor::read(void* target, size_t n) const
{ {
runtime::event::Duration d1("read", "HostTensor");
if (n > m_buffer_size) if (n > m_buffer_size)
{ {
throw out_of_range("read access past end of tensor"); throw out_of_range("read access past end of tensor");
......
...@@ -30,6 +30,7 @@ ...@@ -30,6 +30,7 @@
#include "ngraph/pass/manager.hpp" #include "ngraph/pass/manager.hpp"
#include "ngraph/pass/memory_layout.hpp" #include "ngraph/pass/memory_layout.hpp"
#include "ngraph/runtime/backend_manager.hpp" #include "ngraph/runtime/backend_manager.hpp"
#include "ngraph/runtime/chrome_trace.hpp"
#include "ngraph/serializer.hpp" #include "ngraph/serializer.hpp"
#include "ngraph/util.hpp" #include "ngraph/util.hpp"
...@@ -74,6 +75,8 @@ runtime::interpreter::INTExecutable::INTExecutable(const std::string& model_stri ...@@ -74,6 +75,8 @@ runtime::interpreter::INTExecutable::INTExecutable(const std::string& model_stri
bool runtime::interpreter::INTExecutable::call(const vector<shared_ptr<runtime::Tensor>>& outputs, bool runtime::interpreter::INTExecutable::call(const vector<shared_ptr<runtime::Tensor>>& outputs,
const vector<shared_ptr<runtime::Tensor>>& inputs) const vector<shared_ptr<runtime::Tensor>>& inputs)
{ {
runtime::event::Duration d1("call", "Interpreter");
// convert inputs to HostTensor // convert inputs to HostTensor
vector<shared_ptr<HostTensor>> func_inputs; vector<shared_ptr<HostTensor>> func_inputs;
for (auto tensor : inputs) for (auto tensor : inputs)
...@@ -122,6 +125,7 @@ bool runtime::interpreter::INTExecutable::call(const vector<shared_ptr<runtime:: ...@@ -122,6 +125,7 @@ bool runtime::interpreter::INTExecutable::call(const vector<shared_ptr<runtime::
for (const NodeWrapper& wrapped : m_wrapped_nodes) for (const NodeWrapper& wrapped : m_wrapped_nodes)
{ {
auto op = wrapped.get_node(); auto op = wrapped.get_node();
runtime::event::Duration d2(op->description(), "Interpreter");
auto type_id = wrapped.get_typeid(); auto type_id = wrapped.get_typeid();
if (type_id == OP_TYPEID::Parameter) if (type_id == OP_TYPEID::Parameter)
{ {
...@@ -304,3 +308,74 @@ void runtime::interpreter::INTExecutable::save(ostream& out) ...@@ -304,3 +308,74 @@ void runtime::interpreter::INTExecutable::save(ostream& out)
string model = serialize(m_function, 0); string model = serialize(m_function, 0);
writer.write("model", model.data(), model.size()); writer.write("model", model.data(), model.size());
} }
shared_ptr<ngraph::op::Parameter>
runtime::interpreter::INTExecutable::get_parameter(size_t index) const
{
const ParameterVector& parameters = get_parameters();
NGRAPH_CHECK(index < parameters.size(), "create_tensor for input out of bounds");
return parameters[index];
}
shared_ptr<ngraph::op::Result> runtime::interpreter::INTExecutable::get_result(size_t index) const
{
const ResultVector& results = get_results();
NGRAPH_CHECK(index < results.size(), "create_tensor for output out of bounds");
return results[index];
}
shared_ptr<runtime::Tensor>
runtime::interpreter::INTExecutable::create_input_tensor(size_t input_index)
{
shared_ptr<op::Parameter> parameter = get_parameter(input_index);
return make_shared<runtime::HostTensor>(parameter->get_element_type(), parameter->get_shape());
}
shared_ptr<runtime::Tensor>
runtime::interpreter::INTExecutable::create_output_tensor(size_t output_index)
{
shared_ptr<op::Result> result = get_result(output_index);
return make_shared<runtime::HostTensor>(result->get_element_type(), result->get_shape());
}
vector<shared_ptr<runtime::Tensor>>
runtime::interpreter::INTExecutable::create_input_tensor(size_t input_index,
size_t pipeline_depth)
{
vector<shared_ptr<runtime::HostTensor>> tensors;
shared_ptr<op::Parameter> parameter = get_parameter(input_index);
for (size_t i = 0; i < pipeline_depth; i++)
{
shared_ptr<runtime::HostTensor> tensor;
auto t =
make_shared<runtime::HostTensor>(parameter->get_element_type(), parameter->get_shape());
tensor = static_pointer_cast<runtime::HostTensor>(t);
tensors.push_back(tensor);
}
vector<shared_ptr<runtime::Tensor>> result_tensors;
for (const shared_ptr<runtime::HostTensor>& tensor : tensors)
{
result_tensors.push_back(tensor);
}
return result_tensors;
}
vector<shared_ptr<runtime::Tensor>>
runtime::interpreter::INTExecutable::create_output_tensor(size_t output_index,
size_t pipeline_depth)
{
vector<shared_ptr<runtime::HostTensor>> tensors;
shared_ptr<op::Result> result = get_result(output_index);
for (size_t i = 0; i < pipeline_depth; i++)
{
shared_ptr<runtime::HostTensor> tensor;
auto t = make_shared<runtime::HostTensor>(result->get_element_type(), result->get_shape());
tensor = static_pointer_cast<runtime::HostTensor>(t);
tensors.push_back(tensor);
}
vector<shared_ptr<runtime::Tensor>> result_tensors;
for (const shared_ptr<runtime::HostTensor>& tensor : tensors)
{
result_tensors.push_back(tensor);
}
return result_tensors;
}
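A hypothetical caller-side sketch of the new pipelined tensor creation (the helper below and the depth of 2 are illustrative, not part of this change): each call returns pipeline_depth tensors for the given input or output index, so one slot can be filled or read on the host while another slot is used by a call() in flight.

void run_double_buffered(std::shared_ptr<ngraph::runtime::Executable> exec)
{
    auto in_bufs = exec->create_input_tensor(0, 2);   // two tensors for input 0
    auto out_bufs = exec->create_output_tensor(0, 2); // two tensors for output 0
    // ... fill in_bufs[0] on the host ...
    exec->call({out_bufs[0]}, {in_bufs[0]});          // execute slot 0
    // ... fill in_bufs[1] while out_bufs[0] is consumed, then run slot 1,
    //     alternating between the two slots thereafter ...
}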
...@@ -186,9 +186,21 @@ public: ...@@ -186,9 +186,21 @@ public:
std::vector<PerformanceCounter> get_performance_data() const override; std::vector<PerformanceCounter> get_performance_data() const override;
std::shared_ptr<runtime::Tensor> create_input_tensor(size_t input_index) override;
std::shared_ptr<runtime::Tensor> create_output_tensor(size_t output_index) override;
std::vector<std::shared_ptr<runtime::Tensor>>
create_input_tensor(size_t input_index, size_t pipeline_depth) override;
std::vector<std::shared_ptr<runtime::Tensor>>
create_output_tensor(size_t output_index, size_t pipeline_depth) override;
private: private:
INTExecutable(const std::string& model_string); INTExecutable(const std::string& model_string);
std::shared_ptr<ngraph::op::Parameter> get_parameter(size_t index) const;
std::shared_ptr<ngraph::op::Result> get_result(size_t index) const;
int get_alignment() const { return 64; } int get_alignment() const { return 64; }
bool m_is_compiled = false; bool m_is_compiled = false;
bool m_nan_check_enabled = false; bool m_nan_check_enabled = false;
......
...@@ -17,6 +17,8 @@ ...@@ -17,6 +17,8 @@
set (SRC set (SRC
nbench.cpp nbench.cpp
benchmark.cpp benchmark.cpp
benchmark_pipelined.cpp
benchmark_utils.cpp
) )
add_executable(nbench ${SRC}) add_executable(nbench ${SRC})
......
...@@ -14,12 +14,8 @@ ...@@ -14,12 +14,8 @@
// limitations under the License. // limitations under the License.
//***************************************************************************** //*****************************************************************************
#include <random>
#if defined(__x86_64__) || defined(__amd64__)
#include <xmmintrin.h>
#endif
#include "benchmark.hpp" #include "benchmark.hpp"
#include "benchmark_utils.hpp"
#include "ngraph/file_util.hpp" #include "ngraph/file_util.hpp"
#include "ngraph/runtime/backend.hpp" #include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/host_tensor.hpp" #include "ngraph/runtime/host_tensor.hpp"
...@@ -30,106 +26,6 @@ ...@@ -30,106 +26,6 @@
using namespace std; using namespace std;
using namespace ngraph; using namespace ngraph;
static default_random_engine s_random_engine;
void set_denormals_flush_to_zero()
{
#if defined(__x86_64__) || defined(__amd64__)
// Avoids perf impact from denormals while benchmarking with random data
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
#endif
}
template <typename T>
void init_int_tv(shared_ptr<runtime::Tensor> tv, T min, T max)
{
size_t size = tv->get_element_count();
uniform_int_distribution<T> dist(min, max);
vector<T> vec(size);
for (T& element : vec)
{
element = dist(s_random_engine);
}
tv->write(vec.data(), vec.size() * sizeof(T));
}
template <>
void init_int_tv<char>(shared_ptr<runtime::Tensor> tv, char min, char max)
{
size_t size = tv->get_element_count();
uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
vector<char> vec(size);
for (char& element : vec)
{
element = static_cast<char>(dist(s_random_engine));
}
tv->write(vec.data(), vec.size() * sizeof(char));
}
template <>
void init_int_tv<int8_t>(shared_ptr<runtime::Tensor> tv, int8_t min, int8_t max)
{
size_t size = tv->get_element_count();
uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
vector<int8_t> vec(size);
for (int8_t& element : vec)
{
element = static_cast<int8_t>(dist(s_random_engine));
}
tv->write(vec.data(), vec.size() * sizeof(int8_t));
}
template <>
void init_int_tv<uint8_t>(shared_ptr<runtime::Tensor> tv, uint8_t min, uint8_t max)
{
size_t size = tv->get_element_count();
uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
vector<uint8_t> vec(size);
for (uint8_t& element : vec)
{
element = static_cast<uint8_t>(dist(s_random_engine));
}
tv->write(vec.data(), vec.size() * sizeof(uint8_t));
}
template <typename T>
void init_real_tv(shared_ptr<runtime::Tensor> tv, T min, T max)
{
size_t size = tv->get_element_count();
uniform_real_distribution<T> dist(min, max);
vector<T> vec(size);
for (T& element : vec)
{
element = dist(s_random_engine);
}
tv->write(vec.data(), vec.size() * sizeof(T));
}
static void random_init(shared_ptr<runtime::Tensor> tv)
{
element::Type et = tv->get_element_type();
switch (et.get_type_enum())
{
case element::Type_t::boolean: init_int_tv<char>(tv, 0, 1); break;
case element::Type_t::f32: init_real_tv<float>(tv, -1, 1); break;
case element::Type_t::f64: init_real_tv<double>(tv, -1, 1); break;
case element::Type_t::i8: init_int_tv<int8_t>(tv, -1, 1); break;
case element::Type_t::i16: init_int_tv<int16_t>(tv, -1, 1); break;
case element::Type_t::i32: init_int_tv<int32_t>(tv, 0, 1); break;
case element::Type_t::i64: init_int_tv<int64_t>(tv, 0, 1); break;
case element::Type_t::u8: init_int_tv<uint8_t>(tv, 0, 1); break;
case element::Type_t::u16: init_int_tv<uint16_t>(tv, 0, 1); break;
case element::Type_t::u32: init_int_tv<uint32_t>(tv, 0, 1); break;
case element::Type_t::u64: init_int_tv<uint64_t>(tv, 0, 1); break;
case element::Type_t::undefined:
case element::Type_t::dynamic:
case element::Type_t::bf16:
case element::Type_t::f16:
default: throw runtime_error("unsupported type");
}
}
vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f, vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
const string& backend_name, const string& backend_name,
size_t iterations, size_t iterations,
...@@ -140,7 +36,7 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f, ...@@ -140,7 +36,7 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
stopwatch timer; stopwatch timer;
timer.start(); timer.start();
auto backend = runtime::Backend::create(backend_name); auto backend = runtime::Backend::create(backend_name);
auto compiled_func = backend->compile(f, timing_detail); auto exec = backend->compile(f, timing_detail);
timer.stop(); timer.stop();
cout.imbue(locale("")); cout.imbue(locale(""));
cout << "compile time: " << timer.get_milliseconds() << "ms" << endl; cout << "compile time: " << timer.get_milliseconds() << "ms" << endl;
...@@ -201,7 +97,7 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f, ...@@ -201,7 +97,7 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
} }
} }
} }
compiled_func->call(results, args); exec->call(results, args);
if (copy_data) if (copy_data)
{ {
for (size_t result_index = 0; result_index < results.size(); result_index++) for (size_t result_index = 0; result_index < results.size(); result_index++)
...@@ -217,6 +113,6 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f, ...@@ -217,6 +113,6 @@ vector<runtime::PerformanceCounter> run_benchmark(shared_ptr<Function> f,
float time = t1.get_milliseconds(); float time = t1.get_milliseconds();
cout << time / iterations << "ms per iteration" << endl; cout << time / iterations << "ms per iteration" << endl;
vector<runtime::PerformanceCounter> perf_data = compiled_func->get_performance_data(); vector<runtime::PerformanceCounter> perf_data = exec->get_performance_data();
return perf_data; return perf_data;
} }
...@@ -24,10 +24,6 @@ ...@@ -24,10 +24,6 @@
#include "ngraph/function.hpp" #include "ngraph/function.hpp"
#include "ngraph/runtime/performance_counter.hpp" #include "ngraph/runtime/performance_counter.hpp"
/// performance test utilities
std::multimap<size_t, std::string>
aggregate_timing(const std::vector<ngraph::runtime::PerformanceCounter>& perf_data);
std::vector<ngraph::runtime::PerformanceCounter> run_benchmark(std::shared_ptr<ngraph::Function> f, std::vector<ngraph::runtime::PerformanceCounter> run_benchmark(std::shared_ptr<ngraph::Function> f,
const std::string& backend_name, const std::string& backend_name,
size_t iterations, size_t iterations,
......
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <array>
#include <condition_variable>
#include <mutex>
#include <thread>
#include "benchmark.hpp"
#include "benchmark_utils.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/host_tensor.hpp"
#include "ngraph/runtime/tensor.hpp"
#include "ngraph/serializer.hpp"
#include "ngraph/util.hpp"
using namespace std;
using namespace ngraph;
class TensorCollection
{
public:
vector<shared_ptr<runtime::HostTensor>> parameter_data;
vector<shared_ptr<runtime::HostTensor>> result_data;
vector<shared_ptr<runtime::Tensor>> input_tensors;
vector<shared_ptr<runtime::Tensor>> output_tensors;
};
static mutex s_mutex;
static condition_variable s_condition;
static size_t current_iteration = 0;
static size_t s_iterations;
static size_t s_warmup_iterations;
static stopwatch s_timer;
static void
thread_entry(runtime::Executable* exec, const TensorCollection& tensors, size_t pipeline_stage)
{
bool data_written = false;
const vector<shared_ptr<runtime::Tensor>>& args = tensors.input_tensors;
const vector<shared_ptr<runtime::Tensor>>& results = tensors.output_tensors;
while (current_iteration < s_iterations + s_warmup_iterations)
{
if (!data_written)
{
for (size_t arg_index = 0; arg_index < args.size(); arg_index++)
{
const shared_ptr<runtime::Tensor>& arg = args[arg_index];
if (arg->get_stale())
{
const shared_ptr<runtime::HostTensor>& data = tensors.parameter_data[arg_index];
arg->write(data->get_data_ptr(),
data->get_element_count() * data->get_element_type().size());
}
}
data_written = true;
}
unique_lock<mutex> lock(s_mutex);
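// Even iterations belong to pipeline stage 0, odd iterations to stage 1; wait until it is our turn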
if ((current_iteration & 1) != pipeline_stage)
{
s_condition.wait(lock);
}
else
{
if (current_iteration == s_warmup_iterations)
{
s_timer.start();
}
// our turn to run
exec->call(results, args);
current_iteration++;
data_written = false;
s_condition.notify_all();
lock.unlock();
for (size_t result_index = 0; result_index < results.size(); result_index++)
{
const shared_ptr<runtime::HostTensor>& data = tensors.result_data[result_index];
const shared_ptr<runtime::Tensor>& result = results[result_index];
result->read(data->get_data_ptr(),
data->get_element_count() * data->get_element_type().size());
}
}
}
}
vector<runtime::PerformanceCounter> run_benchmark_pipelined(shared_ptr<Function> f,
const string& backend_name,
size_t iterations,
bool timing_detail,
int warmup_iterations,
bool copy_data)
{
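// Double buffering: two tensor sets and two threads alternate iterations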
constexpr size_t pipeline_depth = 2;
s_iterations = iterations;
s_warmup_iterations = warmup_iterations;
array<TensorCollection, pipeline_depth> tensor_collections;
stopwatch timer;
timer.start();
auto backend = runtime::Backend::create(backend_name);
auto exec = backend->compile(f, timing_detail);
timer.stop();
cout.imbue(locale(""));
cout << "compile time: " << timer.get_milliseconds() << "ms" << endl;
set_denormals_flush_to_zero();
// Create random input data for all input tensors
for (size_t i = 0; i < pipeline_depth; i++)
{
for (shared_ptr<op::Parameter> param : f->get_parameters())
{
auto tensor_data =
make_shared<runtime::HostTensor>(param->get_element_type(), param->get_shape());
random_init(tensor_data);
tensor_collections[i].parameter_data.push_back(tensor_data);
}
}
// Create output tensors for all outputs
for (size_t i = 0; i < pipeline_depth; i++)
{
for (shared_ptr<Node> result : f->get_results())
{
auto tensor_data =
make_shared<runtime::HostTensor>(result->get_element_type(), result->get_shape());
tensor_collections[i].result_data.push_back(tensor_data);
}
}
// Create input tensors for all Parameters
array<vector<shared_ptr<runtime::Tensor>>, pipeline_depth> input_tensors_array;
size_t input_index = 0;
for (shared_ptr<op::Parameter> param : f->get_parameters())
{
auto input_tensors = exec->create_input_tensor(input_index++, pipeline_depth);
for (size_t i = 0; i < pipeline_depth; i++)
{
tensor_collections[i].input_tensors.push_back(input_tensors[i]);
}
}
// Create output tensors for all Results
array<vector<shared_ptr<runtime::Tensor>>, pipeline_depth> output_tensors_array;
size_t output_index = 0;
for (shared_ptr<Node> result : f->get_results())
{
auto output_tensors = exec->create_output_tensor(output_index++, pipeline_depth);
for (size_t i = 0; i < pipeline_depth; i++)
{
tensor_collections[i].output_tensors.push_back(output_tensors[i]);
}
}
thread threads[pipeline_depth];
for (size_t i = 0; i < pipeline_depth; i++)
{
threads[i] = thread(thread_entry, exec.get(), tensor_collections[i], i);
}
for (size_t i = 0; i < pipeline_depth; i++)
{
threads[i].join();
}
s_timer.stop();
float time = s_timer.get_milliseconds();
cout << time / iterations << "ms per iteration" << endl;
vector<runtime::PerformanceCounter> perf_data = exec->get_performance_data();
return perf_data;
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "ngraph/function.hpp"
#include "ngraph/runtime/performance_counter.hpp"
std::vector<ngraph::runtime::PerformanceCounter>
run_benchmark_pipelined(std::shared_ptr<ngraph::Function> f,
const std::string& backend_name,
size_t iterations,
bool timing_detail,
int warmup_iterations,
bool copy_data);
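A minimal call sketch for the declaration above, assuming model_json already holds a serialized nGraph function (the backend name and iteration counts are placeholders):
// Sketch only
std::shared_ptr<ngraph::Function> f = ngraph::deserialize(model_json);
std::vector<ngraph::runtime::PerformanceCounter> perf =
    run_benchmark_pipelined(f, "INTERPRETER", 10 /*iterations*/, false /*timing_detail*/,
                            1 /*warmup_iterations*/, true /*copy_data*/);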
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#if defined(__x86_64__) || defined(__amd64__)
#include <xmmintrin.h>
#endif
#include "benchmark_utils.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/host_tensor.hpp"
#include "ngraph/runtime/tensor.hpp"
#include "ngraph/serializer.hpp"
#include "ngraph/util.hpp"
using namespace std;
using namespace ngraph;
template <>
void init_int_tensor<char>(shared_ptr<runtime::Tensor> tensor, char min, char max)
{
size_t size = tensor->get_element_count();
uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
vector<char> vec(size);
for (char& element : vec)
{
element = static_cast<char>(dist(get_random_engine()));
}
tensor->write(vec.data(), vec.size() * sizeof(char));
}
template <>
void init_int_tensor<int8_t>(shared_ptr<runtime::Tensor> tensor, int8_t min, int8_t max)
{
size_t size = tensor->get_element_count();
uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
vector<int8_t> vec(size);
for (int8_t& element : vec)
{
element = static_cast<int8_t>(dist(get_random_engine()));
}
tensor->write(vec.data(), vec.size() * sizeof(int8_t));
}
template <>
void init_int_tensor<uint8_t>(shared_ptr<runtime::Tensor> tensor, uint8_t min, uint8_t max)
{
size_t size = tensor->get_element_count();
uniform_int_distribution<int16_t> dist(static_cast<short>(min), static_cast<short>(max));
vector<uint8_t> vec(size);
for (uint8_t& element : vec)
{
element = static_cast<uint8_t>(dist(get_random_engine()));
}
tensor->write(vec.data(), vec.size() * sizeof(uint8_t));
}
void set_denormals_flush_to_zero()
{
#if defined(__x86_64__) || defined(__amd64__)
// Avoids perf impact from denormals while benchmarking with random data
_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
#endif
}
void random_init(shared_ptr<runtime::Tensor> tensor)
{
element::Type et = tensor->get_element_type();
#if !(defined(__GNUC__) && (__GNUC__ == 4 && __GNUC_MINOR__ == 8))
#pragma GCC diagnostic push
#pragma GCC diagnostic error "-Wswitch"
#pragma GCC diagnostic error "-Wswitch-enum"
#endif
switch (et.get_type_enum())
{
case element::Type_t::boolean: init_int_tensor<char>(tensor, 0, 1); break;
case element::Type_t::f32: init_real_tensor<float>(tensor, -1, 1); break;
case element::Type_t::f64: init_real_tensor<double>(tensor, -1, 1); break;
case element::Type_t::i8: init_int_tensor<int8_t>(tensor, -1, 1); break;
case element::Type_t::i16: init_int_tensor<int16_t>(tensor, -1, 1); break;
case element::Type_t::i32: init_int_tensor<int32_t>(tensor, 0, 1); break;
case element::Type_t::i64: init_int_tensor<int64_t>(tensor, 0, 1); break;
case element::Type_t::u8: init_int_tensor<uint8_t>(tensor, 0, 1); break;
case element::Type_t::u16: init_int_tensor<uint16_t>(tensor, 0, 1); break;
case element::Type_t::u32: init_int_tensor<uint32_t>(tensor, 0, 1); break;
case element::Type_t::u64: init_int_tensor<uint64_t>(tensor, 0, 1); break;
case element::Type_t::undefined:
case element::Type_t::dynamic:
case element::Type_t::bf16:
case element::Type_t::f16:
default: throw runtime_error("unsupported type");
}
#if !(defined(__GNUC__) && (__GNUC__ == 4 && __GNUC_MINOR__ == 8))
#pragma GCC diagnostic pop
#endif
}
default_random_engine& get_random_engine()
{
static std::default_random_engine s_random_engine;
return s_random_engine;
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <random>
#include "benchmark.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/host_tensor.hpp"
#include "ngraph/runtime/tensor.hpp"
#include "ngraph/serializer.hpp"
#include "ngraph/util.hpp"
void set_denormals_flush_to_zero();
void random_init(std::shared_ptr<ngraph::runtime::Tensor> tensor);
std::default_random_engine& get_random_engine();
template <typename T>
void init_int_tensor(std::shared_ptr<ngraph::runtime::Tensor> tensor, T min, T max)
{
size_t size = tensor->get_element_count();
std::uniform_int_distribution<T> dist(min, max);
std::vector<T> vec(size);
for (T& element : vec)
{
element = dist(get_random_engine());
}
tensor->write(vec.data(), vec.size() * sizeof(T));
}
template <typename T>
void init_real_tensor(std::shared_ptr<ngraph::runtime::Tensor> tensor, T min, T max)
{
size_t size = tensor->get_element_count();
std::uniform_real_distribution<T> dist(min, max);
std::vector<T> vec(size);
for (T& element : vec)
{
element = dist(get_random_engine());
}
tensor->write(vec.data(), vec.size() * sizeof(T));
}
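As a usage sketch for these helpers (element types and shapes are arbitrary examples):
// Sketch: fill host tensors with random data appropriate to their element types
set_denormals_flush_to_zero();
auto t_f32 =
    std::make_shared<ngraph::runtime::HostTensor>(ngraph::element::f32, ngraph::Shape{2, 3});
random_init(t_f32); // dispatches to init_real_tensor<float>
auto t_i32 =
    std::make_shared<ngraph::runtime::HostTensor>(ngraph::element::i32, ngraph::Shape{2, 3});
init_int_tensor<int32_t>(t_i32, 0, 9); // explicit integer range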
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
#include <iomanip> #include <iomanip>
#include "benchmark.hpp" #include "benchmark.hpp"
#include "benchmark_pipelined.hpp"
#include "ngraph/distributed.hpp" #include "ngraph/distributed.hpp"
#include "ngraph/except.hpp" #include "ngraph/except.hpp"
#include "ngraph/file_util.hpp" #include "ngraph/file_util.hpp"
...@@ -181,6 +182,7 @@ int main(int argc, char** argv) ...@@ -181,6 +182,7 @@ int main(int argc, char** argv)
int warmup_iterations = 1; int warmup_iterations = 1;
bool copy_data = true; bool copy_data = true;
bool dot_file = false; bool dot_file = false;
bool double_buffer = false;
for (size_t i = 1; i < argc; i++) for (size_t i = 1; i < argc; i++)
{ {
...@@ -229,6 +231,10 @@ int main(int argc, char** argv) ...@@ -229,6 +231,10 @@ int main(int argc, char** argv)
{ {
directory = argv[++i]; directory = argv[++i];
} }
else if (arg == "--double_buffer")
{
double_buffer = true;
}
else if (arg == "-w" || arg == "--warmup_iterations") else if (arg == "-w" || arg == "--warmup_iterations")
{ {
try try
...@@ -283,6 +289,7 @@ OPTIONS ...@@ -283,6 +289,7 @@ OPTIONS
-w|--warmup_iterations Number of warm-up iterations -w|--warmup_iterations Number of warm-up iterations
--no_copy_data Disable copy of input/result data every iteration --no_copy_data Disable copy of input/result data every iteration
--dot Generate Graphviz dot file --dot Generate Graphviz dot file
--double_buffer Double buffer inputs and outputs
)###"; )###";
return 1; return 1;
} }
...@@ -420,8 +427,17 @@ OPTIONS ...@@ -420,8 +427,17 @@ OPTIONS
{ {
cout << "\n---- Benchmark ----\n"; cout << "\n---- Benchmark ----\n";
shared_ptr<Function> f = deserialize(model); shared_ptr<Function> f = deserialize(model);
auto perf_data = run_benchmark( vector<runtime::PerformanceCounter> perf_data;
f, backend, iterations, timing_detail, warmup_iterations, copy_data); if (double_buffer)
{
perf_data = run_benchmark_pipelined(
f, backend, iterations, timing_detail, warmup_iterations, copy_data);
}
else
{
perf_data = run_benchmark(
f, backend, iterations, timing_detail, warmup_iterations, copy_data);
}
auto perf_shape = to_perf_shape(f, perf_data); auto perf_shape = to_perf_shape(f, perf_data);
aggregate_perf_data.insert( aggregate_perf_data.insert(
aggregate_perf_data.end(), perf_shape.begin(), perf_shape.end()); aggregate_perf_data.end(), perf_shape.begin(), perf_shape.end());
......
...@@ -733,8 +733,8 @@ NGRAPH_TEST(${BACKEND_NAME}, batch_norm_bprop_n4c3h2w2) ...@@ -733,8 +733,8 @@ NGRAPH_TEST(${BACKEND_NAME}, batch_norm_bprop_n4c3h2w2)
auto C = std::make_shared<op::Parameter>(element::f32, shape_r); auto C = std::make_shared<op::Parameter>(element::f32, shape_r);
auto zero = ngraph::make_zero(bn_dgamma->get_element_type(), bn_dgamma->get_shape()); auto zero = ngraph::make_zero(bn_dgamma->get_element_type(), bn_dgamma->get_shape());
ngraph::autodiff::Adjoints adjoints(NodeVector{bn_dx, bn_dgamma, bn_dbeta}, ngraph::autodiff::Adjoints adjoints(OutputVector{bn_dx, bn_dgamma, bn_dbeta},
NodeVector{C, zero, zero}); OutputVector{C, zero, zero});
auto dinput = adjoints.backprop_node(input); auto dinput = adjoints.backprop_node(input);
auto dgamma = adjoints.backprop_node(gamma); auto dgamma = adjoints.backprop_node(gamma);
......
...@@ -257,10 +257,10 @@ NGRAPH_TEST(${BACKEND_NAME}, divide_adjoint_stability) ...@@ -257,10 +257,10 @@ NGRAPH_TEST(${BACKEND_NAME}, divide_adjoint_stability)
auto B = make_shared<op::Parameter>(element::f32, shape); auto B = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Divide>(A, B), ParameterVector{A, B}); auto f = make_shared<Function>(make_shared<op::Divide>(A, B), ParameterVector{A, B});
auto Y_out = f->get_output_op(0); auto Y_out = f->output(0);
auto Xs = f->get_parameters(); auto Xs = f->get_parameters();
auto C = std::make_shared<op::Parameter>(Y_out->get_element_type(), Y_out->get_shape()); auto C = std::make_shared<op::Parameter>(Y_out.get_element_type(), Y_out.get_shape());
ngraph::autodiff::Adjoints adjoints(NodeVector{Y_out}, NodeVector{C}); ngraph::autodiff::Adjoints adjoints(OutputVector{Y_out}, OutputVector{C});
std::vector<std::shared_ptr<Node>> dYdXs(Xs.size()); std::vector<std::shared_ptr<Node>> dYdXs(Xs.size());
transform( transform(
Xs.begin(), Xs.end(), dYdXs.begin(), [C, &adjoints](const std::shared_ptr<Node>& X) { Xs.begin(), Xs.end(), dYdXs.begin(), [C, &adjoints](const std::shared_ptr<Node>& X) {
......
...@@ -507,7 +507,8 @@ TEST(cpu_fusion, conv_bias_bprop_n1c1h3w3) ...@@ -507,7 +507,8 @@ TEST(cpu_fusion, conv_bias_bprop_n1c1h3w3)
auto f = make_shared<Function>( auto f = make_shared<Function>(
convolution_bias, ParameterVector{conv_test.data, conv_test.weights, conv_test.bias}); convolution_bias, ParameterVector{conv_test.data, conv_test.weights, conv_test.bias});
ngraph::autodiff::Adjoints adjoints(NodeVector{convolution_bias}, NodeVector{conv_test.delta}); ngraph::autodiff::Adjoints adjoints(OutputVector{convolution_bias},
OutputVector{conv_test.delta});
auto d_data = adjoints.backprop_node(conv_test.data); auto d_data = adjoints.backprop_node(conv_test.data);
auto d_weights = adjoints.backprop_node(conv_test.weights); auto d_weights = adjoints.backprop_node(conv_test.weights);
...@@ -546,7 +547,7 @@ TEST(cpu_fusion, conv_bias_bprop) ...@@ -546,7 +547,7 @@ TEST(cpu_fusion, conv_bias_bprop)
pass_manager.register_pass<pass::VisualizeTree>("conv_bias_bprop_fusion.png"); pass_manager.register_pass<pass::VisualizeTree>("conv_bias_bprop_fusion.png");
auto f = make_shared<Function>(conv_bias, ParameterVector{data_batch, filters, bias}); auto f = make_shared<Function>(conv_bias, ParameterVector{data_batch, filters, bias});
ngraph::autodiff::Adjoints adjoints(NodeVector{conv_bias}, NodeVector{delta}); ngraph::autodiff::Adjoints adjoints(OutputVector{conv_bias}, OutputVector{delta});
auto d_data = adjoints.backprop_node(data_batch); auto d_data = adjoints.backprop_node(data_batch);
auto d_weights = adjoints.backprop_node(filters); auto d_weights = adjoints.backprop_node(filters);
...@@ -1452,7 +1453,7 @@ TEST(cpu_fusion, max_pool_with_indices) ...@@ -1452,7 +1453,7 @@ TEST(cpu_fusion, max_pool_with_indices)
auto max_pool = std::make_shared<op::MaxPool>(input, window_shape); auto max_pool = std::make_shared<op::MaxPool>(input, window_shape);
auto C = std::make_shared<op::Parameter>(element::f32, max_pool->get_shape()); auto C = std::make_shared<op::Parameter>(element::f32, max_pool->get_shape());
ngraph::autodiff::Adjoints adjoints(NodeVector{max_pool}, NodeVector{C}); ngraph::autodiff::Adjoints adjoints(ngraph::OutputVector{max_pool}, ngraph::OutputVector{C});
auto dinput = adjoints.backprop_node(input); auto dinput = adjoints.backprop_node(input);
...@@ -1789,14 +1790,14 @@ static std::shared_ptr<ngraph::Function> make_forward_function() ...@@ -1789,14 +1790,14 @@ static std::shared_ptr<ngraph::Function> make_forward_function()
return std::make_shared<Function>(NodeVector{max_pool, neg, absn}, ParameterVector{input}); return std::make_shared<Function>(NodeVector{max_pool, neg, absn}, ParameterVector{input});
} }
static std::pair<std::shared_ptr<ngraph::Function>, std::vector<std::shared_ptr<ngraph::Node>>> static std::pair<std::shared_ptr<ngraph::Function>, OutputVector>
make_backward_function(std::shared_ptr<ngraph::Function> f) make_backward_function(std::shared_ptr<ngraph::Function> f)
{ {
// get parameters // get parameters
std::vector<std::shared_ptr<ngraph::op::Parameter>> back_parameters = f->get_parameters(); std::vector<std::shared_ptr<ngraph::op::Parameter>> back_parameters = f->get_parameters();
ngraph::NodeVector adjoints; ngraph::OutputVector adjoints;
ngraph::NodeVector outputs; ngraph::OutputVector outputs;
for (auto Y : f->get_results()) for (auto Y : f->get_results())
{ {
// Get the output // Get the output
...@@ -1809,7 +1810,7 @@ static std::pair<std::shared_ptr<ngraph::Function>, std::vector<std::shared_ptr< ...@@ -1809,7 +1810,7 @@ static std::pair<std::shared_ptr<ngraph::Function>, std::vector<std::shared_ptr<
ngraph::autodiff::Adjoints adjoint{outputs, adjoints}; ngraph::autodiff::Adjoints adjoint{outputs, adjoints};
// Perform autodiff // Perform autodiff
std::vector<std::shared_ptr<Node>> dYdXs(back_parameters.size()); OutputVector dYdXs(back_parameters.size());
transform(back_parameters.begin(), transform(back_parameters.begin(),
back_parameters.end(), back_parameters.end(),
dYdXs.begin(), dYdXs.begin(),
...@@ -1818,7 +1819,8 @@ static std::pair<std::shared_ptr<ngraph::Function>, std::vector<std::shared_ptr< ...@@ -1818,7 +1819,8 @@ static std::pair<std::shared_ptr<ngraph::Function>, std::vector<std::shared_ptr<
// create the backward function // create the backward function
std::vector<std::shared_ptr<ngraph::op::Parameter>> param_adjoints; std::vector<std::shared_ptr<ngraph::op::Parameter>> param_adjoints;
for (auto n : adjoints) for (auto n : adjoints)
param_adjoints.push_back(std::dynamic_pointer_cast<ngraph::op::Parameter>(n)); param_adjoints.push_back(
std::dynamic_pointer_cast<ngraph::op::Parameter>(n.get_node_shared_ptr()));
back_parameters.insert(back_parameters.begin(), param_adjoints.begin(), param_adjoints.end()); back_parameters.insert(back_parameters.begin(), param_adjoints.begin(), param_adjoints.end());
return {std::make_shared<ngraph::Function>(dYdXs, back_parameters), adjoints}; return {std::make_shared<ngraph::Function>(dYdXs, back_parameters), adjoints};
...@@ -2703,7 +2705,7 @@ void sigmoid_multiply_fusion_backward_compute(runtime::Backend* backend, ...@@ -2703,7 +2705,7 @@ void sigmoid_multiply_fusion_backward_compute(runtime::Backend* backend,
auto sigmoid_mul = auto sigmoid_mul =
make_shared<op::SigmoidMultiply>(input_0_alt, input_1_alt, input_0_type, input_1_type); make_shared<op::SigmoidMultiply>(input_0_alt, input_1_alt, input_0_type, input_1_type);
ngraph::autodiff::Adjoints adjoints(NodeVector{sigmoid_mul}, NodeVector{delta_param}); ngraph::autodiff::Adjoints adjoints(OutputVector{sigmoid_mul}, OutputVector{delta_param});
auto d_input_0 = adjoints.backprop_node(input_0_adjoint); auto d_input_0 = adjoints.backprop_node(input_0_adjoint);
auto d_input_1 = adjoints.backprop_node(input_1_adjoint); auto d_input_1 = adjoints.backprop_node(input_1_adjoint);
auto df = make_shared<Function>(NodeVector{d_input_0, d_input_1}, back_params); auto df = make_shared<Function>(NodeVector{d_input_0, d_input_1}, back_params);
......
...@@ -144,7 +144,7 @@ namespace ngraph ...@@ -144,7 +144,7 @@ namespace ngraph
// df/dX* // df/dX*
std::vector<std::shared_ptr<Node>> df_output_params; std::vector<std::shared_ptr<Node>> df_output_params;
Adjoints adjoints(NodeVector{f->get_output_op(0)}, NodeVector{c_param}); Adjoints adjoints(OutputVector{f->output(0)}, OutputVector{c_param});
// for each x "of interest" // for each x "of interest"
for (auto x : indep_params) for (auto x : indep_params)
......
...@@ -32,10 +32,10 @@ using namespace ngraph; ...@@ -32,10 +32,10 @@ using namespace ngraph;
std::shared_ptr<Function> autodiff::backprop_function(const std::shared_ptr<Function>& f) std::shared_ptr<Function> autodiff::backprop_function(const std::shared_ptr<Function>& f)
{ {
auto Y_out = f->get_output_op(0); auto Y_out = f->output(0);
auto Xs = f->get_parameters(); auto Xs = f->get_parameters();
auto C = std::make_shared<op::Parameter>(Y_out->get_element_type(), Y_out->get_shape()); auto C = std::make_shared<op::Parameter>(Y_out.get_element_type(), Y_out.get_shape());
Adjoints adjoints(NodeVector{Y_out}, NodeVector{C}); Adjoints adjoints(OutputVector{Y_out}, OutputVector{C});
std::vector<std::shared_ptr<Node>> dYdXs(Xs.size()); std::vector<std::shared_ptr<Node>> dYdXs(Xs.size());
transform(Xs.begin(), Xs.end(), dYdXs.begin(), [C, &adjoints](const std::shared_ptr<Node>& X) { transform(Xs.begin(), Xs.end(), dYdXs.begin(), [C, &adjoints](const std::shared_ptr<Node>& X) {
return adjoints.backprop_node(X); return adjoints.backprop_node(X);
......
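The test and autodiff changes above all follow the same migration: Adjoints construction and graph outputs take OutputVector (Output<Node> handles) rather than NodeVector. A condensed sketch of the updated call shape, assuming f is an existing ngraph::Function with at least one parameter:
// Sketch of the migrated pattern
ngraph::Output<ngraph::Node> y = f->output(0); // previously f->get_output_op(0)
auto c = std::make_shared<ngraph::op::Parameter>(y.get_element_type(), y.get_shape());
ngraph::autodiff::Adjoints adjoints(ngraph::OutputVector{y}, ngraph::OutputVector{c});
auto dx = adjoints.backprop_node(f->get_parameters().at(0));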