Unverified Commit 9ede40cc authored by Robert Kimball, committed by GitHub

Merge branch 'master' into tfl/gpu_framework_codegen

parents ba9a2a25 3380928c
......@@ -58,6 +58,7 @@ nervana_aeon.egg-info/
# vim
*.swp
*.swo
tags
build/
scripts/
......
......@@ -18,3 +18,4 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DIN_NGRAPH_LIBRARY")
add_subdirectory(resource)
add_subdirectory(ngraph)
add_subdirectory(tools)
......@@ -76,6 +76,7 @@ set (SRC
ops/sum.cpp
ops/tan.cpp
ops/tanh.cpp
ops/util/arithmetic_reduction.cpp
ops/util/binary_elementwise_arithmetic.cpp
ops/util/binary_elementwise_comparison.cpp
ops/util/binary_elementwise.cpp
......@@ -175,6 +176,8 @@ if (NGRAPH_CPU_ENABLE AND LLVM_INCLUDE_DIR AND
runtime/cpu/cpu_tensor_view_wrapper.cpp
runtime/cpu/cpu_layout_descriptor.cpp
runtime/cpu/cpu_tracing.cpp
runtime/cpu/mkldnn_emitter.cpp
runtime/cpu/mkldnn_invoke.cpp
runtime/cpu/mkldnn_utils.cpp
runtime/cpu/ops/convert_layout.cpp
runtime/cpu/ops/matmul_bias.cpp
......
......@@ -90,8 +90,10 @@
#include "ngraph/ops/less.hpp"
#include "ngraph/ops/less_eq.hpp"
#include "ngraph/ops/log.hpp"
#include "ngraph/ops/max.hpp"
#include "ngraph/ops/max_pool.hpp"
#include "ngraph/ops/maximum.hpp"
#include "ngraph/ops/min.hpp"
#include "ngraph/ops/minimum.hpp"
#include "ngraph/ops/multiply.hpp"
#include "ngraph/ops/negative.hpp"
......@@ -102,6 +104,7 @@
#include "ngraph/ops/pad.hpp"
#include "ngraph/ops/parameter.hpp"
#include "ngraph/ops/power.hpp"
#include "ngraph/ops/product.hpp"
#include "ngraph/ops/reduce.hpp"
#include "ngraph/ops/reduce_window.hpp"
#include "ngraph/ops/remainder.hpp"
......
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include "ngraph/ops/util/arithmetic_reduction.hpp"
namespace ngraph
{
namespace op
{
/// \brief Max-reduction operation.
///
/// Reduces the tensor, eliminating the specified reduction axes by taking the maximum element.
///
/// This is equivalent to Reduce where `arg_init` = -inf and `reduction_function` is \f$f(x,y) = max(x,y)\f$.
///
/// ## Parameters
///
/// | | Description |
/// | -------------------- | -------------------------------------------- |
/// | `reduction_axes` | The axes to eliminate through max-reduction. |
///
/// ## Inputs
///
/// | | Type | Description |
/// | ----- | --------------------------------- | ------------------------------------------------------ |
/// | `arg` | \f$N[d_1,\dots,d_n]~(n \geq 0)\f$ | An input tensor of any shape and numeric element type. |
///
/// ## Output
///
/// | Type | Description |
/// | ----------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------- |
/// | \f$N[\textit{delete}(A,d_1,\dots,d_n)]\f$ | The tensor \f$T\f$, where \f$T\f$ is the input tensor with the `reduction_axes` \f$A\f$ eliminated by taking the maximum element. |
class Max : public util::ArithmeticReduction
{
public:
/// \brief Constructs a max-reduction operation.
///
/// \param arg The tensor view to be reduced.
/// \param reduction_axes The axis positions (0-based) to be eliminated.
Max(const std::shared_ptr<Node>& arg, const AxisSet& reduction_axes)
: ArithmeticReduction("Max", arg, reduction_axes)
{
}
virtual std::shared_ptr<Node> copy_with_new_args(
const std::vector<std::shared_ptr<Node>>& new_args) const override
{
if (new_args.size() != 1)
{
throw ngraph_error("Incorrect number of new arguments");
}
return std::make_shared<Max>(new_args.at(0), m_reduction_axes);
}
};
}
}
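// Editor's sketch (not part of this change): minimal use of the new Max op,
// mirroring the style of the type_prop tests later in this diff.
#include <memory>
#include "ngraph/ops/max.hpp"
#include "ngraph/ops/parameter.hpp"
void max_usage_sketch()
{
    // A 3x2 f32 input; reducing over axis 0 eliminates the rows.
    auto arg = std::make_shared<ngraph::op::Parameter>(ngraph::element::f32, ngraph::Shape{3, 2});
    auto max = std::make_shared<ngraph::op::Max>(arg, ngraph::AxisSet{0});
    // max->get_element_type() == ngraph::element::f32
    // max->get_shape() == ngraph::Shape{2}
}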
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include "ngraph/ops/util/arithmetic_reduction.hpp"
namespace ngraph
{
namespace op
{
/// \brief Min-reduction operation.
///
/// Reduces the tensor, eliminating the specified reduction axes by taking the minimum element.
///
/// This is equivalent to Reduce where `arg_init` = +inf and `reduction_function` is \f$f(x,y) = min(x,y)\f$.
///
/// ## Parameters
///
/// | | Description |
/// | -------------------- | -------------------------------------------- |
/// | `reduction_axes` | The axes to eliminate through min-reduction. |
///
/// ## Inputs
///
/// | | Type | Description |
/// | ----- | --------------------------------- | ------------------------------------------------------ |
/// | `arg` | \f$N[d_1,\dots,d_n]~(n \geq 0)\f$ | An input tensor of any shape and numeric element type. |
///
/// ## Output
///
/// | Type | Description |
/// | ----------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------- |
/// | \f$N[\textit{delete}(A,d_1,\dots,d_n)]\f$ | The tensor \f$T\f$, where \f$T\f$ is the input tensor with the `reduction_axes` \f$A\f$ eliminated by taking the minimum element. |
class Min : public util::ArithmeticReduction
{
public:
/// \brief Constructs a min-reduction operation.
///
/// \param arg The tensor view to be reduced.
/// \param reduction_axes The axis positions (0-based) to be eliminated.
Min(const std::shared_ptr<Node>& arg, const AxisSet& reduction_axes)
: ArithmeticReduction("Min", arg, reduction_axes)
{
}
virtual std::shared_ptr<Node> copy_with_new_args(
const std::vector<std::shared_ptr<Node>>& new_args) const override
{
if (new_args.size() != 1)
{
throw ngraph_error("Incorrect number of new arguments");
}
return std::make_shared<Min>(new_args.at(0), m_reduction_axes);
}
};
}
}
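// Editor's sketch (not part of this change): Min (like Max and Product)
// inherits ArithmeticReduction's bounds check, so an out-of-range axis throws.
#include <memory>
#include "ngraph/ops/min.hpp"
#include "ngraph/ops/parameter.hpp"
void min_bounds_sketch()
{
    auto arg = std::make_shared<ngraph::op::Parameter>(ngraph::element::f32, ngraph::Shape{2, 4});
    try
    {
        // Axis 2 is out of bounds for a rank-2 input.
        auto bad = std::make_shared<ngraph::op::Min>(arg, ngraph::AxisSet{0, 2});
    }
    catch (const ngraph::ngraph_error& e)
    {
        // "Reduction axis for arithmetic reduction operator is out of bounds"
    }
}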
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include "ngraph/ops/util/arithmetic_reduction.hpp"
namespace ngraph
{
namespace op
{
/// \brief Product reduction operation.
///
/// Reduces the tensor, eliminating the specified reduction axes by taking the product.
///
/// \f[
/// \mathit{product}\left(\{0\},
/// \left[ \begin{array}{ccc}
/// 1 & 2 \\
/// 3 & 4 \\
/// 5 & 6 \end{array} \right]\right) =
/// \left[ (1 * 3 * 5), (2 * 4 * 6) \right] =
/// \left[ 15, 48 \right]~~~\text{(dimension 0 (rows) is eliminated)}
/// \f]
///
/// \f[
/// \mathit{product}\left(\{1\},
/// \left[ \begin{array}{ccc}
/// 1 & 2 \\
/// 3 & 4 \\
/// 5 & 6 \end{array} \right]\right) =
/// \left[ (1 * 2), (3 * 4), (5 * 6) \right] =
/// \left[ 2, 12, 30 \right]~~~\text{(dimension 1 (columns) is eliminated)}
/// \f]
///
/// \f[
/// \mathit{product}\left(\{0,1\},
/// \left[ \begin{array}{ccc}
/// 1 & 2 \\
/// 3 & 4 \\
/// 5 & 6 \end{array} \right]\right) =
/// (1 * 2) * (3 * 4) * (5 * 6) =
/// 720~~~\text{(both dimensions (rows and columns) are eliminated)}
/// \f]
///
/// This is equivalent to Reduce where `arg_init` = 1 and `reduction_function` is \f$f(x,y) = x*y\f$.
///
/// ## Parameters
///
/// | | Description |
/// | -------------------- | -------------------------------------- |
/// | `reduction_axes` | The axes to eliminate through product. |
///
/// ## Inputs
///
/// | | Type | Description |
/// | ----- | --------------------------------- | ------------------------------------------------------ |
/// | `arg` | \f$N[d_1,\dots,d_n]~(n \geq 0)\f$ | An input tensor of any shape and numeric element type. |
///
/// ## Output
///
/// | Type | Description |
/// | ----------------------------------------- | -------------------------------------------------------------------------------------------------------------- |
/// | \f$N[\textit{delete}(A,d_1,\dots,d_n)]\f$ | The tensor \f$T\f$, where \f$T\f$ is the input tensor with the `reduction_axes` \f$A\f$ eliminated by product. |
class Product : public util::ArithmeticReduction
{
public:
/// \brief Constructs a product reduction operation.
///
/// \param arg The tensor view to be reduced.
/// \param reduction_axes The axis positions (0-based) to be eliminated.
Product(const std::shared_ptr<Node>& arg, const AxisSet& reduction_axes)
: ArithmeticReduction("Product", arg, reduction_axes)
{
}
virtual std::shared_ptr<Node> copy_with_new_args(
const std::vector<std::shared_ptr<Node>>& new_args) const override
{
if (new_args.size() != 1)
{
throw ngraph_error("Incorrect number of new arguments");
}
return std::make_shared<Product>(new_args.at(0), m_reduction_axes);
}
};
}
}
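// Editor's sketch (not part of this change): the worked examples above, in code.
#include <memory>
#include "ngraph/ops/parameter.hpp"
#include "ngraph/ops/product.hpp"
void product_usage_sketch()
{
    // For input [[1, 2], [3, 4], [5, 6]], reducing over axis 0 yields [15, 48].
    auto arg = std::make_shared<ngraph::op::Parameter>(ngraph::element::i32, ngraph::Shape{3, 2});
    auto prod = std::make_shared<ngraph::op::Product>(arg, ngraph::AxisSet{0});
    // prod->get_shape() == ngraph::Shape{2}; reducing over {0, 1} instead
    // would yield a scalar (Shape{}) holding 720.
}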
......@@ -15,46 +15,11 @@
*******************************************************************************/
#include "ngraph/ops/sum.hpp"
#include "ngraph/function.hpp"
#include "ngraph/ops/broadcast.hpp"
using namespace std;
using namespace ngraph;
op::Sum::Sum(const std::shared_ptr<Node>& arg, const AxisSet& reduction_axes)
: RequiresTensorViewArgs("Sum", {arg})
, m_reduction_axes(reduction_axes)
{
auto& input = get_inputs().at(0);
auto& input_element_type = input.get_element_type();
if (input_element_type == element::boolean)
{
throw ngraph_error("Argument for sum must have numeric element type");
}
auto input_shape = input.get_shape();
for (auto axis : m_reduction_axes)
{
if (axis >= input_shape.size())
{
throw ngraph_error("Reduction axis for sum is out of bounds");
}
}
Shape result_shape;
for (size_t i = 0; i < input_shape.size(); i++)
{
if (m_reduction_axes.count(i) == 0)
{
result_shape.push_back(input_shape.at(i));
}
}
set_value_type_checked(input.get_element_type(), result_shape);
}
void op::Sum::generate_adjoints(autodiff::Adjoints& adjoints, const std::shared_ptr<Node>& delta)
{
auto x = get_inputs().at(0).get_output().get_node();
......
......@@ -16,7 +16,7 @@
#pragma once
#include "ngraph/ops/util/requires_tensor_view_args.hpp"
#include "ngraph/ops/util/arithmetic_reduction.hpp"
namespace ngraph
{
......@@ -76,14 +76,17 @@ namespace ngraph
/// | Type | Description |
/// | ----------------------------------------- | ---------------------------------------------------------------------------------------------------------------- |
/// | \f$N[\textit{delete}(A,d_1,\dots,d_n)]\f$ | The tensor \f$T\f$, where \f$T\f$ is the input tensor with the `reduction_axes` \f$A\f$ eliminated by summation. |
class Sum : public util::RequiresTensorViewArgs
class Sum : public util::ArithmeticReduction
{
public:
/// \brief Constructs a summation operation.
///
/// \param arg The tensor view to be summed.
/// \param reduction_axes The axis positions (0-based) to be eliminated.
Sum(const std::shared_ptr<Node>& arg, const AxisSet& reduction_axes);
Sum(const std::shared_ptr<Node>& arg, const AxisSet& reduction_axes)
: ArithmeticReduction("Sum", arg, reduction_axes)
{
}
virtual std::shared_ptr<Node> copy_with_new_args(
const std::vector<std::shared_ptr<Node>>& new_args) const override
......@@ -95,13 +98,9 @@ namespace ngraph
return std::make_shared<Sum>(new_args.at(0), m_reduction_axes);
}
/// \return The axis positions (0-based) to be eliminated through summation.
const AxisSet& get_reduction_axes() const { return m_reduction_axes; }
protected:
virtual void generate_adjoints(autodiff::Adjoints& adjoints,
const std::shared_ptr<Node>& delta) override;
AxisSet m_reduction_axes;
};
}
}
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "ngraph/ops/util/arithmetic_reduction.hpp"
using namespace std;
using namespace ngraph;
op::util::ArithmeticReduction::ArithmeticReduction(const std::string& node_type,
const std::shared_ptr<Node>& arg,
const AxisSet& reduction_axes)
: RequiresTensorViewArgs(node_type, {arg})
, m_reduction_axes(reduction_axes)
{
auto& input = get_inputs().at(0);
auto input_shape = input.get_shape();
for (auto axis : m_reduction_axes)
{
if (axis >= input_shape.size())
{
throw ngraph_error("Reduction axis for arithmetic reduction operator is out of bounds");
}
}
Shape result_shape;
for (size_t i = 0; i < input_shape.size(); i++)
{
if (m_reduction_axes.count(i) == 0)
{
result_shape.push_back(input_shape.at(i));
}
}
set_value_type_checked(input.get_element_type(), result_shape);
}
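// Editor's note: a hand-worked example of the shape rule implemented above
// (illustration only):
//   input_shape = {2, 4, 6}, reduction_axes = {0, 2}  ->  result_shape = {4}
//   input_shape = {2, 4, 6}, reduction_axes = {}      ->  result_shape = {2, 4, 6}
// The element type is carried through unchanged.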
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include "ngraph/ops/util/requires_tensor_view_args.hpp"
namespace ngraph
{
namespace op
{
namespace util
{
/// \brief Abstract base class for arithmetic reduction operations, i.e., operations where chosen axes of the input tensors
/// are eliminated (reduced out) by repeated application of a particular binary arithmetic operation.
class ArithmeticReduction : public RequiresTensorViewArgs
{
public:
/// \brief Constructs an arithmetic reduction operation.
///
/// \param node_type The node type (operation name) of the concrete reduction op.
/// \param arg Node that produces the input tensor to be reduced.
/// \param reduction_axes The axis positions (0-based) to be eliminated.
ArithmeticReduction(const std::string& node_type,
const std::shared_ptr<Node>& arg,
const AxisSet& reduction_axes);
/// \return The axis positions (0-based) to be eliminated through reduction.
const AxisSet& get_reduction_axes() const { return m_reduction_axes; }
protected:
AxisSet m_reduction_axes;
};
}
}
}
......@@ -142,6 +142,8 @@ void runtime::cpu::CPU_CallFrame::setup_runtime_context()
{
ctx->op_durations = new int64_t[m_external_function->get_op_attrs().size()];
}
const auto& mkldnn_emitter = m_external_function->get_mkldnn_emitter();
ctx->mkldnn_primitives = mkldnn_emitter->get_mkldnn_primitives().data();
}
void runtime::cpu::CPU_CallFrame::cleanup_runtime_context()
......
This diff is collapsed.
......@@ -61,8 +61,10 @@
#include "ngraph/ops/less.hpp"
#include "ngraph/ops/less_eq.hpp"
#include "ngraph/ops/log.hpp"
#include "ngraph/ops/max.hpp"
#include "ngraph/ops/max_pool.hpp"
#include "ngraph/ops/maximum.hpp"
#include "ngraph/ops/min.hpp"
#include "ngraph/ops/minimum.hpp"
#include "ngraph/ops/multiply.hpp"
#include "ngraph/ops/negative.hpp"
......@@ -73,6 +75,7 @@
#include "ngraph/ops/pad.hpp"
#include "ngraph/ops/parameter.hpp"
#include "ngraph/ops/power.hpp"
#include "ngraph/ops/product.hpp"
#include "ngraph/ops/reduce.hpp"
#include "ngraph/ops/reduce_window.hpp"
#include "ngraph/ops/remainder.hpp"
......@@ -223,6 +226,9 @@ static const runtime::cpu::OpMap dispatcher{
{TI(ngraph::op::Pad), &runtime::cpu::CPU_Emitter::emit<op::Pad>},
{TI(ngraph::op::BatchNorm), &runtime::cpu::CPU_Emitter::emit<op::BatchNorm>},
{TI(ngraph::op::MaxPoolBackprop), &runtime::cpu::CPU_Emitter::emit<op::MaxPoolBackprop>},
{TI(ngraph::op::Product), &runtime::cpu::CPU_Emitter::emit<op::Product>},
{TI(ngraph::op::Max), &runtime::cpu::CPU_Emitter::emit<op::Max>},
{TI(ngraph::op::Min), &runtime::cpu::CPU_Emitter::emit<op::Min>},
};
runtime::cpu::CPU_ExternalFunction::CPU_ExternalFunction(
......@@ -243,6 +249,8 @@ void runtime::cpu::CPU_ExternalFunction::compile()
string function_name = m_function->get_name();
m_mkldnn_emitter.reset(new MKLDNNEmitter(shared_from_this()));
ngraph::pass::Manager pass_manager;
pass_manager.register_pass<runtime::cpu::pass::CPUFusion>();
......@@ -278,15 +286,19 @@ void runtime::cpu::CPU_ExternalFunction::compile()
#include "ngraph/runtime/cpu/cpu_eigen_utils.hpp"
#include "ngraph/runtime/cpu/cpu_kernels.hpp"
#include "ngraph/runtime/cpu/cpu_runtime_context.hpp"
#include "ngraph/runtime/cpu/mkldnn_invoke.hpp"
#include "ngraph/runtime/kernel/avg_pool.hpp"
#include "ngraph/runtime/kernel/broadcast.hpp"
#include "ngraph/runtime/kernel/concat.hpp"
#include "ngraph/runtime/kernel/convolution.hpp"
#include "ngraph/runtime/kernel/dot.hpp"
#include "ngraph/runtime/kernel/max.hpp"
#include "ngraph/runtime/kernel/max_pool.hpp"
#include "ngraph/runtime/kernel/min.hpp"
#include "ngraph/runtime/kernel/not.hpp"
#include "ngraph/runtime/kernel/one_hot.hpp"
#include "ngraph/runtime/kernel/pad.hpp"
#include "ngraph/runtime/kernel/product.hpp"
#include "ngraph/runtime/kernel/reduce.hpp"
#include "ngraph/runtime/kernel/reduce_window.hpp"
#include "ngraph/runtime/kernel/replace_slice.hpp"
......
......@@ -31,6 +31,7 @@
#include "ngraph/runtime/cpu/cpu_call_frame.hpp"
#include "ngraph/runtime/cpu/cpu_layout_descriptor.hpp"
#include "ngraph/runtime/cpu/cpu_tensor_view_wrapper.hpp"
#include "ngraph/runtime/cpu/mkldnn_emitter.hpp"
#include "ngraph/runtime/external_function.hpp"
namespace ngraph
......@@ -80,6 +81,11 @@ namespace ngraph
const LayoutDescriptorPtrs& get_result_layout_descriptors();
const std::vector<OpAttributes>& get_op_attrs() const { return m_op_attrs; }
const std::unique_ptr<MKLDNNEmitter>& get_mkldnn_emitter() const
{
return m_mkldnn_emitter;
}
protected:
void compile();
......@@ -115,6 +121,8 @@ namespace ngraph
LayoutDescriptorPtrs parameter_layout_descriptors;
LayoutDescriptorPtrs result_layout_descriptors;
std::vector<OpAttributes> m_op_attrs;
std::unique_ptr<MKLDNNEmitter> m_mkldnn_emitter;
};
}
}
......
......@@ -17,6 +17,11 @@
#include <chrono>
#include <cstdint>
namespace mkldnn
{
class primitive;
}
namespace ngraph
{
namespace runtime
......@@ -31,6 +36,7 @@ namespace ngraph
struct CPURuntimeContext
{
int64_t* op_durations;
mkldnn::primitive* const* mkldnn_primitives;
};
}
}
......
......@@ -56,7 +56,16 @@ runtime::cpu::CPUTensorView::CPUTensorView(const ngraph::element::Type& element_
throw ngraph_error("Error allocating CPU Tensor View memory");
}
buffer = static_cast<char*>(ptr);
// GCC major versions below 5 do not implement C++11 std::align
#if !defined(__GNUC__) || __GNUC__ >= 5
std::align(BufferAlignment, buffer_size, ptr, allocation_size);
#else
ptr = static_cast<char*>(ptr) + (BufferAlignment - 1);
ptr = reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(ptr) &
~(uintptr_t(BufferAlignment - 1)));
#endif
aligned_buffer = static_cast<char*>(ptr);
}
}
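// Editor's sketch (not part of this change): the fallback branch above rounds
// ptr up to the next multiple of BufferAlignment by hand. The same bit trick
// as a standalone helper (assumes alignment is a power of two; needs <cstdint>):
static inline void* align_up(void* p, std::uintptr_t alignment)
{
    auto v = reinterpret_cast<std::uintptr_t>(p);
    return reinterpret_cast<void*>((v + alignment - 1) & ~(alignment - 1));
}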
......
......@@ -69,3 +69,9 @@ bool runtime::cpu::TensorViewWrapper::is_output() const
{
return m_tensor_view->get_tensor().is_output();
}
const std::shared_ptr<descriptor::TensorView>
runtime::cpu::TensorViewWrapper::get_tensor_view() const
{
return m_tensor_view;
}
......@@ -45,6 +45,7 @@ public:
const std::string& get_name() const;
const std::string& get_type() const;
bool is_output() const;
const std::shared_ptr<descriptor::TensorView> get_tensor_view() const;
private:
std::shared_ptr<descriptor::TensorView> m_tensor_view;
......
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include <memory>
#include "mkldnn_emitter.hpp"
#include "ngraph/runtime/cpu/cpu_layout_descriptor.hpp"
#include "ngraph/runtime/cpu/cpu_tensor_view_wrapper.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
using namespace ngraph::runtime::cpu;
const std::vector<mkldnn::primitive*>& MKLDNNEmitter::get_mkldnn_primitives() const
{
return mkldnn_primitives;
}
size_t MKLDNNEmitter::insert_primitive(mkldnn::primitive* primitive)
{
mkldnn_primitives.emplace_back(primitive);
return (mkldnn_primitives.size() - 1);
}
const std::vector<size_t>& MKLDNNEmitter::get_primitive_deps(size_t index) const
{
return primitive_deps.at(index);
}
mkldnn::memory::desc MKLDNNEmitter::build_memory_descriptor(const TensorViewWrapper& tvw,
mkldnn::memory::format fmt) const
{
return mkldnn::memory::desc(
mkldnn::memory::dims(tvw.get_shape().begin(), tvw.get_shape().end()),
mkldnn_utils::GetDataType(tvw.get_element_type()),
fmt);
}
mkldnn::memory::desc MKLDNNEmitter::build_memory_descriptor(const TensorViewWrapper& tvw) const
{
auto layout =
std::static_pointer_cast<LayoutDescriptor>(tvw.get_tensor_view()->get_tensor_view_layout());
return build_memory_descriptor(tvw, layout->get_mkldnn_format());
}
mkldnn::memory MKLDNNEmitter::build_memory_primitive(const TensorViewWrapper& tvw) const
{
return mkldnn::memory({build_memory_descriptor(tvw), mkldnn_utils::global_cpu_engine}, nullptr);
}
size_t MKLDNNEmitter::build_memory_primitive(const mkldnn::memory::desc& desc)
{
// The MKL-DNN C++ API forces proper initialization of a memory primitive
// with a non-null pointer (unlike the C API). Primitives are initialized
// at runtime, so a known-invalid address is used here to bypass that check;
// the real data handle is bound later via mkldnn_utils::set_memory_ptr.
return insert_primitive(
new mkldnn::memory({desc, mkldnn_utils::global_cpu_engine}, reinterpret_cast<void*>(0x42)));
}
size_t MKLDNNEmitter::build_convolution_forward(const mkldnn::memory::desc& input_data_desc,
const mkldnn::memory::desc& weights_desc,
const mkldnn::memory::desc& result_desc,
const ngraph::Strides& strides,
const ngraph::CoordinateDiff& padding_below,
const ngraph::CoordinateDiff& padding_above)
{
size_t input_data_index = build_memory_primitive(input_data_desc);
size_t weights_index = build_memory_primitive(weights_desc);
size_t result_index = build_memory_primitive(result_desc);
size_t conv_index = insert_primitive(new mkldnn::convolution_forward(
{{mkldnn::prop_kind::forward,
mkldnn::algorithm::convolution_direct,
input_data_desc,
weights_desc,
result_desc,
mkldnn::memory::dims(strides.begin(), strides.end()),
mkldnn::memory::dims(padding_below.begin(), padding_below.end()),
mkldnn::memory::dims(padding_above.begin(), padding_above.end()),
mkldnn::padding_kind::zero},
mkldnn_utils::global_cpu_engine},
*mkldnn_primitives[input_data_index],
*mkldnn_primitives[weights_index],
*mkldnn_primitives[result_index]));
primitive_deps[conv_index] = {input_data_index, weights_index, result_index};
return conv_index;
}
size_t MKLDNNEmitter::build_convolution_forward(const mkldnn::memory::desc& input_data_desc,
const mkldnn::memory::desc& weights_desc,
const mkldnn::memory::desc& result_desc,
const ngraph::Strides& strides,
const ngraph::Strides& dilation_strides,
const ngraph::CoordinateDiff& padding_below,
const ngraph::CoordinateDiff& padding_above)
{
size_t input_data_index = build_memory_primitive(input_data_desc);
size_t weights_index = build_memory_primitive(weights_desc);
size_t result_index = build_memory_primitive(result_desc);
size_t conv_index = insert_primitive(new mkldnn::convolution_forward(
{{mkldnn::prop_kind::forward,
mkldnn::algorithm::convolution_direct,
input_data_desc,
weights_desc,
result_desc,
mkldnn::memory::dims(strides.begin(), strides.end()),
mkldnn::memory::dims(dilation_strides.begin(), dilation_strides.end()),
mkldnn::memory::dims(padding_below.begin(), padding_below.end()),
mkldnn::memory::dims(padding_above.begin(), padding_above.end()),
mkldnn::padding_kind::zero},
mkldnn_utils::global_cpu_engine},
*mkldnn_primitives[input_data_index],
*mkldnn_primitives[weights_index],
*mkldnn_primitives[result_index]));
primitive_deps[conv_index] = {input_data_index, weights_index, result_index};
return conv_index;
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include <memory>
#include <unordered_map>
#include <vector>
#include <mkldnn.hpp>
#include "ngraph/common.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
class CPU_ExternalFunction;
class TensorViewWrapper;
class MKLDNNEmitter
{
public:
MKLDNNEmitter(std::shared_ptr<CPU_ExternalFunction> ef)
: external_function(ef)
{
}
const std::vector<mkldnn::primitive*>& get_mkldnn_primitives() const;
size_t insert_primitive(mkldnn::primitive* primitive);
const std::vector<size_t>& get_primitive_deps(size_t index) const;
// TODO(jmenon): Get rid of TensorViewWrappers at some point
mkldnn::memory::desc build_memory_descriptor(const TensorViewWrapper& tvw,
mkldnn::memory::format fmt) const;
mkldnn::memory::desc build_memory_descriptor(const TensorViewWrapper& tvw) const;
mkldnn::memory build_memory_primitive(const TensorViewWrapper& tvw) const;
size_t build_memory_primitive(const mkldnn::memory::desc& desc);
size_t build_convolution_forward(const mkldnn::memory::desc& input_data_desc,
const mkldnn::memory::desc& weights_desc,
const mkldnn::memory::desc& result_desc,
const ngraph::Strides& strides,
const ngraph::CoordinateDiff& padding_below,
const ngraph::CoordinateDiff& padding_above);
size_t build_convolution_forward(const mkldnn::memory::desc& input_data_desc,
const mkldnn::memory::desc& weights_desc,
const mkldnn::memory::desc& result_desc,
const ngraph::Strides& strides,
const ngraph::Strides& dilation_strides,
const ngraph::CoordinateDiff& padding_below,
const ngraph::CoordinateDiff& padding_above);
private:
std::shared_ptr<CPU_ExternalFunction> external_function;
std::vector<mkldnn::primitive*> mkldnn_primitives;
std::vector<mkldnn::stream> mkldnn_streams;
std::unordered_map<size_t, std::vector<size_t>> primitive_deps;
};
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include <mkldnn.hpp>
#include "mkldnn_invoke.hpp"
#include "ngraph/runtime/cpu/cpu_runtime_context.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
mkldnn::engine ngraph::runtime::cpu::mkldnn_utils::global_cpu_engine(mkldnn::engine::cpu, 0);
extern "C" void ngraph::runtime::cpu::mkldnn_utils::set_memory_ptr(CPURuntimeContext* ctx,
size_t primitive_index,
void* ptr)
{
auto primitive = static_cast<mkldnn::memory*>(ctx->mkldnn_primitives[primitive_index]);
primitive->set_data_handle(ptr);
}
extern "C" void ngraph::runtime::cpu::mkldnn_utils::mkldnn_invoke_primitive(CPURuntimeContext* ctx,
size_t primitive_index)
{
mkldnn::stream s(mkldnn::stream::kind::eager);
s.submit({*ctx->mkldnn_primitives[primitive_index]}).wait();
}
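// Editor's sketch (not part of this change): the calling sequence the CPU
// emitter is expected to generate against these hooks. For a convolution
// built by MKLDNNEmitter::build_convolution_forward, the recorded deps are
// {input_index, weights_index, result_index}; the placeholder data handles
// from build_memory_primitive are bound first, then the op is executed:
//
//     mkldnn_utils::set_memory_ptr(ctx, input_index, input_ptr);
//     mkldnn_utils::set_memory_ptr(ctx, weights_index, weights_ptr);
//     mkldnn_utils::set_memory_ptr(ctx, result_index, result_ptr);
//     mkldnn_utils::mkldnn_invoke_primitive(ctx, conv_index);
//
// input_ptr, weights_ptr, and result_ptr are hypothetical tensor buffers.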
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include <cstddef>
namespace ngraph
{
namespace runtime
{
namespace cpu
{
struct CPURuntimeContext;
namespace mkldnn_utils
{
extern "C" void
set_memory_ptr(CPURuntimeContext* ctx, size_t primitive_index, void* ptr);
extern "C" void mkldnn_invoke_primitive(CPURuntimeContext* ctx,
size_t primitive_index);
}
}
}
}
......@@ -17,6 +17,7 @@
#include <string>
#include <typeindex>
#include <typeinfo>
#include <unordered_map>
#include <unordered_set>
#include "ngraph/node.hpp"
......@@ -37,14 +38,37 @@ namespace ngraph
{
#define TI(x) std::type_index(typeid(x))
const std::unordered_set<std::type_index> s_op_registry{
static const std::unordered_set<std::type_index> s_op_registry{
TI(ngraph::op::AvgPool),
TI(ngraph::op::AvgPoolBackprop),
TI(ngraph::op::BatchNorm),
TI(ngraph::op::Convolution),
TI(ngraph::op::ConvolutionBackpropData),
TI(ngraph::op::ConvolutionBackpropFilters),
TI(ngraph::op::MaxPool),
TI(ngraph::op::BatchNorm)};
TI(ngraph::op::MaxPoolBackprop)};
static const std::unordered_map<std::string, const mkldnn::memory::data_type>
s_data_type_map{{"char", mkldnn::memory::data_type::s8},
{"float", mkldnn::memory::data_type::f32},
{"double", mkldnn::memory::data_type::data_undef},
{"int8_t", mkldnn::memory::data_type::s8},
{"int16_t", mkldnn::memory::data_type::s16},
{"int32_t", mkldnn::memory::data_type::s32},
{"int64_t", mkldnn::memory::data_type::data_undef},
{"uint8_t", mkldnn::memory::data_type::u8},
{"uint16_t", mkldnn::memory::data_type::data_undef},
{"uint32_t", mkldnn::memory::data_type::data_undef},
{"uint64_t", mkldnn::memory::data_type::data_undef}};
mkldnn::memory::data_type GetDataType(const ngraph::element::Type& et)
{
auto it = s_data_type_map.find(et.c_type_string());
if (it == s_data_type_map.end() ||
it->second == mkldnn::memory::data_type::data_undef)
throw ngraph_error("No MKLDNN data type exists for the given element type");
return it->second;
}
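// Editor's note: illustrative calls (the mappings follow s_data_type_map
// above; this is a sketch, not code from this change):
//   GetDataType(ngraph::element::f32) -> mkldnn::memory::data_type::f32
//   GetDataType(ngraph::element::f64) -> throws ngraph_error, since "double"
//   maps to data_undef in the table.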
bool IsMKLDNNOp(ngraph::Node& op)
{
......
......@@ -16,15 +16,11 @@
#pragma once
#include <string>
#include <typeindex>
#include <typeinfo>
#include <unordered_set>
#include <mkldnn.hpp>
#include "ngraph/node.hpp"
#include "ngraph/runtime/cpu/cpu_layout_descriptor.hpp"
#include "ngraph/types/element_type.hpp"
namespace ngraph
{
......@@ -34,7 +30,12 @@ namespace ngraph
{
namespace mkldnn_utils
{
extern mkldnn::engine global_cpu_engine;
mkldnn::memory::data_type GetDataType(const ngraph::element::Type& et);
bool IsMKLDNNOp(ngraph::Node& op);
mkldnn::memory::format
CreateNativeDataFormat(const ngraph::runtime::cpu::LayoutDescriptor& layout);
}
......
......@@ -29,9 +29,12 @@
#include "ngraph/ops/constant.hpp"
#include "ngraph/ops/convolution.hpp"
#include "ngraph/ops/dot.hpp"
#include "ngraph/ops/max.hpp"
#include "ngraph/ops/max_pool.hpp"
#include "ngraph/ops/min.hpp"
#include "ngraph/ops/one_hot.hpp"
#include "ngraph/ops/pad.hpp"
#include "ngraph/ops/product.hpp"
#include "ngraph/ops/reduce.hpp"
#include "ngraph/ops/reduce_window.hpp"
#include "ngraph/ops/replace_slice.hpp"
......@@ -67,8 +70,10 @@
#include "ngraph/runtime/kernel/less.hpp"
#include "ngraph/runtime/kernel/less_eq.hpp"
#include "ngraph/runtime/kernel/log.hpp"
#include "ngraph/runtime/kernel/max.hpp"
#include "ngraph/runtime/kernel/max_pool.hpp"
#include "ngraph/runtime/kernel/maximum.hpp"
#include "ngraph/runtime/kernel/min.hpp"
#include "ngraph/runtime/kernel/minimum.hpp"
#include "ngraph/runtime/kernel/multiply.hpp"
#include "ngraph/runtime/kernel/negate.hpp"
......@@ -77,6 +82,7 @@
#include "ngraph/runtime/kernel/one_hot.hpp"
#include "ngraph/runtime/kernel/pad.hpp"
#include "ngraph/runtime/kernel/power.hpp"
#include "ngraph/runtime/kernel/product.hpp"
#include "ngraph/runtime/kernel/reduce.hpp"
#include "ngraph/runtime/kernel/reduce_window.hpp"
#include "ngraph/runtime/kernel/replace_slice.hpp"
......@@ -489,6 +495,15 @@ private:
reinterpret_cast<T*>(out[0]->get_data_ptr()),
out[0]->get_element_count());
}
else if (node_op == "Max")
{
const op::Max* max = static_cast<const op::Max*>(&node);
kernel::max<T>(reinterpret_cast<T*>(args[0]->get_data_ptr()),
reinterpret_cast<T*>(out[0]->get_data_ptr()),
args[0]->get_shape(),
out[0]->get_shape(),
max->get_reduction_axes());
}
else if (node_op == "Maximum")
{
kernel::maximum<T>(reinterpret_cast<T*>(args[0]->get_data_ptr()),
......@@ -524,6 +539,15 @@ private:
max_pool_backprop->get_padding_below(),
max_pool_backprop->get_padding_above());
}
else if (node_op == "Min")
{
const op::Min* min = static_cast<const op::Min*>(&node);
kernel::min<T>(reinterpret_cast<T*>(args[0]->get_data_ptr()),
reinterpret_cast<T*>(out[0]->get_data_ptr()),
args[0]->get_shape(),
out[0]->get_shape(),
min->get_reduction_axes());
}
else if (node_op == "Minimum")
{
kernel::minimum<T>(reinterpret_cast<T*>(args[0]->get_data_ptr()),
......@@ -589,6 +613,15 @@ private:
reinterpret_cast<T*>(out[0]->get_data_ptr()),
out[0]->get_element_count());
}
else if (node_op == "Product")
{
const op::Product* product = static_cast<const op::Product*>(&node);
kernel::product<T>(reinterpret_cast<T*>(args[0]->get_data_ptr()),
reinterpret_cast<T*>(out[0]->get_data_ptr()),
args[0]->get_shape(),
out[0]->get_shape(),
product->get_reduction_axes());
}
else if (node_op == "Reduce")
{
ngraph::op::Reduce* reduce = dynamic_cast<ngraph::op::Reduce*>(&node);
......
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include <cmath>
#include <limits>
#include "ngraph/common.hpp"
#include "ngraph/coordinate_transform.hpp"
namespace ngraph
{
namespace runtime
{
namespace kernel
{
template <typename T>
void max(T* arg,
T* out,
const Shape& in_shape,
const Shape& out_shape,
const AxisSet& reduction_axes)
{
// Identity for max: -infinity when T has one, otherwise the smallest
// representable value of T.
T minval = std::numeric_limits<T>::has_infinity
? -std::numeric_limits<T>::infinity()
: std::numeric_limits<T>::min();
CoordinateTransform output_transform(out_shape);
for (const Coordinate& output_coord : output_transform)
{
out[output_transform.index(output_coord)] = minval;
}
CoordinateTransform input_transform(in_shape);
for (const Coordinate& input_coord : input_transform)
{
Coordinate output_coord = project_coordinate(input_coord, reduction_axes);
T x = arg[input_transform.index(input_coord)];
T max = out[output_transform.index(output_coord)];
if (x > max)
{
out[output_transform.index(output_coord)] = x;
}
}
}
}
}
}
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include <cmath>
#include <limits>
#include "ngraph/common.hpp"
#include "ngraph/coordinate_transform.hpp"
namespace ngraph
{
namespace runtime
{
namespace kernel
{
template <typename T>
void min(T* arg,
T* out,
const Shape& in_shape,
const Shape& out_shape,
const AxisSet& reduction_axes)
{
// Identity for min: +infinity when T has one, otherwise the largest
// representable value of T.
T minval = std::numeric_limits<T>::has_infinity ? std::numeric_limits<T>::infinity()
: std::numeric_limits<T>::max();
CoordinateTransform output_transform(out_shape);
for (const Coordinate& output_coord : output_transform)
{
out[output_transform.index(output_coord)] = minval;
}
CoordinateTransform input_transform(in_shape);
for (const Coordinate& input_coord : input_transform)
{
Coordinate output_coord = project_coordinate(input_coord, reduction_axes);
T x = arg[input_transform.index(input_coord)];
T min = out[output_transform.index(output_coord)];
if (x < min)
{
out[output_transform.index(output_coord)] = x;
}
}
}
}
}
}
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include <cmath>
#include "ngraph/common.hpp"
#include "ngraph/coordinate_transform.hpp"
namespace ngraph
{
namespace runtime
{
namespace kernel
{
template <typename T>
void product(T* arg,
T* out,
const Shape& in_shape,
const Shape& out_shape,
const AxisSet& reduction_axes)
{
CoordinateTransform output_transform(out_shape);
for (const Coordinate& output_coord : output_transform)
{
out[output_transform.index(output_coord)] = 1;
}
CoordinateTransform input_transform(in_shape);
for (const Coordinate& input_coord : input_transform)
{
Coordinate output_coord = project_coordinate(input_coord, reduction_axes);
out[output_transform.index(output_coord)] *=
arg[input_transform.index(input_coord)];
}
}
}
}
}
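// Editor's note: all three kernels above share one pattern. Each output
// element is first set to the reduction identity (max: -infinity or the
// type's minimum, min: +infinity or the type's maximum, product: 1), then
// every input element is folded into the output coordinate obtained by
// deleting the reduction axes. Illustration of project_coordinate
// (not code from this change):
//   in_shape = {2, 3}, reduction_axes = {1}
//   input coords (0,0), (0,1), (0,2) all project to output coord (0)
//   input coords (1,0), (1,1), (1,2) all project to output coord (1)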
......@@ -43,8 +43,10 @@
#include "ngraph/ops/less.hpp"
#include "ngraph/ops/less_eq.hpp"
#include "ngraph/ops/log.hpp"
#include "ngraph/ops/max.hpp"
#include "ngraph/ops/max_pool.hpp"
#include "ngraph/ops/maximum.hpp"
#include "ngraph/ops/min.hpp"
#include "ngraph/ops/minimum.hpp"
#include "ngraph/ops/multiply.hpp"
#include "ngraph/ops/negative.hpp"
......@@ -53,6 +55,7 @@
#include "ngraph/ops/one_hot.hpp"
#include "ngraph/ops/pad.hpp"
#include "ngraph/ops/power.hpp"
#include "ngraph/ops/product.hpp"
#include "ngraph/ops/reduce.hpp"
#include "ngraph/ops/reduce_window.hpp"
#include "ngraph/ops/remainder.hpp"
......@@ -552,6 +555,11 @@ static shared_ptr<ngraph::Function>
{
node = make_shared<op::Log>(args[0]);
}
else if (node_op == "Max")
{
auto reduction_axes = node_js.at("reduction_axes").get<set<size_t>>();
node = make_shared<op::Max>(args[0], reduction_axes);
}
else if (node_op == "MaxPool")
{
auto window_shape = node_js.at("window_shape").get<vector<size_t>>();
......@@ -601,6 +609,11 @@ static shared_ptr<ngraph::Function>
{
node = make_shared<op::Maximum>(args[0], args[1]);
}
else if (node_op == "Min")
{
auto reduction_axes = node_js.at("reduction_axes").get<set<size_t>>();
node = make_shared<op::Min>(args[0], reduction_axes);
}
else if (node_op == "Minimum")
{
node = make_shared<op::Minimum>(args[0], args[1]);
......@@ -647,6 +660,11 @@ static shared_ptr<ngraph::Function>
{
node = make_shared<op::Power>(args[0], args[1]);
}
else if (node_op == "Product")
{
auto reduction_axes = node_js.at("reduction_axes").get<set<size_t>>();
node = make_shared<op::Product>(args[0], reduction_axes);
}
else if (node_op == "Reduce")
{
auto reduction_axes = node_js.at("reduction_axes").get<set<size_t>>();
......@@ -951,6 +969,11 @@ static json write(const Node& n)
else if (node_op == "Log")
{
}
else if (node_op == "Max")
{
auto tmp = dynamic_cast<const op::Max*>(&n);
node["reduction_axes"] = tmp->get_reduction_axes();
}
else if (node_op == "MaxPool")
{
auto tmp = dynamic_cast<const op::MaxPool*>(&n);
......@@ -970,6 +993,11 @@ static json write(const Node& n)
else if (node_op == "Maximum")
{
}
else if (node_op == "Min")
{
auto tmp = dynamic_cast<const op::Min*>(&n);
node["reduction_axes"] = tmp->get_reduction_axes();
}
else if (node_op == "Minimum")
{
}
......@@ -1004,6 +1032,11 @@ static json write(const Node& n)
node["shape"] = tmp->get_shape();
node["element_type"] = write_element_type(tmp->get_element_type());
}
else if (node_op == "Product")
{
auto tmp = dynamic_cast<const op::Product*>(&n);
node["reduction_axes"] = tmp->get_reduction_axes();
}
else if (node_op == "Power")
{
}
......
# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************
if(MKLDNN_INCLUDE_DIR)
link_directories(${MKLDNN_LIB_DIR})
endif()
if (NGRAPH_CPU_ENABLE)
set (SRC
nbench.cpp
${PROJECT_SOURCE_DIR}/test/util/benchmark.cpp
)
add_executable(nbench ${SRC})
add_dependencies(nbench ngraph)
set(HEADER_SEARCH_DEFINES
"NGRAPH_HEADERS_PATH=\"${NGRAPH_INCLUDE_PATH}\""
)
target_link_libraries(nbench ngraph)
set_source_files_properties(nbench.cpp PROPERTIES COMPILE_DEFINITIONS "${HEADER_SEARCH_DEFINES}")
endif()
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
// Tool to benchmark any ngraph JSON model with a given backend.
// compile and run with:
// g++ ./nbench.cpp -std=c++11 -I$HOME/ngraph_dist/include -L$HOME/ngraph_dist/lib -lngraph -o nbench
// env LD_LIBRARY_PATH=$HOME/ngraph_dist/lib env NGRAPH_INTERPRETER_EMIT_TIMING=1 ./nbench
// sample models are under ../../test/models
#include <fstream>
#include <ngraph/runtime/backend.hpp>
#include <ngraph/runtime/call_frame.hpp>
#include <ngraph/runtime/manager.hpp>
#include "../../test/util/benchmark.hpp"
#include "../../test/util/test_tools.hpp"
using namespace std;
int main(int argc, char** argv)
{
string model = "model.json";
string backend = "INTERPRETER";
int iter = 10;
bool failed = false;
for (int i = 1; i < argc; i++)
{
if (string(argv[i]) == "-f")
{
model = argv[++i];
}
else if (string(argv[i]) == "-b")
{
backend = argv[++i];
}
else if (string(argv[i]) == "-i")
{
try
{
iter = stoi(argv[++i]);
}
catch (...)
{
cout << "Invalid Argument\n";
failed = true;
}
}
}
if (!static_cast<bool>(ifstream(model)))
{
cout << "File " << model << " not found\n";
failed = true;
}
if (failed)
{
cout << R"###(
DESCRIPTION
Benchmark an ngraph JSON model with the given backend.
SYNOPSIS
nbench [-f <filename>] [-b <backend>] [-i <iterations>]
OPTIONS
-f Model JSON file to use (default: model.json)
-b Backend to use (default: INTERPRETER)
-i Iterations (default: 10)
)###";
return 1;
}
cout << "Benchmarking " << model << ", " << backend << " backend, " << iter << " iterations.\n";
run_benchmark(model, backend, iter);
}
......@@ -50,6 +50,7 @@ set (SRC
type_prop.cpp
util/autodiff/backprop_function.cpp
util/test_tools.cpp
util/benchmark.cpp
util.cpp
uuid.cpp
)
......
......@@ -1455,3 +1455,88 @@ TEST(${BACKEND_NAME}, backwards_reverse_3d_02)
};
EXPECT_TRUE(autodiff_numeric_compare<float>(manager, backend, make_graph, {x}, .01f, .01f));
}
TEST(${BACKEND_NAME}, backwards_maxpool_n4c1h4w4_kh2kw2_sh1sw1)
{
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
Shape shape_a{4, 1, 4, 4}; // in NCHW
Shape maxpool_shape{4, 1, 3, 3};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape window_shape{2, 2};
auto window_movement_strides = Strides{1, 1};
auto maxpool = make_shared<op::MaxPool>(A, window_shape, window_movement_strides);
auto f = make_shared<Function>(maxpool, op::Parameters{A});
shared_ptr<runtime::TensorView> ep =
backend->make_primary_tensor_view(element::f32, maxpool_shape);
vector<float> dataEp(shape_size(maxpool_shape), 4);
shared_ptr<runtime::TensorView> input =
backend->make_primary_tensor_view(element::f32, shape_a);
shared_ptr<runtime::TensorView> output =
backend->make_primary_tensor_view(element::f32, shape_a);
vector<float> dataInput{11, 65, 44, 28, 31, 33, 21, 66, 40, 49, 69, 57, 47, 30, 24, 27,
13, 56, 46, 60, 61, 41, 25, 42, 48, 53, 51, 43, 59, 58, 29, 71,
17, 22, 72, 18, 39, 35, 15, 38, 64, 52, 73, 67, 62, 50, 10, 68,
45, 63, 16, 14, 55, 54, 37, 20, 36, 12, 70, 34, 19, 26, 32, 23};
vector<float> expected{//delta
0, 8, 0, 0, 0, 0, 0, 4, 0, 8, 16, 0, 0, 0, 0, 0, 0, 4, 0, 4, 8, 0,
0, 0, 0, 4, 4, 0, 4, 4, 0, 4, 0, 0, 8, 0, 4, 0, 0, 0, 8, 0, 16, 0,
0, 0, 0, 0, 0, 8, 0, 0, 4, 0, 4, 0, 4, 0, 16, 0, 0, 0, 0, 0};
copy_data(ep, dataEp);
copy_data(input, dataInput);
auto C = make_shared<op::Parameter>(element::f32, maxpool_shape);
auto df = autodiff::backprop_function(f);
auto external = manager->compile(df);
auto cf = backend->make_call_frame(external);
cf->tensor_call({input, ep}, {output});
ASSERT_TRUE(read_vector<float>(output) == expected);
}
TEST(${BACKEND_NAME}, backwards_maxpool_n2c1h5w5_kh3kw3_sh2sw2)
{
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
Shape shape_a{1, 2, 5, 5}; // in NCHW
Shape maxpool_shape{1, 2, 2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape window_shape{3, 3};
auto window_movement_strides = Strides{2, 2};
auto maxpool = make_shared<op::MaxPool>(A, window_shape, window_movement_strides);
auto f = make_shared<Function>(maxpool, op::Parameters{A});
shared_ptr<runtime::TensorView> ep =
backend->make_primary_tensor_view(element::f32, maxpool_shape);
vector<float> dataEp(shape_size(maxpool_shape), 4);
shared_ptr<runtime::TensorView> input =
backend->make_primary_tensor_view(element::f32, shape_a);
shared_ptr<runtime::TensorView> output =
backend->make_primary_tensor_view(element::f32, shape_a);
vector<float> dataInput{58, 15, 51, 35, 18, 47, 31, 32, 52, 21, 36, 38, 57, 54, 25, 45, 23,
30, 16, 27, 48, 20, 41, 37, 43, 39, 22, 28, 33, 29, 12, 17, 44, 42,
19, 40, 10, 46, 34, 53, 26, 55, 50, 13, 24, 14, 49, 56, 59, 11};
vector<float> expected{//delta
4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 0, 0, 0, 4, 4, 0};
copy_data(ep, dataEp);
copy_data(input, dataInput);
auto C = make_shared<op::Parameter>(element::f32, maxpool_shape);
auto df = autodiff::backprop_function(f);
auto external = manager->compile(df);
auto cf = backend->make_call_frame(external);
cf->tensor_call({input, ep}, {output});
ASSERT_TRUE(read_vector<float>(output) == expected);
}
\ No newline at end of file
......@@ -31,92 +31,13 @@
#include "ngraph/runtime/manager.hpp"
#include "ngraph/serializer.hpp"
#include "ngraph/util.hpp"
#include "util/benchmark.hpp"
#include "util/random.hpp"
#include "util/test_tools.hpp"
using namespace std;
using namespace ngraph;
static multimap<size_t, string>
aggregate_timing(const vector<runtime::PerformanceCounter>& perf_data)
{
unordered_map<string, size_t> timing;
for (const runtime::PerformanceCounter& p : perf_data)
{
string op = p.name().substr(0, p.name().find('_'));
timing[op] += p.microseconds();
}
multimap<size_t, string> rc;
for (const pair<string, size_t>& t : timing)
{
rc.insert({t.second, t.first});
}
return rc;
}
void run_benchmark(const string& json_path, const string& backend_name, size_t iterations)
{
string env_var_name = "NGRAPH_" + backend_name + "_EMIT_TIMING";
bool emit_timing = (std::getenv(env_var_name.c_str()) != nullptr);
if (!emit_timing)
{
cout << "To get per-op timing set the environment variable " << env_var_name << "\n";
}
test::Uniform<float> rng{-1, 1, 0};
const string json_string = file_util::read_file_to_string(json_path);
stringstream ss(json_string);
shared_ptr<Function> f = ngraph::deserialize(ss);
stopwatch build_time;
build_time.start();
auto manager = runtime::Manager::get(backend_name);
auto external = manager->compile(f);
auto backend = manager->allocate_backend();
auto cf = backend->make_call_frame(external);
build_time.stop();
cout << "build_time " << build_time.get_milliseconds() << "ms" << endl;
vector<shared_ptr<runtime::TensorView>> args;
for (shared_ptr<op::Parameter> param : f->get_parameters())
{
auto tensor =
backend->make_primary_tensor_view(param->get_element_type(), param->get_shape());
rng.initialize(tensor);
args.push_back(tensor);
}
vector<shared_ptr<runtime::TensorView>> results;
for (shared_ptr<Node> out : f->get_results())
{
auto result = backend->make_primary_tensor_view(out->get_element_type(), out->get_shape());
results.push_back(result);
}
stopwatch t1;
t1.start();
for (size_t i = 0; i < static_cast<size_t>(iterations); i++)
{
cf->tensor_call(args, results);
}
t1.stop();
float time = t1.get_milliseconds();
cout << time / iterations << "ms per iteration" << endl;
vector<runtime::PerformanceCounter> perf_data = cf->get_performance_data();
sort(perf_data.begin(),
perf_data.end(),
[](const runtime::PerformanceCounter& p1, const runtime::PerformanceCounter& p2) {
return p1.total_microseconds() > p2.total_microseconds();
});
multimap<size_t, string> timing = aggregate_timing(perf_data);
for (auto it = timing.rbegin(); it != timing.rend(); it++)
{
cout.imbue(locale(""));
cout << setw(15) << left << it->second << " " << setw(10) << right << it->first << "us\n";
}
}
TEST(benchmark, mxnet_mnist_mlp_forward)
{
const string json_path = file_util::path_join(SERIALIZED_ZOO, "mxnet/mnist_mlp_forward.json");
......
This diff is collapsed.
......@@ -546,4 +546,4 @@ TEST(pattern, variance)
auto var_graph = construct_variance_graph();
ASSERT_TRUE(n.match(var_graph, variance));
ASSERT_EQ(n.get_pattern_map()[var_graph], variance);
}
\ No newline at end of file
}
......@@ -631,7 +631,7 @@ TEST(type_prop, reduce_nonscalar)
{
auto r0 = make_shared<op::Reduce>(param_0, param_1, f, AxisSet{0});
// Should have thrown, so fail if it didn't
FAIL() << "Did not detect incorrect element types for arithmetic operator";
FAIL() << "Did not detect non-scalar initial value for reduce";
}
catch (const ngraph_error& error)
{
......@@ -656,7 +656,7 @@ TEST(type_prop, reduce_elem_type_mismatch)
{
auto r0 = make_shared<op::Reduce>(param_0, param_1, f, AxisSet{0});
// Should have thrown, so fail if it didn't
FAIL() << "Did not detect incorrect element types for arithmetic operator";
FAIL() << "Did not detect element type mismatch for reduce";
}
catch (const ngraph_error& error)
{
......@@ -816,7 +816,7 @@ TEST(type_prop, reduce_axis_oob)
{
auto r = make_shared<op::Reduce>(param_0, param_1, f, AxisSet{0, 2, 1});
// Should have thrown, so fail if it didn't
FAIL() << "Did not detect incorrect element types for arithmetic operator";
FAIL() << "Did not detect out-of-bound axis for reduce";
}
catch (const ngraph_error& error)
{
......@@ -6048,3 +6048,45 @@ TEST(type_prop, pad_deduce_interior_padding_wrong_rank)
FAIL() << "Deduced type check failed for unexpected reason";
}
}
TEST(type_prop, sum_deduce)
{
auto param_0 = make_shared<op::Parameter>(element::f32, Shape{2, 4});
auto r0 = make_shared<op::Sum>(param_0, AxisSet{0});
ASSERT_EQ(r0->get_element_type(), element::f32);
ASSERT_EQ(r0->get_shape(), (Shape{4}));
auto r1 = make_shared<op::Sum>(param_0, AxisSet{1});
ASSERT_EQ(r1->get_element_type(), element::f32);
ASSERT_EQ(r1->get_shape(), (Shape{2}));
auto r01 = make_shared<op::Sum>(param_0, AxisSet{0, 1});
ASSERT_EQ(r01->get_element_type(), element::f32);
ASSERT_EQ(r01->get_shape(), (Shape{}));
auto r_none = make_shared<op::Sum>(param_0, AxisSet{});
ASSERT_EQ(r_none->get_element_type(), element::f32);
ASSERT_EQ(r_none->get_shape(), (Shape{2, 4}));
}
TEST(type_prop, sum_axis_oob)
{
auto param_0 = make_shared<op::Parameter>(element::f32, Shape{2, 4});
try
{
auto r = make_shared<op::Sum>(param_0, AxisSet{0, 2, 1});
// Should have thrown, so fail if it didn't
FAIL() << "Did not detect out-of-bound axis for sum";
}
catch (const ngraph_error& error)
{
EXPECT_EQ(error.what(),
std::string("Reduction axis for arithmetic reduction operator is out of bounds"));
}
catch (...)
{
FAIL() << "Deduced type check failed for unexpected reason";
}
}
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "benchmark.hpp"
#include "ngraph/runtime/backend.hpp"
#include "ngraph/runtime/call_frame.hpp"
#include "ngraph/runtime/manager.hpp"
#include "ngraph/runtime/tensor_view.hpp"
#include "ngraph/serializer.hpp"
#include "random.hpp"
std::multimap<size_t, std::string>
aggregate_timing(const std::vector<ngraph::runtime::PerformanceCounter>& perf_data)
{
std::unordered_map<std::string, size_t> timing;
for (const ngraph::runtime::PerformanceCounter& p : perf_data)
{
std::string op = p.name().substr(0, p.name().find('_'));
timing[op] += p.microseconds();
}
std::multimap<size_t, std::string> rc;
for (const std::pair<std::string, size_t>& t : timing)
{
rc.insert({t.second, t.first});
}
return rc;
}
void run_benchmark(const std::string& json_path, const std::string& backend_name, size_t iterations)
{
using namespace std;
using namespace ngraph;
string env_var_name = "NGRAPH_" + backend_name + "_EMIT_TIMING";
bool emit_timing = (std::getenv(env_var_name.c_str()) != nullptr);
if (!emit_timing)
{
cout << "To get per-op timing set the environment variable " << env_var_name << "\n";
}
ngraph::test::Uniform<float> rng{-1, 1, 0};
const string json_string = file_util::read_file_to_string(json_path);
stringstream ss(json_string);
shared_ptr<Function> f = deserialize(ss);
stopwatch build_time;
build_time.start();
auto manager = runtime::Manager::get(backend_name);
auto external = manager->compile(f);
auto backend = manager->allocate_backend();
auto cf = backend->make_call_frame(external);
build_time.stop();
cout << "build_time " << build_time.get_milliseconds() << "ms" << endl;
vector<shared_ptr<runtime::TensorView>> args;
for (shared_ptr<op::Parameter> param : f->get_parameters())
{
auto tensor =
backend->make_primary_tensor_view(param->get_element_type(), param->get_shape());
rng.initialize(tensor);
args.push_back(tensor);
}
vector<shared_ptr<runtime::TensorView>> results;
for (shared_ptr<Node> out : f->get_results())
{
auto result = backend->make_primary_tensor_view(out->get_element_type(), out->get_shape());
results.push_back(result);
}
stopwatch t1;
t1.start();
for (size_t i = 0; i < static_cast<size_t>(iterations); i++)
{
cf->tensor_call(args, results);
}
t1.stop();
float time = t1.get_milliseconds();
cout << time / iterations << "ms per iteration" << endl;
vector<runtime::PerformanceCounter> perf_data = cf->get_performance_data();
sort(perf_data.begin(),
perf_data.end(),
[](const runtime::PerformanceCounter& p1, const runtime::PerformanceCounter& p2) {
return p1.total_microseconds() > p2.total_microseconds();
});
multimap<size_t, string> timing = aggregate_timing(perf_data);
for (auto it = timing.rbegin(); it != timing.rend(); it++)
{
cout.imbue(locale(""));
cout << setw(15) << left << it->second << " " << setw(10) << right << it->first << "us\n";
}
}
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include <ngraph/runtime/call_frame.hpp>
#include "test_tools.hpp"
/// performance test utilities
std::multimap<size_t, std::string>
aggregate_timing(const std::vector<ngraph::runtime::PerformanceCounter>& perf_data);
void run_benchmark(const std::string& json_path,
const std::string& backend_name,
size_t iterations);
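// Editor's note: typical invocation, matching how nbench drives this utility
// above (illustration only):
//
//     run_benchmark("model.json", "INTERPRETER", 10);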
......@@ -21,7 +21,9 @@
#include <memory>
#include "ngraph/descriptor/layout/tensor_view_layout.hpp"
#include "ngraph/file_util.hpp"
#include "ngraph/runtime/tensor_view.hpp"
#include "ngraph/serializer.hpp"
#define SKIP_TEST_FOR(backend_to_skip, current_backend) \
if (backend_to_skip == current_backend) \
......@@ -80,3 +82,4 @@ size_t count_ops_of_type(std::shared_ptr<ngraph::Function> f)
return count;
}