Merge branch 'master' into mkldnn-compile

c36b1b10 · Jaikrishnan Menon · 1bb4f3be · 62342c4e · c36b1b10 · c36b1b10
Commit c36b1b10 authored Feb 14, 2018 by Jaikrishnan Menon
31 changed files
--- a/doc/sphinx/source/ops/broadcast.rst
+++ b/doc/sphinx/source/ops/broadcast.rst
+.. broadcast.rst:
+#########
+Broadcast
+#########
+Description
+===========
+Operation whose ``output`` tensor ignores axes not in the ``arg``
+tensor.
+Inputs
+------
+-----------------+-------------------------+--------------------------------+
+| Name            | Element Type            | Shape                          |
+=================+=========================+================================+
+| ``arg``         | Any                     | Any                            |
+-----------------+-------------------------+--------------------------------+
+Attributes
+----------
+---------------------+---------------+------------------------------------+
+| Name                | Type          | Notes                              |
+=====================+===============+====================================+
+| ``shape``           | ``Shape``     | The shape of the output.           |
+---------------------+---------------+------------------------------------+
+| ``broadcast_axes``  | ``AxisSet``   | Axis positions in ``shape`` that   |
+|                     |               | are broadcast.                     |
+---------------------+---------------+------------------------------------+
+Outputs
+-------
+-----------------+-------------------------+--------------------------------+
+| Name            | Element Type            | Shape                          |
+=================+=========================+================================+
+| ``output``      | Same as ``arg``         | Same as ``shape``.             |
+-----------------+-------------------------+--------------------------------+
+The shape of ``arg`` must match ``shape`` with elements in ``broadcast_axes`` removed.
+For example, if ``arg`` is :math:`[a, b, c]` then
+.. math::
+   \texttt{Broadcast(arg, Shape{2, 3}, AxisSet{0})} &=
+   \begin{bmatrix}
+   a & b & c\\
+   a & b & c
+   \end{bmatrix}\\
+   \texttt{Broadcast(arg, Shape{3, 2}, AxisSet{1})} &=
+   \begin{bmatrix}
+   a & a\\
+   b & b\\
+   c & c
+   \end{bmatrix}
+Mathematical Definition
+=======================
+For a coordinate :math:`C`, let :math:`p(C)` be a coordinate with the
+axes in ``broadcast_axes`` removed.  For example, if
+:math:`\texttt{broadcast_axes}=\{1,3\}` then :math:`p([d_0, d_1,
+d_2, d_3, d_4]) = [d_0, d_2, d_4]`.  Then
+.. math::
+   \texttt{output}_C = \texttt{arg}_{p(C)}.
+Backprop
+========
+.. math::
+   \overline{\texttt{arg}} \leftarrow \texttt{Sum}(\Delta, \texttt{broadcast_axes}).
+C++ Interface
+=============
+.. doxygenclass:: ngraph::op::Broadcast
+   :members:
--- a/doc/sphinx/source/ops/ceiling.rst
+++ b/doc/sphinx/source/ops/ceiling.rst
+.. ceiling.rst:
+#######
+Ceiling
+#######
+Description
+===========
+Elementwise ceiling operation.
+Produces a single output tensor of the same element type and shape as ``arg``,
+where the value at each coordinate of ``output`` is the ceiling of the
+value at each ``arg`` coordinate.
+Inputs
+------
+-----------------+-------------------------+--------------------------------+
+| Name            | Element Type            | Shape                          |
+=================+=========================+================================+
+| ``arg``         | Any                     | Any                            |
+-----------------+-------------------------+--------------------------------+
+Outputs
+-------
+-----------------+-------------------------+--------------------------------+
+| Name            | Element Type            | Shape                          |
+=================+=========================+================================+
+| ``output``      | Same as ``arg``         | Same as ``arg``.               |
+-----------------+-------------------------+--------------------------------+
+Mathematical Definition
+=======================
+.. math::
+   \mathtt{output}_{i_0, \ldots, i_{n-1}} = \lceil \mathtt{arg}_{i_0,
+   \ldots, i_{n-1}}\rceil
+Backprop
+========
+Not defined by nGraph.
+The backprop would be zero for non-integer
+input and undefined for integer input; a zero backprop would have
+no effect on the backprop to ``arg``, so there is no need for ``Ceiling``
+to define a backprop.
+C++ Interface
+=============
+.. doxygenclass:: ngraph::op::Ceiling
+   :members:
--- a/doc/sphinx/source/ops/concatenate.rst
+++ b/doc/sphinx/source/ops/concatenate.rst
+.. concatenate.rst:
+###########
+Concatenate
+###########
+Description
+===========
+Produces a single output tensor of the same element type and shape as ``arg``,
+where the value at each coordinate of ``output`` is the absoloute value of the
+value at each ``arg`` coordinate.
+Inputs
+------
+-----------------+-----------------+------------------------------------------------------+
+| Name            | Type            | Notes                                                |
+=================+=================+======================================================+
+| ``args``        | ``Nodes``       | All element types the same.                          |
+|                 |                 | All shapes the same except on ``concatenation_axis`` |
+-----------------+-----------------+------------------------------------------------------+
+Attributes
+----------
+-------------------------+----------------------------------+
+| Name                    | Notes                            |
+=========================+==================================+
+| ``concatenation_axis``  | Less than the rank of the shape. |
+-------------------------+----------------------------------+
+Outputs
+-------
+-----------------+-------------------------+---------------------------------------------------+
+| Name            | Element Type            | Shape                                             |
+=================+=========================+===================================================+
+| ``output``      | Same as ``args`         | Same as ``arg`` on non-``concatenation_axis``     |
+|                 |                         | Sum of ``concatenation_axis`` lengths of ``args`` |
+-----------------+-------------------------+---------------------------------------------------+
+Mathematical Definition
+=======================
+We map each tensor in ``args`` to a segment of ``output`` based on the
+coordinate at ``coordinate_axis``.
+Let
+.. math::
+   s(i) &= \sum_{j<i} \texttt{args}[i].\texttt{shape}\left[\texttt{concatenation_axis}\right]\\
+   t(i) &= \text{The greatest }j\text{ such that }i \ge s(j)\\
+   p(C)_i &= \begin{cases}
+   C_i-s(t(i))&\text{if }i==\texttt{concatenation_axis}\\
+   C_i&\text{otherwise}
+   \end{cases}\\
+   \texttt{output}_C&=\texttt{args}[t(C_i)]_{p(C)}
+Backprop
+========
+We slice the backprop value into the backprops associated with the inputs.
+C++ Interface
+=============
+.. doxygenclass:: ngraph::op::Concatenate
+   :members:
--- a/doc/sphinx/source/ops/index.rst
+++ b/doc/sphinx/source/ops/index.rst
@@ -55,4 +55,7 @@ Not currently a comprehensive list.
   atan.rst
   avg_pool.rst
   avg_pool_backprop.rst
+   broadcast.rst
+   ceiling.rst
+   concatenate.rst
   convolution.rst
--- a/src/ngraph/CMakeLists.txt
+++ b/src/ngraph/CMakeLists.txt
@@ -169,11 +169,16 @@ if (NGRAPH_CPU_ENABLE AND LLVM_INCLUDE_DIR AND
        runtime/cpu/cpu_kernel_utils.cpp
        runtime/cpu/cpu_emitter.cpp
        runtime/cpu/cpu_external_function.cpp
+        runtime/cpu/cpu_tensor_view.cpp
        runtime/cpu/cpu_tensor_view_wrapper.cpp
+        runtime/cpu/cpu_layout_descriptor.cpp
        runtime/cpu/cpu_tracing.cpp
        runtime/cpu/mkldnn_invoke.cpp
+        runtime/cpu/mkldnn_utils.cpp
+        runtime/cpu/ops/convert_layout.cpp
        runtime/cpu/ops/matmul_bias.cpp
        runtime/cpu/pass/cpu_fusion.cpp
+        runtime/cpu/pass/cpu_layout.cpp
    )
    # LLVM binary builds are typically built without RTTI
    # The built-in headers are in a version-specific directory

--- a/src/ngraph/ops/broadcast.hpp
+++ b/src/ngraph/ops/broadcast.hpp
@@ -23,34 +23,6 @@ namespace ngraph
    namespace op
    {
        /// \brief Operation which "adds" axes to an input tensor, replicating elements from the input as needed along the new axes.
-        ///
-        /// Informally, a broadcast "adds" axes to the input tensor, replicating elements from the input tensor as needed to fill the new dimensions.
-        /// The parameter `m_broadcast_axes` indicates which of the output axes is being so added. For example, an output shape of `{2,5,6,2,8}` and
-        /// broadcast axes of `{1,3,4}` means that the input must have shape `{2,6}`.
-        ///
-        /// Formally, given a shape or coordinate \f$S = [d_1,\dots,d_n]\f$ and a set of axis indices \f$A\f$, define \f$\textit{del}(S,A)\f$ to be
-        /// the shape or coordinate obtained by deleting the \f$(a + 1)\f$th dimension from \f$S\f$ for each \f$a \in A\f$. Then given an input
-        /// tensor \f$T\f$ of shape \f$\textit{del}(S,A)\f$ with element type \f$E\f$, broadcasting axes \f$A\f$ produces a tensor \f$T'\f$ of shape
-        /// \f$S\f$ with element type \f$E\f$, where \f$T'[i_1,\dots,i_n] = T[del([i_1,\dots,i_n],A)]\f$.
-        ///
-        /// ## Parameters
-        ///
-        /// |                  | Description                                                              |
-        /// | ---------------- | ------------------------------------------------------------------------ |
-        /// | `shape`          | The shape \f$[d_1,\dots,d_n]\f$ of the broadcasted output.               |
-        /// | `broadcast_axes` | The indices \f$A\f$ in the `shape` of each broadcasted (i.e., new) axis. |
-        ///
-        /// ## Inputs
-        ///
-        /// |       | Type                                                | Description                             |
-        /// | ----- | --------------------------------------------------- | --------------------------------------- |
-        /// | `arg` | \f$E[\mathit{del}([d_1,\dots,d_n],A)]~(n \geq 0)\f$ | A tensor of any shape and element type. |
-        ///
-        /// ## Output
-        ///
-        /// | Type                   | Description                                                                     |
-        /// | ---------------------- | ------------------------------------------------------------------------------- |
-        /// | \f$E[d_1,\dots,d_n]\f$ | The tensor \f$T'\f$, where \f$T'[i_1,\dots,i_n] = T[del([i_1,\dots,i_n],A)]\f$. |
        class Broadcast : public RequiresTensorViewArgs
        {
        public:

--- a/src/ngraph/ops/ceiling.hpp
+++ b/src/ngraph/ops/ceiling.hpp
@@ -23,18 +23,6 @@ namespace ngraph
    namespace op
    {
        /// \brief Elementwise ceiling operation.
-        ///
-        /// ## Inputs
-        ///
-        /// |       | Type                              | Description                                     |
-        /// | ----- | --------------------------------- | ----------------------------------------------- |
-        /// | `arg` | \f$N[d_1,\dots,d_n]~(n \geq 0)\f$ | A tensor of any shape and numeric element type. |
-        ///
-        /// ## Output
-        ///
-        /// | Type                   | Description                                                                                  |
-        /// | ---------------------- | -------------------------------------------------------------------------------------------- |
-        /// | \f$N[d_1,\dots,d_n]\f$ | The tensor \f$T\f$, where \f$T[i_1,\dots,i_n] = \lceil \texttt{arg}[i_1,\dots,i_n] \rceil\f$ |
        class Ceiling : public UnaryElementwiseArithmetic
        {
        public:

--- a/src/ngraph/ops/concatenate.hpp
+++ b/src/ngraph/ops/concatenate.hpp
@@ -25,40 +25,6 @@ namespace ngraph
    namespace op
    {
        /// \brief Concatenation operation.
-        ///
-        /// Given an axis index \f$a\f$ and a rank \f$r \geq 1\f$ where \f$0 \leq a \lt r\f$, and one or more \f$r\f$-tensors
-        /// with the same element type whose shapes are the same except possibly at axis \f$a\f$, the tensors are
-        /// concatenated along axis \f$a\f$.
-        ///
-        /// For example:
-        ///   1. Concatenating matrices on axis 0 (the row axis) stacks the matrices from top to bottom.
-        ///      The number of rows in the resulting matrix is the sum of the number of rows for each
-        ///      input matrix.
-        ///   2. Concatenating matrices on axis 1 (the column axis) concatenates them from left to right.
-        ///      The number of columns in the resulting matrix is the sum of the number of columns for each
-        ///      input matrix.
-        ///   3. Concatenating 3-tensors on axis 2 (the depth axis) stacks them from front to back.
-        ///      The depth of the resulting tensor is the sum of the total depth for each input tensor.
-        ///
-        /// The resulting tensor will have the same rank as the input tensors.
-        ///
-        /// ## Parameters
-        ///
-        /// |                      | Description                                                    |
-        /// | -------------------- | -------------------------------------------------------------- |
-        /// | `concatenation_axis` | The axis \f$a\f$ along which to concatenate the input tensors. |
-        ///
-        /// ## Inputs
-        ///
-        /// |                 | Type                                                          | Description                                                                                                              |
-        /// | --------------- | ------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------ |
-        /// | `args`[\f$i\f$] | \f$E[d_1,\dots,d_{a-1},d^i_a,d_{a+1},\dots,d_n]~(n \geq 1)\f$ | One or more input tensors, all of which have the same element type, and the same shape, except possibly at axis \f$a\f$. |
-        ///
-        /// ## Output
-        ///
-        /// | Type                                                         | Description                                                                                     |
-        /// | ------------------------------------------------------------ | ----------------------------------------------------------------------------------------------- |
-        /// | \f$E[d_1,\dots,d_{a-1},\Sigma_i(d^i_a),d_{a+1},\dots,d_n]\f$ | The tensor \f$T\f$, where \f$T\f$ is the concatenation of the input tensors along axis \f$a\f$. |
        class Concat : public RequiresTensorViewArgs
        {
        public:

--- a/src/ngraph/runtime/cpu/cpu_backend.cpp
+++ b/src/ngraph/runtime/cpu/cpu_backend.cpp
@@ -16,8 +16,8 @@
 #include "ngraph/runtime/cpu/cpu_backend.hpp"
 #include "ngraph/log.hpp"
+#include "ngraph/runtime/cpu/cpu_tensor_view.hpp"
 #include "ngraph/runtime/external_function.hpp"
-#include "ngraph/runtime/host_tensor_view.hpp"
 using namespace ngraph;
 using namespace std;
@@ -32,6 +32,6 @@ std::shared_ptr<ngraph::runtime::TensorView>
    runtime::cpu::CPU_Backend::make_primary_tensor_view(const ngraph::element::Type& element_type,
                                                        const Shape& shape)
 {
-    auto rc = make_shared<runtime::HostTensorView>(element_type, shape);
+    auto rc = make_shared<runtime::cpu::CPUTensorView>(element_type, shape);
    return dynamic_pointer_cast<runtime::TensorView>(rc);
 }
--- a/src/ngraph/runtime/cpu/cpu_call_frame.cpp
+++ b/src/ngraph/runtime/cpu/cpu_call_frame.cpp
@@ -18,8 +18,8 @@
 #include "ngraph/runtime/cpu/cpu_call_frame.hpp"
 #include "ngraph/runtime/cpu/cpu_external_function.hpp"
+#include "ngraph/runtime/cpu/cpu_tensor_view.hpp"
 #include "ngraph/runtime/cpu/cpu_tracing.hpp"
-#include "ngraph/runtime/host_tensor_view.hpp"
 using namespace std;
 using namespace ngraph;
@@ -43,16 +43,20 @@ void runtime::cpu::CPU_CallFrame::tensor_call(
 {
    vector<void*> inputs;
    vector<void*> outputs;
+    propagate_layouts(input_tvs, m_external_function->get_parameter_layout_descriptors());
+    propagate_layouts(output_tvs, m_external_function->get_result_layout_descriptors());
    for (size_t i = 0; i < input_tvs.size(); i++)
    {
-        shared_ptr<runtime::HostTensorView> tv =
+        shared_ptr<runtime::cpu::CPUTensorView> tv =
-            static_pointer_cast<runtime::HostTensorView>(input_tvs[i]);
+            static_pointer_cast<runtime::cpu::CPUTensorView>(input_tvs[i]);
        inputs.push_back(tv->get_data_ptr());
    }
    for (size_t i = 0; i < output_tvs.size(); i++)
    {
-        shared_ptr<runtime::HostTensorView> tv =
+        shared_ptr<runtime::cpu::CPUTensorView> tv =
-            static_pointer_cast<runtime::HostTensorView>(output_tvs[i]);
+            static_pointer_cast<runtime::cpu::CPUTensorView>(output_tvs[i]);
        outputs.push_back(tv->get_data_ptr());
    }
@@ -85,6 +89,26 @@ void runtime::cpu::CPU_CallFrame::call(
    tensor_call(inputs, outputs);
 }
+void runtime::cpu::CPU_CallFrame::propagate_layouts(
+    const std::vector<std::shared_ptr<runtime::TensorView>>& tvs,
+    const LayoutDescriptorPtrs& layouts) const
+{
+    if (layouts.size() != tvs.size())
+    {
+        throw ngraph_error(
+            "Error propagating layouts - tensor view and layout descriptor counts do not match");
+    }
+    for (size_t i = 0; i < tvs.size(); i++)
+    {
+        if (layouts[i] == nullptr)
+        {
+            throw ngraph_error(
+                "Error propagating layouts - layout information missing from tensor view");
+        }
+        tvs[i]->get_descriptor()->set_tensor_view_layout(layouts[i]);
+    }
+}
 vector<runtime::PerformanceCounter> runtime::cpu::CPU_CallFrame::get_performance_data() const
 {
    vector<runtime::PerformanceCounter> rc;

--- a/src/ngraph/runtime/cpu/cpu_call_frame.hpp
+++ b/src/ngraph/runtime/cpu/cpu_call_frame.hpp
@@ -22,6 +22,7 @@
 #include "ngraph/function.hpp"
 #include "ngraph/runtime/call_frame.hpp"
+#include "ngraph/runtime/cpu/cpu_layout_descriptor.hpp"
 #include "ngraph/runtime/cpu/cpu_runtime_context.hpp"
 #include "ngraph/runtime/tensor_view.hpp"
@@ -60,6 +61,9 @@ namespace ngraph
                void tensor_call(const std::vector<std::shared_ptr<TensorView>>& inputs,
                                 const std::vector<std::shared_ptr<TensorView>>& outputs) override;
+                void propagate_layouts(const std::vector<std::shared_ptr<runtime::TensorView>>& tvs,
+                                       const LayoutDescriptorPtrs& layouts) const;
                std::vector<ngraph::runtime::PerformanceCounter>
                    get_performance_data() const override;

--- a/src/ngraph/runtime/cpu/cpu_emitter.cpp
+++ b/src/ngraph/runtime/cpu/cpu_emitter.cpp
@@ -97,21 +97,15 @@ void runtime::cpu::CPU_Emitter::EmitMKLDNNPreamble(codegen::CodeWriter& writer)
    writer << "using namespace mkldnn;\n\n";
 }
-void runtime::cpu::CPU_Emitter::EmitNop(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitNop)
-                                        const ngraph::Node* n,
-                                        const vector<runtime::cpu::TensorViewWrapper>& args,
-                                        const vector<runtime::cpu::TensorViewWrapper>& out)
 {
 }
-void runtime::cpu::CPU_Emitter::EmitAdd(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitAdd)
-                                        const ngraph::Node* n,
-                                        const vector<runtime::cpu::TensorViewWrapper>& args,
-                                        const vector<runtime::cpu::TensorViewWrapper>& out)
 {
    // TODO: Audit all uses of Add and fix this to use
    // the right alignment instead of Eigen::Unaligned
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
 #if PREFER_EIGEN == 1
    writer << "Eigen::Map<Eigen::Array<" << out[0].get_element_type().c_type_string() << ", "
@@ -137,10 +131,7 @@ void runtime::cpu::CPU_Emitter::EmitAdd(codegen::CodeWriter& writer,
 //a) emitting customized code for initializing output/bias
 //b) emitting two cblas calls (one for gemm on W and x and the second for gemm on Bias and E^T + the result of the first gemm)
 //@jbobba suggests b) is more efficient but we should benchmark both
-void runtime::cpu::CPU_Emitter::EmitMatmulBias(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitMatmulBias)
-                                               const ngraph::Node* node,
-                                               const vector<runtime::cpu::TensorViewWrapper>& args,
-                                               const vector<runtime::cpu::TensorViewWrapper>& out)
 {
    const ngraph::op::MatmulBias* cg = static_cast<const ngraph::op::MatmulBias*>(node);
@@ -189,12 +180,9 @@ void runtime::cpu::CPU_Emitter::EmitMatmulBias(codegen::CodeWriter& writer,
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitDot(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitDot)
-                                        const ngraph::Node* n,
-                                        const vector<runtime::cpu::TensorViewWrapper>& args,
-                                        const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    const ngraph::op::Dot* dot = static_cast<const ngraph::op::Dot*>(n);
+    const ngraph::op::Dot* dot = static_cast<const ngraph::op::Dot*>(node);
    const Shape& arg0_shape = args[0].get_shape();
    const Shape& arg1_shape = args[1].get_shape();
@@ -203,7 +191,7 @@ void runtime::cpu::CPU_Emitter::EmitDot(codegen::CodeWriter& writer,
        auto& first = (arg0_shape.empty() ? args[0] : args[1]);
        auto& second = (arg0_shape.empty() ? args[1] : args[0]);
-        writer << "{   // " << n->get_name() << "\n";
+        writer << "{   // " << node->get_name() << "\n";
        writer.indent++;
        writer << emit_vector(out[0]) << "\n    = ";
        writer << first.get_name() << "[0]\n    * " << emit_vector(second) << ";\n";
@@ -213,7 +201,7 @@ void runtime::cpu::CPU_Emitter::EmitDot(codegen::CodeWriter& writer,
    else if ((arg0_shape.size() == 1) && (arg1_shape.size() == 1) &&
             dot->get_reduction_axes_count() == 1)
    {
-        writer << "{   // " << n->get_name() << "\n";
+        writer << "{   // " << node->get_name() << "\n";
        writer.indent++;
        writer << emit_vector(out[0]) << " << \n"
               << "    " << emit_vector(args[0]) << ".dot(" << emit_vector(args[1]) << ");\n";
@@ -223,7 +211,7 @@ void runtime::cpu::CPU_Emitter::EmitDot(codegen::CodeWriter& writer,
    else if ((arg0_shape.size() == 2) && (arg1_shape.size() == 1) &&
             dot->get_reduction_axes_count() == 1)
    {
-        writer << "{   // " << n->get_name() << "\n";
+        writer << "{   // " << node->get_name() << "\n";
        writer.indent++;
        writer << emit_vector(out[0]) << " = \n"
               << "    " << emit_matrix(args[0]) << " * " << emit_vector(args[1]) << ";\n";
@@ -237,7 +225,7 @@ void runtime::cpu::CPU_Emitter::EmitDot(codegen::CodeWriter& writer,
        // clang-format off
        if (args[0].get_element_type() == element::f32)
        {
-            writer << "{   // " << n->get_name() << "\n";
+            writer << "{   // " << node->get_name() << "\n";
            writer.indent++;
            writer << "cblas::cblas_sgemm("
               << "cblas::Layout::RowMajor, "
@@ -252,7 +240,7 @@ void runtime::cpu::CPU_Emitter::EmitDot(codegen::CodeWriter& writer,
        // clang-format on
        else
        {
-            writer << "{   // " << n->get_name() << "\n";
+            writer << "{   // " << node->get_name() << "\n";
            writer.indent++;
            writer << emit_matrix(out[0]) << " = \n"
                   << "    " << emit_matrix(args[0]) << " * " << emit_matrix(args[1]) << ";\n";
@@ -272,12 +260,9 @@ void runtime::cpu::CPU_Emitter::EmitDot(codegen::CodeWriter& writer,
    }
 }
-void runtime::cpu::CPU_Emitter::EmitMultiply(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitMultiply)
-                                             const ngraph::Node* n,
-                                             const vector<runtime::cpu::TensorViewWrapper>& args,
-                                             const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
 #if PREFER_EIGEN == 1
    writer << emit_array1d(out[0]) << " =\n"
@@ -295,15 +280,11 @@ void runtime::cpu::CPU_Emitter::EmitMultiply(codegen::CodeWriter& writer,
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitGetOutputElement(
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitGetOutputElement)
-    codegen::CodeWriter& writer,
-    const ngraph::Node* n,
-    const vector<runtime::cpu::TensorViewWrapper>& args,
-    const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    auto get_tuple_element = static_cast<const op::GetOutputElement*>(n);
+    auto get_tuple_element = static_cast<const op::GetOutputElement*>(node);
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
    writer << "memcpy(" << out[0].get_name() << ", " << args[get_tuple_element->get_n()].get_name()
           << ", " << out[0].get_size() * out[0].get_element_type().size() << ");\n";
@@ -311,12 +292,9 @@ void runtime::cpu::CPU_Emitter::EmitGetOutputElement(
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitTuple(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitTuple)
-                                          const ngraph::Node* n,
-                                          const vector<runtime::cpu::TensorViewWrapper>& args,
-                                          const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
    for (size_t i = 0; i < args.size(); ++i)
    {
@@ -327,12 +305,9 @@ void runtime::cpu::CPU_Emitter::EmitTuple(codegen::CodeWriter& writer,
    writer += "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitAbs(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitAbs)
-                                        const ngraph::Node* n,
-                                        const vector<runtime::cpu::TensorViewWrapper>& args,
-                                        const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
 #if PREFER_EIGEN == 1
    writer << emit_array1d(out[0]) << " =\n";
@@ -348,17 +323,14 @@ void runtime::cpu::CPU_Emitter::EmitAbs(codegen::CodeWriter& writer,
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitConcat(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitConcat)
-                                           const ngraph::Node* n,
-                                           const vector<runtime::cpu::TensorViewWrapper>& args,
-                                           const vector<runtime::cpu::TensorViewWrapper>& out)
 {
    auto result_shape = out[0].get_shape();
 #if PREFER_EIGEN == 1
    if (result_shape.size() == 1)
    {
-        writer << "{   // " << n->get_name() << "\n";
+        writer << "{   // " << node->get_name() << "\n";
        writer.indent++;
        writer << emit_vector(out[0], "out_vector") << ";\n";
@@ -374,9 +346,9 @@ void runtime::cpu::CPU_Emitter::EmitConcat(codegen::CodeWriter& writer,
    }
    else if (result_shape.size() == 2)
    {
-        auto axis = (dynamic_cast<const op::Concat*>(n))->get_concatenation_axis();
+        auto axis = (dynamic_cast<const op::Concat*>(node))->get_concatenation_axis();
-        writer << "{   // " << n->get_name() << "\n";
+        writer << "{   // " << node->get_name() << "\n";
        writer.indent++;
        writer << emit_matrix(out[0], "out_matrix") << ";\n";
@@ -439,7 +411,7 @@ void runtime::cpu::CPU_Emitter::EmitConcat(codegen::CodeWriter& writer,
        }
    }
 #else
-    auto axis = (dynamic_cast<const op::Concat*>(n))->get_concatenation_axis();
+    auto axis = (dynamic_cast<const op::Concat*>(node))->get_concatenation_axis();
    std::vector<std::string> arg_names;
    std::vector<Shape> arg_shapes;
@@ -460,14 +432,11 @@ void runtime::cpu::CPU_Emitter::EmitConcat(codegen::CodeWriter& writer,
 #endif
 }
-void runtime::cpu::CPU_Emitter::EmitDivide(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitDivide)
-                                           const ngraph::Node* n,
-                                           const vector<runtime::cpu::TensorViewWrapper>& args,
-                                           const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
-    if (n->get_element_type().is_real() == false)
+    if (node->get_element_type().is_real() == false)
    {
        // Check for divide by zero for integer types only
        size_t element_count = args[1].get_size();
@@ -493,12 +462,9 @@ void runtime::cpu::CPU_Emitter::EmitDivide(codegen::CodeWriter& writer,
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitEqual(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitEqual)
-                                          const ngraph::Node* n,
-                                          const vector<runtime::cpu::TensorViewWrapper>& args,
-                                          const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
 #if PREFER_EIGEN == 1
    writer << emit_array1d(out[0]) << " =\n"
@@ -516,12 +482,9 @@ void runtime::cpu::CPU_Emitter::EmitEqual(codegen::CodeWriter& writer,
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitGreater(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitGreater)
-                                            const ngraph::Node* n,
-                                            const vector<runtime::cpu::TensorViewWrapper>& args,
-                                            const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    writer << "{   // " << n->get_name() << " xxx\n";
+    writer << "{   // " << node->get_name() << " xxx\n";
    writer.indent++;
 #if PREFER_EIGEN == 1
    writer << emit_array1d(out[0]) << " =\n"
@@ -539,12 +502,9 @@ void runtime::cpu::CPU_Emitter::EmitGreater(codegen::CodeWriter& writer,
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitGreaterEq(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitGreaterEq)
-                                              const ngraph::Node* n,
-                                              const vector<runtime::cpu::TensorViewWrapper>& args,
-                                              const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
 #if PREFER_EIGEN == 1
    writer << emit_array1d(out[0]) << " =\n"
@@ -562,12 +522,9 @@ void runtime::cpu::CPU_Emitter::EmitGreaterEq(codegen::CodeWriter& writer,
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitLess(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitLess)
-                                         const ngraph::Node* n,
-                                         const vector<runtime::cpu::TensorViewWrapper>& args,
-                                         const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
 #if PREFER_EIGEN == 1
    writer << emit_array1d(out[0]) << " =\n"
@@ -585,12 +542,9 @@ void runtime::cpu::CPU_Emitter::EmitLess(codegen::CodeWriter& writer,
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitLessEq(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitLessEq)
-                                           const ngraph::Node* n,
-                                           const vector<runtime::cpu::TensorViewWrapper>& args,
-                                           const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
 #if PREFER_EIGEN == 1
    writer << emit_array1d(out[0]) << " =\n"
@@ -608,12 +562,9 @@ void runtime::cpu::CPU_Emitter::EmitLessEq(codegen::CodeWriter& writer,
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitLog(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitLog)
-                                        const ngraph::Node* n,
-                                        const vector<runtime::cpu::TensorViewWrapper>& args,
-                                        const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
 #if PREFER_EIGEN == 1
    writer << emit_array1d(out[0]) << " =\n"
@@ -629,12 +580,9 @@ void runtime::cpu::CPU_Emitter::EmitLog(codegen::CodeWriter& writer,
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitMaximum(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitMaximum)
-                                            const ngraph::Node* n,
-                                            const vector<runtime::cpu::TensorViewWrapper>& args,
-                                            const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
 #if PREFER_EIGEN == 1
    writer << emit_array1d(out[0]) << " =\n"
@@ -653,12 +601,9 @@ void runtime::cpu::CPU_Emitter::EmitMaximum(codegen::CodeWriter& writer,
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitMinimum(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitMinimum)
-                                            const ngraph::Node* n,
-                                            const vector<runtime::cpu::TensorViewWrapper>& args,
-                                            const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
 #if PREFER_EIGEN == 1
    writer << emit_array1d(out[0]) << " =\n"
@@ -677,12 +622,9 @@ void runtime::cpu::CPU_Emitter::EmitMinimum(codegen::CodeWriter& writer,
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitNegative(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitNegative)
-                                             const ngraph::Node* n,
-                                             const vector<runtime::cpu::TensorViewWrapper>& args,
-                                             const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
 #if PREFER_EIGEN == 1
    writer << emit_array1d(out[0]) << " =\n"
@@ -698,12 +640,9 @@ void runtime::cpu::CPU_Emitter::EmitNegative(codegen::CodeWriter& writer,
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitNotEqual(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitNotEqual)
-                                             const ngraph::Node* n,
-                                             const vector<runtime::cpu::TensorViewWrapper>& args,
-                                             const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
 #if PREFER_EIGEN == 1
    writer << emit_array1d(out[0]) << " =\n"
@@ -721,12 +660,9 @@ void runtime::cpu::CPU_Emitter::EmitNotEqual(codegen::CodeWriter& writer,
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitSelect(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitSelect)
-                                           const ngraph::Node* n,
-                                           const vector<runtime::cpu::TensorViewWrapper>& args,
-                                           const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
 #if PREFER_EIGEN == 1
    writer << emit_array1d(out[0]) << " =\n"
@@ -745,12 +681,9 @@ void runtime::cpu::CPU_Emitter::EmitSelect(codegen::CodeWriter& writer,
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitSubtract(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitSubtract)
-                                             const ngraph::Node* n,
-                                             const vector<runtime::cpu::TensorViewWrapper>& args,
-                                             const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
 #if PREFER_EIGEN == 1
    writer << emit_array1d(out[0]) << " =\n"
@@ -768,14 +701,11 @@ void runtime::cpu::CPU_Emitter::EmitSubtract(codegen::CodeWriter& writer,
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitBroadcast(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitBroadcast)
-                                              const ngraph::Node* n,
-                                              const vector<runtime::cpu::TensorViewWrapper>& args,
-                                              const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    auto broadcast = static_cast<const op::Broadcast*>(n);
+    auto broadcast = static_cast<const op::Broadcast*>(node);
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
 #if PREFER_EIGEN == 1
    auto arg_shape = args[0].get_shape();
@@ -783,7 +713,7 @@ void runtime::cpu::CPU_Emitter::EmitBroadcast(codegen::CodeWriter& writer,
    if (broadcast->get_broadcast_axes().empty())
    {
-        writer << "{   // " << n->get_name() << "\n";
+        writer << "{   // " << node->get_name() << "\n";
        writer.indent++;
        writer << "memcpy(" << out[0].get_name() << ", " << args[0].get_name() << ", "
               << out[0].get_size() * out[0].get_element_type().size() << ");\n";
@@ -792,7 +722,7 @@ void runtime::cpu::CPU_Emitter::EmitBroadcast(codegen::CodeWriter& writer,
    }
    else if (arg_shape.size() == 0)
    {
-        writer << "{   // " << n->get_name() << "\n";
+        writer << "{   // " << node->get_name() << "\n";
        writer.indent++;
        writer << emit_array1d(out[0]) << " =\n"
               << "    " << emit_array1d(args[0]) << "(0, 0);\n";
@@ -803,7 +733,7 @@ void runtime::cpu::CPU_Emitter::EmitBroadcast(codegen::CodeWriter& writer,
    {
        if (broadcast->get_broadcast_axes() == AxisSet{1})
        {
-            writer << "{   // " << n->get_name() << "\n";
+            writer << "{   // " << node->get_name() << "\n";
            writer.indent++;
            writer << emit_matrix(out[0]) << ".colwise() =\n"
                   << "    " << emit_vector(args[0]) << ";\n";
@@ -812,7 +742,7 @@ void runtime::cpu::CPU_Emitter::EmitBroadcast(codegen::CodeWriter& writer,
        }
        else if (broadcast->get_broadcast_axes() == AxisSet{0})
        {
-            writer << "{   // " << n->get_name() << "\n";
+            writer << "{   // " << node->get_name() << "\n";
            writer.indent++;
            writer << "Eigen::Map<Eigen::Matrix<" << out[0].get_element_type().c_type_string()
@@ -856,14 +786,11 @@ void runtime::cpu::CPU_Emitter::EmitBroadcast(codegen::CodeWriter& writer,
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitConvert(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitConvert)
-                                            const ngraph::Node* n,
-                                            const vector<runtime::cpu::TensorViewWrapper>& args,
-                                            const vector<runtime::cpu::TensorViewWrapper>& out)
 {
    auto& result_element_type = out[0].get_element_type();
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
 #if PREFER_EIGEN == 1
    writer << emit_array1d(out[0]) << " =\n"
@@ -881,20 +808,14 @@ void runtime::cpu::CPU_Emitter::EmitConvert(codegen::CodeWriter& writer,
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitConstant(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitConstant)
-                                             const ngraph::Node* n,
-                                             const vector<runtime::cpu::TensorViewWrapper>& args,
-                                             const vector<runtime::cpu::TensorViewWrapper>& out)
 {
 }
-void runtime::cpu::CPU_Emitter::EmitReshape(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitReshape)
-                                            const ngraph::Node* n,
-                                            const vector<runtime::cpu::TensorViewWrapper>& args,
-                                            const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    auto reshape = static_cast<const op::Reshape*>(n);
+    auto reshape = static_cast<const op::Reshape*>(node);
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
 #if PREFER_EIGEN == 1
    auto arg_shape = args[0].get_shape();
@@ -917,7 +838,7 @@ void runtime::cpu::CPU_Emitter::EmitReshape(codegen::CodeWriter& writer,
    //  we can just copy.
    if (same_layout || result_shape_product < 2)
    {
-        writer << "{   // " << n->get_name() << " 1\n";
+        writer << "{   // " << node->get_name() << " 1\n";
        writer.indent++;
        writer << "memcpy(" << out[0].get_name() << ", " << args[0].get_name() << ", "
               << out[0].get_size() * out[0].get_element_type().size() << ");\n";
@@ -931,7 +852,7 @@ void runtime::cpu::CPU_Emitter::EmitReshape(codegen::CodeWriter& writer,
        // clang-format off
        if (result_element_type == ngraph::element::f32)
        {
-            writer << "{   // " << n->get_name() << " 2\n";
+            writer << "{   // " << node->get_name() << " 2\n";
            writer.indent++;
            writer << "mkl::MKL_Somatcopy('R', 'T', " << to_string(arg_shape[0]) << ",\n" <<
                "                   " << to_string(arg_shape[1]) << ", 1.0f,\n" <<
@@ -945,7 +866,7 @@ void runtime::cpu::CPU_Emitter::EmitReshape(codegen::CodeWriter& writer,
        // clang-format on
        else
        {
-            writer << "{   // " << n->get_name() << " 3\n";
+            writer << "{   // " << node->get_name() << " 3\n";
            writer.indent++;
            writer << emit_matrix(out[0]) << " =\n"
                   << "        " << emit_matrix(args[0]) << ".transpose();\n";
@@ -976,13 +897,9 @@ void runtime::cpu::CPU_Emitter::EmitReshape(codegen::CodeWriter& writer,
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitFunctionCall(
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitFunctionCall)
-    codegen::CodeWriter& writer,
-    const ngraph::Node* n,
-    const vector<runtime::cpu::TensorViewWrapper>& args,
-    const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    auto function_call = static_cast<const op::FunctionCall*>(n);
+    auto function_call = static_cast<const op::FunctionCall*>(node);
    shared_ptr<Function> function = function_call->get_functions()[0];
    writer << "{   // Call " << function->get_name() << "\n";
@@ -1026,12 +943,9 @@ void runtime::cpu::CPU_Emitter::EmitFunctionCall(
 // the compiled version of these ops is intended to have semantics identical
 // to what's seen there (for now atleast)
-void runtime::cpu::CPU_Emitter::EmitReduce(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitReduce)
-                                           const ngraph::Node* n,
-                                           const vector<runtime::cpu::TensorViewWrapper>& args,
-                                           const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    auto reduce = static_cast<const op::Reduce*>(n);
+    auto reduce = static_cast<const op::Reduce*>(node);
    auto reduction_function = reduce->get_functions()[0];
    auto reductee_shape = args[0].get_shape();
@@ -1044,7 +958,7 @@ void runtime::cpu::CPU_Emitter::EmitReduce(codegen::CodeWriter& writer,
    // Trivial case: no reduction axes (this includes the scalar-reductee case).
    if (reduction_axes.empty())
    {
-        writer << "{   // " << n->get_name() << " 1\n";
+        writer << "{   // " << node->get_name() << " 1\n";
        writer.indent++;
        writer << "memcpy(" << out[0].get_name() << ", " << args[0].get_name() << ", "
               << out[0].get_size() * out[0].get_element_type().size() << ");\n";
@@ -1081,7 +995,7 @@ void runtime::cpu::CPU_Emitter::EmitReduce(codegen::CodeWriter& writer,
    {
        if (reductee_shape.at(0) == 0 || (reductee_shape.size() == 2 && reductee_shape.at(1) == 0))
        {
-            writer << "{   // " << n->get_name() << " 2\n";
+            writer << "{   // " << node->get_name() << " 2\n";
            writer.indent++;
            writer << "memcpy(" << out[0].get_name() << ", " << args[1].get_name() << ", "
                   << out[0].get_size() * out[0].get_element_type().size() << ");\n";
@@ -1090,7 +1004,7 @@ void runtime::cpu::CPU_Emitter::EmitReduce(codegen::CodeWriter& writer,
        }
        else
        {
-            writer << "{   // " << n->get_name() << " 3\n";
+            writer << "{   // " << node->get_name() << " 3\n";
            writer.indent++;
            string type = f_result_element_type.c_type_string();
            writer << "auto f = [&](" << type << " x, " << type << " y) -> " << type << "\n{";
@@ -1113,7 +1027,7 @@ void runtime::cpu::CPU_Emitter::EmitReduce(codegen::CodeWriter& writer,
    {
        if (reductee_shape.at(1) == 0)
        {
-            writer << "{   // " << n->get_name() << " 4\n";
+            writer << "{   // " << node->get_name() << " 4\n";
            writer.indent++;
            writer << emit_array1d(out[0]) << " =\n"
                   << "    " << emit_array1d(args[1]) << "(0, 0);\n";
@@ -1126,7 +1040,7 @@ void runtime::cpu::CPU_Emitter::EmitReduce(codegen::CodeWriter& writer,
            //     dynamic_pointer_cast<CallFrame>(external->make_call_frame());
            // ef->get_callees().emplace_back(cf);
-            writer << "{   // " << n->get_name() << " 5\n";
+            writer << "{   // " << node->get_name() << " 5\n";
            writer.indent++;
            string type = f_result_element_type.c_type_string();
            writer << "auto f = [&](" << type << " x, " << type << " y) -> " << type << "\n{";
@@ -1149,7 +1063,7 @@ void runtime::cpu::CPU_Emitter::EmitReduce(codegen::CodeWriter& writer,
    {
        if (reductee_shape.at(0) == 0)
        {
-            writer << "{   // " << n->get_name() << " 6\n";
+            writer << "{   // " << node->get_name() << " 6\n";
            writer.indent++;
            writer << emit_array1d(out[0]) << " =\n"
                   << "    " << emit_array1d(args[1]) << "(0, 0);\n";
@@ -1158,7 +1072,7 @@ void runtime::cpu::CPU_Emitter::EmitReduce(codegen::CodeWriter& writer,
        }
        else
        {
-            writer << "{   // " << n->get_name() << " 7\n";
+            writer << "{   // " << node->get_name() << " 7\n";
            writer.indent++;
            string type = f_result_element_type.c_type_string();
            writer << "auto f = [&](" << type << " x, " << type << " y) -> " << type << "\n{";
@@ -1179,7 +1093,7 @@ void runtime::cpu::CPU_Emitter::EmitReduce(codegen::CodeWriter& writer,
    }
    else
    {
-        writer << "{   // " << n->get_name() << "\n";
+        writer << "{   // " << node->get_name() << "\n";
        writer.indent++;
        string type = f_result_element_type.c_type_string();
@@ -1206,7 +1120,7 @@ void runtime::cpu::CPU_Emitter::EmitReduce(codegen::CodeWriter& writer,
        writer << "}\n";
    }
 #else
-    writer << "{   // " << n->get_name() << " 1\n";
+    writer << "{   // " << node->get_name() << " 1\n";
    writer.indent++;
    string type = f_result_element_type.c_type_string();
@@ -1236,12 +1150,9 @@ void runtime::cpu::CPU_Emitter::EmitReduce(codegen::CodeWriter& writer,
 #endif
 }
-void runtime::cpu::CPU_Emitter::EmitSign(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitSign)
-                                         const ngraph::Node* n,
-                                         const vector<runtime::cpu::TensorViewWrapper>& args,
-                                         const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
 #if PREFER_EIGEN == 1
    writer << emit_array1d(out[0]) << " =\n"
@@ -1258,14 +1169,11 @@ void runtime::cpu::CPU_Emitter::EmitSign(codegen::CodeWriter& writer,
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitSlice(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitSlice)
-                                          const ngraph::Node* n,
-                                          const vector<runtime::cpu::TensorViewWrapper>& args,
-                                          const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    const op::Slice* slice = static_cast<const op::Slice*>(n);
+    const op::Slice* slice = static_cast<const op::Slice*>(node);
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
 #if PREFER_EIGEN == 1
    size_t arg_rank = args[0].get_shape().size();
@@ -1286,7 +1194,7 @@ void runtime::cpu::CPU_Emitter::EmitSlice(codegen::CodeWriter& writer,
    // Scalar slice is necessarily just a copy.
    if (!strided && arg_rank == 0)
    {
-        writer << "{   // " << n->get_name() << " 1\n";
+        writer << "{   // " << node->get_name() << " 1\n";
        writer.indent++;
        writer << "memcpy(" << out[0].get_name() << ", " << args[0].get_name() << ", "
               << out[0].get_size() * out[0].get_element_type().size() << ");\n";
@@ -1295,7 +1203,7 @@ void runtime::cpu::CPU_Emitter::EmitSlice(codegen::CodeWriter& writer,
    }
    else if (!strided && arg_rank == 1)
    {
-        writer << "{   // " << n->get_name() << " 2\n";
+        writer << "{   // " << node->get_name() << " 2\n";
        writer.indent++;
        writer << emit_vector(out[0]) << " =\n"
               << "    " << emit_vector(args[0]) << ".segment(\n"
@@ -1306,7 +1214,7 @@ void runtime::cpu::CPU_Emitter::EmitSlice(codegen::CodeWriter& writer,
    }
    else if (!strided && arg_rank == 2)
    {
-        writer << "{   // " << n->get_name() << " 3\n";
+        writer << "{   // " << node->get_name() << " 3\n";
        writer.indent++;
        writer << emit_matrix(out[0]) << " = \n"
               << "        " << emit_matrix(args[0]) << ".block(" << to_string(lower_bounds[0])
@@ -1342,13 +1250,10 @@ void runtime::cpu::CPU_Emitter::EmitSlice(codegen::CodeWriter& writer,
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitSum(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitSum)
-                                        const ngraph::Node* n,
-                                        const vector<runtime::cpu::TensorViewWrapper>& args,
-                                        const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    const op::Sum* sum = static_cast<const op::Sum*>(n);
+    const op::Sum* sum = static_cast<const op::Sum*>(node);
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
 #if PREFER_EIGEN == 1
    const Shape& arg_shape = args[0].get_shape();
@@ -1358,7 +1263,7 @@ void runtime::cpu::CPU_Emitter::EmitSum(codegen::CodeWriter& writer,
    // Trivial case: no reduction axes.
    if (reduction_axes.size() == 0)
    {
-        writer << "{   // " << n->get_name() << "\n";
+        writer << "{   // " << node->get_name() << "\n";
        writer.indent++;
        writer << "memcpy(" << out[0].get_name() << ", " << args[0].get_name() << ", "
               << out[0].get_size() * out[0].get_element_type().size() << ");\n";
@@ -1369,7 +1274,7 @@ void runtime::cpu::CPU_Emitter::EmitSum(codegen::CodeWriter& writer,
    else if ((arg_rank == 1 && reduction_axes == AxisSet{0}) ||
             (arg_rank == 2 && reduction_axes == AxisSet{0, 1}))
    {
-        writer << "{   // " << n->get_name() << "\n";
+        writer << "{   // " << node->get_name() << "\n";
        writer.indent++;
        writer << emit_array1d(out[0]) << " =\n"
               << "    " << emit_array1d(args[0]) << ".sum();\n";
@@ -1378,7 +1283,7 @@ void runtime::cpu::CPU_Emitter::EmitSum(codegen::CodeWriter& writer,
    }
    else if (arg_rank == 2 && reduction_axes == AxisSet{1})
    {
-        writer << "{   // " << n->get_name() << "\n";
+        writer << "{   // " << node->get_name() << "\n";
        writer.indent++;
        writer << emit_vector(out[0]) << " =\n"
               << "    " << emit_matrix(args[0]) << ".rowwise().sum();\n";
@@ -1387,7 +1292,7 @@ void runtime::cpu::CPU_Emitter::EmitSum(codegen::CodeWriter& writer,
    }
    else if (arg_rank == 2 && reduction_axes == AxisSet{0})
    {
-        writer << "{   // " << n->get_name() << "\n";
+        writer << "{   // " << node->get_name() << "\n";
        writer.indent++;
        writer << emit_vector(out[0]) << " =\n"
               << "    " << emit_matrix(args[0]) << ".colwise().sum();\n";
@@ -1415,12 +1320,9 @@ void runtime::cpu::CPU_Emitter::EmitSum(codegen::CodeWriter& writer,
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitExp(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitExp)
-                                        const ngraph::Node* n,
-                                        const vector<runtime::cpu::TensorViewWrapper>& args,
-                                        const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
 #if PREFER_EIGEN == 1
    writer << emit_array1d(out[0]) << " =\n"
@@ -1436,12 +1338,9 @@ void runtime::cpu::CPU_Emitter::EmitExp(codegen::CodeWriter& writer,
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitSin(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitSin)
-                                        const ngraph::Node* n,
-                                        const vector<runtime::cpu::TensorViewWrapper>& args,
-                                        const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
 #if PREFER_EIGEN == 1
    writer << emit_array1d(out[0]) << " =\n"
@@ -1457,12 +1356,9 @@ void runtime::cpu::CPU_Emitter::EmitSin(codegen::CodeWriter& writer,
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitSinh(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitSinh)
-                                         const ngraph::Node* n,
-                                         const vector<runtime::cpu::TensorViewWrapper>& args,
-                                         const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
 #if PREFER_EIGEN == 1
    writer << emit_array1d(out[0]) << " =\n"
@@ -1478,12 +1374,9 @@ void runtime::cpu::CPU_Emitter::EmitSinh(codegen::CodeWriter& writer,
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitCos(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitCos)
-                                        const ngraph::Node* n,
-                                        const vector<runtime::cpu::TensorViewWrapper>& args,
-                                        const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
 #if PREFER_EIGEN == 1
    writer << emit_array1d(out[0]) << " =\n"
@@ -1499,12 +1392,9 @@ void runtime::cpu::CPU_Emitter::EmitCos(codegen::CodeWriter& writer,
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitCosh(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitCosh)
-                                         const ngraph::Node* n,
-                                         const vector<runtime::cpu::TensorViewWrapper>& args,
-                                         const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
 #if PREFER_EIGEN == 1
    writer << emit_array1d(out[0]) << " =\n"
@@ -1520,12 +1410,9 @@ void runtime::cpu::CPU_Emitter::EmitCosh(codegen::CodeWriter& writer,
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitTan(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitTan)
-                                        const ngraph::Node* n,
-                                        const vector<runtime::cpu::TensorViewWrapper>& args,
-                                        const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
 #if PREFER_EIGEN == 1
    writer << emit_array1d(out[0]) << " =\n"
@@ -1541,16 +1428,13 @@ void runtime::cpu::CPU_Emitter::EmitTan(codegen::CodeWriter& writer,
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitTanh(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitTanh)
-                                         const ngraph::Node* n,
-                                         const vector<runtime::cpu::TensorViewWrapper>& args,
-                                         const vector<runtime::cpu::TensorViewWrapper>& out)
 {
    // Eigen's generic_fast_tanh_float<float> is currently miscompiled by Clang/LLVM
    // so we fall-back to tanh
    // TODO: Implement our own internal fast/approximate tanh if this actually gets used
    // by models
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
 #if PREFER_EIGEN == 0
    writer << "#pragma omp parallel for\n";
@@ -1563,12 +1447,9 @@ void runtime::cpu::CPU_Emitter::EmitTanh(codegen::CodeWriter& writer,
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitAsin(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitAsin)
-                                         const ngraph::Node* n,
-                                         const vector<runtime::cpu::TensorViewWrapper>& args,
-                                         const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
 #if PREFER_EIGEN == 1
    writer << emit_array1d(out[0]) << " =\n"
@@ -1584,12 +1465,9 @@ void runtime::cpu::CPU_Emitter::EmitAsin(codegen::CodeWriter& writer,
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitAcos(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitAcos)
-                                         const ngraph::Node* n,
-                                         const vector<runtime::cpu::TensorViewWrapper>& args,
-                                         const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
 #if PREFER_EIGEN == 1
    writer << emit_array1d(out[0]) << " =\n"
@@ -1605,12 +1483,9 @@ void runtime::cpu::CPU_Emitter::EmitAcos(codegen::CodeWriter& writer,
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitAtan(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitAtan)
-                                         const ngraph::Node* n,
-                                         const vector<runtime::cpu::TensorViewWrapper>& args,
-                                         const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
 #if PREFER_EIGEN == 1
    writer << emit_array1d(out[0]) << " =\n"
@@ -1626,12 +1501,9 @@ void runtime::cpu::CPU_Emitter::EmitAtan(codegen::CodeWriter& writer,
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitPower(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitPower)
-                                          const ngraph::Node* n,
-                                          const vector<runtime::cpu::TensorViewWrapper>& args,
-                                          const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
 #if PREFER_EIGEN == 1
    writer << emit_array1d(out[0]) << " = \n";
@@ -1651,14 +1523,10 @@ void runtime::cpu::CPU_Emitter::EmitPower(codegen::CodeWriter& writer,
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitReplaceSlice(
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitReplaceSlice)
-    codegen::CodeWriter& writer,
-    const ngraph::Node* n,
-    const vector<runtime::cpu::TensorViewWrapper>& args,
-    const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    auto replace_slice = static_cast<const op::Slice*>(n);
+    auto replace_slice = static_cast<const op::Slice*>(node);
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
 #if PREFER_EIGEN == 1
    size_t arg0_rank = args[0].get_shape().size();
@@ -1679,7 +1547,7 @@ void runtime::cpu::CPU_Emitter::EmitReplaceSlice(
    // Scalar slice is necessarily just a copy.
    if (!strided && arg0_rank == 0)
    {
-        writer << "{   // " << n->get_name() << " 1\n";
+        writer << "{   // " << node->get_name() << " 1\n";
        writer.indent++;
        writer << "memcpy(" << out[0].get_name() << ", " << args[1].get_name() << ", "
               << out[0].get_size() * out[0].get_element_type().size() << ");\n";
@@ -1688,7 +1556,7 @@ void runtime::cpu::CPU_Emitter::EmitReplaceSlice(
    }
    else if (!strided && arg0_rank == 1)
    {
-        writer << "{   // " << n->get_name() << " 2\n";
+        writer << "{   // " << node->get_name() << " 2\n";
        writer.indent++;
        writer << emit_vector(out[0]) << " =\n"
               << "    " << emit_vector(args[0]) << ";\n"
@@ -1701,7 +1569,7 @@ void runtime::cpu::CPU_Emitter::EmitReplaceSlice(
    }
    else if (!strided && arg0_rank == 2)
    {
-        writer << "{   // " << n->get_name() << " 3\n";
+        writer << "{   // " << node->get_name() << " 3\n";
        writer.indent++;
        writer << emit_matrix(out[0]) << " =\n"
               << "    " << emit_matrix(args[0]) << ";\n"
@@ -1743,12 +1611,9 @@ void runtime::cpu::CPU_Emitter::EmitReplaceSlice(
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitOneHot(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitOneHot)
-                                           const ngraph::Node* n,
-                                           const vector<runtime::cpu::TensorViewWrapper>& args,
-                                           const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    auto oh = static_cast<const op::OneHot*>(n);
+    auto oh = static_cast<const op::OneHot*>(node);
    auto arg_rank = args[0].get_shape().size();
@@ -1756,7 +1621,7 @@ void runtime::cpu::CPU_Emitter::EmitOneHot(codegen::CodeWriter& writer,
    if (arg_rank == 0)
    {
-        writer << "{   // " << n->get_name() << " 1\n";
+        writer << "{   // " << node->get_name() << " 1\n";
        writer.indent++;
        writer << emit_vector(out[0], "out_vector") << ";\n";
@@ -1787,7 +1652,7 @@ void runtime::cpu::CPU_Emitter::EmitOneHot(codegen::CodeWriter& writer,
    }
    else if (arg_rank == 1)
    {
-        writer << "{   // " << n->get_name() << " 1\n";
+        writer << "{   // " << node->get_name() << " 1\n";
        writer.indent++;
        writer << emit_vector(args[0], "arg_vector") << ";\n";
@@ -1838,12 +1703,9 @@ void runtime::cpu::CPU_Emitter::EmitOneHot(codegen::CodeWriter& writer,
    }
 }
-void runtime::cpu::CPU_Emitter::EmitCeiling(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitCeiling)
-                                            const ngraph::Node* n,
-                                            const vector<runtime::cpu::TensorViewWrapper>& args,
-                                            const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
    size_t element_count = out[0].get_size();
 #if PREFER_EIGEN == 0
@@ -1857,12 +1719,9 @@ void runtime::cpu::CPU_Emitter::EmitCeiling(codegen::CodeWriter& writer,
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitFloor(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitFloor)
-                                          const ngraph::Node* n,
-                                          const vector<runtime::cpu::TensorViewWrapper>& args,
-                                          const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
    size_t element_count = out[0].get_size();
 #if PREFER_EIGEN == 0
@@ -1876,12 +1735,9 @@ void runtime::cpu::CPU_Emitter::EmitFloor(codegen::CodeWriter& writer,
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitSqrt(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitSqrt)
-                                         const ngraph::Node* n,
-                                         const vector<runtime::cpu::TensorViewWrapper>& args,
-                                         const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
    size_t element_count = out[0].get_size();
 #if PREFER_EIGEN == 0
@@ -1895,12 +1751,9 @@ void runtime::cpu::CPU_Emitter::EmitSqrt(codegen::CodeWriter& writer,
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitConvolution(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitConvolution)
-                                                const ngraph::Node* n,
-                                                const vector<runtime::cpu::TensorViewWrapper>& args,
-                                                const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    auto convolution = static_cast<const op::Convolution*>(n);
+    auto convolution = static_cast<const op::Convolution*>(node);
    auto arg0_shape = args[0].get_shape();
    auto arg1_shape = args[1].get_shape();
@@ -2025,13 +1878,9 @@ void runtime::cpu::CPU_Emitter::EmitConvolution(codegen::CodeWriter& writer,
    }
 }
-void runtime::cpu::CPU_Emitter::EmitConvolutionBackpropFilters(
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitConvolutionBackpropFilters)
-    codegen::CodeWriter& writer,
-    const ngraph::Node* n,
-    const vector<runtime::cpu::TensorViewWrapper>& args,
-    const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    auto convolution = static_cast<const op::ConvolutionBackpropFilters*>(n);
+    auto convolution = static_cast<const op::ConvolutionBackpropFilters*>(node);
    auto arg0_shape = args[0].get_shape();
    auto arg1_shape = args[1].get_shape();
@@ -2136,13 +1985,9 @@ void runtime::cpu::CPU_Emitter::EmitConvolutionBackpropFilters(
    }
 }
-void runtime::cpu::CPU_Emitter::EmitConvolutionBackpropData(
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitConvolutionBackpropData)
-    codegen::CodeWriter& writer,
-    const ngraph::Node* n,
-    const vector<runtime::cpu::TensorViewWrapper>& args,
-    const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    auto convolution = static_cast<const op::ConvolutionBackpropData*>(n);
+    auto convolution = static_cast<const op::ConvolutionBackpropData*>(node);
    auto arg0_shape = args[0].get_shape();
    auto arg1_shape = args[1].get_shape();
@@ -2246,22 +2091,16 @@ void runtime::cpu::CPU_Emitter::EmitConvolutionBackpropData(
    }
 }
-void runtime::cpu::CPU_Emitter::EmitNot(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitNot)
-                                        const ngraph::Node* n,
-                                        const vector<runtime::cpu::TensorViewWrapper>& args,
-                                        const vector<runtime::cpu::TensorViewWrapper>& out)
 {
    writer << "kernel::logical_not(" << args[0].get_name() << ",\n"
           << "                    " << out[0].get_name() << ",\n"
           << "                    " << out[0].get_size() << ");\n";
 }
-void runtime::cpu::CPU_Emitter::EmitMaxPool(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitMaxPool)
-                                            const ngraph::Node* n,
-                                            const vector<runtime::cpu::TensorViewWrapper>& args,
-                                            const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    auto max_pool = static_cast<const op::MaxPool*>(n);
+    auto max_pool = static_cast<const op::MaxPool*>(node);
    auto arg_shape = args[0].get_shape();
    auto arg_rank = arg_shape.size();
@@ -2318,12 +2157,9 @@ void runtime::cpu::CPU_Emitter::EmitMaxPool(codegen::CodeWriter& writer,
    }
 }
-void runtime::cpu::CPU_Emitter::EmitReverse(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitReverse)
-                                            const ngraph::Node* n,
-                                            const vector<runtime::cpu::TensorViewWrapper>& args,
-                                            const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    auto reverse = static_cast<const op::Reverse*>(n);
+    auto reverse = static_cast<const op::Reverse*>(node);
    auto arg_shape = args[0].get_shape();
    auto result_shape = out[0].get_shape();
@@ -2335,20 +2171,16 @@ void runtime::cpu::CPU_Emitter::EmitReverse(codegen::CodeWriter& writer,
    writer << "                {" << join(reverse->get_reversed_axes()) << "});\n";
 }
-void runtime::cpu::CPU_Emitter::EmitReduceWindow(
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitReduceWindow)
-    codegen::CodeWriter& writer,
-    const ngraph::Node* n,
-    const vector<runtime::cpu::TensorViewWrapper>& args,
-    const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    auto reduce_window = static_cast<const op::ReduceWindow*>(n);
+    auto reduce_window = static_cast<const op::ReduceWindow*>(node);
    auto arg_reductee_shape = args[0].get_shape();
    auto result_shape = out[0].get_shape();
    auto reduction_function = reduce_window->get_functions()[0];
    auto& f_result_element_type = out[0].get_element_type();
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
    string type = f_result_element_type.c_type_string();
@@ -2377,13 +2209,9 @@ void runtime::cpu::CPU_Emitter::EmitReduceWindow(
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitSelectAndScatter(
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitSelectAndScatter)
-    codegen::CodeWriter& writer,
-    const ngraph::Node* n,
-    const vector<runtime::cpu::TensorViewWrapper>& args,
-    const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    auto select_and_scatter = static_cast<const op::SelectAndScatter*>(n);
+    auto select_and_scatter = static_cast<const op::SelectAndScatter*>(node);
    auto selection_function = select_and_scatter->get_functions()[0];
    auto scatter_function = select_and_scatter->get_functions()[1];
@@ -2391,10 +2219,10 @@ void runtime::cpu::CPU_Emitter::EmitSelectAndScatter(
    auto arg1_shape = args[1].get_shape();
    auto result_shape = out[0].get_shape();
-    writer << "{   // " << n->get_name() << "\n";
+    writer << "{   // " << node->get_name() << "\n";
    writer.indent++;
-    string type = n->get_output_element_type(0).c_type_string();
+    string type = node->get_output_element_type(0).c_type_string();
    writer << "auto f_select = [&](" << type << " x, " << type << " y) -> char\n{";
    writer.indent++;
@@ -2436,12 +2264,9 @@ void runtime::cpu::CPU_Emitter::EmitSelectAndScatter(
    writer << "}\n";
 }
-void runtime::cpu::CPU_Emitter::EmitAvgPool(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitAvgPool)
-                                            const ngraph::Node* n,
-                                            const vector<runtime::cpu::TensorViewWrapper>& args,
-                                            const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    auto avg_pool = static_cast<const op::AvgPool*>(n);
+    auto avg_pool = static_cast<const op::AvgPool*>(node);
    auto arg_shape = args[0].get_shape();
    auto arg_rank = arg_shape.size();
@@ -2501,12 +2326,9 @@ void runtime::cpu::CPU_Emitter::EmitAvgPool(codegen::CodeWriter& writer,
    }
 }
-void runtime::cpu::CPU_Emitter::EmitPad(codegen::CodeWriter& writer,
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitPad)
-                                        const ngraph::Node* n,
-                                        const vector<runtime::cpu::TensorViewWrapper>& args,
-                                        const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    auto pad = static_cast<const op::Pad*>(n);
+    auto pad = static_cast<const op::Pad*>(node);
    auto arg0_shape = args[0].get_shape();
    auto result_shape = out[0].get_shape();
@@ -2521,13 +2343,9 @@ void runtime::cpu::CPU_Emitter::EmitPad(codegen::CodeWriter& writer,
    writer << "            {" << join(pad->get_padding_interior()) << "});\n";
 }
-void runtime::cpu::CPU_Emitter::EmitAvgPoolBackprop(
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitAvgPoolBackprop)
-    codegen::CodeWriter& writer,
-    const ngraph::Node* n,
-    const vector<runtime::cpu::TensorViewWrapper>& args,
-    const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    auto apb = static_cast<const op::AvgPoolBackprop*>(n);
+    auto apb = static_cast<const op::AvgPoolBackprop*>(node);
    auto delta_shape = args[0].get_shape();
    auto delta_rank = delta_shape.size();
@@ -2586,13 +2404,9 @@ void runtime::cpu::CPU_Emitter::EmitAvgPoolBackprop(
    }
 }
-void runtime::cpu::CPU_Emitter::EmitMaxPoolBackprop(
+void runtime::cpu::CPU_Emitter::EMITTER_DECL(EmitMaxPoolBackprop)
-    codegen::CodeWriter& writer,
-    const ngraph::Node* n,
-    const vector<runtime::cpu::TensorViewWrapper>& args,
-    const vector<runtime::cpu::TensorViewWrapper>& out)
 {
-    auto mpb = static_cast<const op::MaxPoolBackprop*>(n);
+    auto mpb = static_cast<const op::MaxPoolBackprop*>(node);
    auto delta_shape = args[1].get_shape();
    auto out_shape = out[0].get_shape();

--- a/src/ngraph/runtime/cpu/cpu_emitter.hpp
+++ b/src/ngraph/runtime/cpu/cpu_emitter.hpp
@@ -25,8 +25,9 @@
 #include "ngraph/runtime/cpu/cpu_tensor_view_wrapper.hpp"
 #define EMITTER_DECL(E)                                                                            \
-    E(codegen::CodeWriter& writer,                                                                 \
+    E(ngraph::runtime::cpu::CPU_ExternalFunction* external_function,                               \
-      const ngraph::Node* n,                                                                       \
+      codegen::CodeWriter& writer,                                                                 \
+      const ngraph::Node* node,                                                                    \
      const std::vector<ngraph::runtime::cpu::TensorViewWrapper>& args,                            \
      const std::vector<ngraph::runtime::cpu::TensorViewWrapper>& out)

--- a/src/ngraph/runtime/cpu/cpu_external_function.cpp
+++ b/src/ngraph/runtime/cpu/cpu_external_function.cpp
@@ -27,7 +27,6 @@
 #include "ngraph/codegen/compiler.hpp"
 #include "ngraph/codegen/execution_engine.hpp"
 #include "ngraph/descriptor/input.hpp"
-#include "ngraph/descriptor/layout/dense_tensor_view_layout.hpp"
 #include "ngraph/descriptor/output.hpp"
 #include "ngraph/descriptor/primary_tensor_view.hpp"
 #include "ngraph/file_util.hpp"
@@ -85,7 +84,6 @@
 #include "ngraph/ops/sum.hpp"
 #include "ngraph/ops/tan.hpp"
 #include "ngraph/ops/tanh.hpp"
-#include "ngraph/pass/assign_layout.hpp"
 #include "ngraph/pass/dump_sorted.hpp"
 #include "ngraph/pass/liveness.hpp"
 #include "ngraph/pass/manager.hpp"
@@ -94,16 +92,21 @@
 #include "ngraph/runtime/cpu/cpu_call_frame.hpp"
 #include "ngraph/runtime/cpu/cpu_emitter.hpp"
 #include "ngraph/runtime/cpu/cpu_external_function.hpp"
+#include "ngraph/runtime/cpu/cpu_tensor_view.hpp"
 #include "ngraph/runtime/cpu/cpu_tracing.hpp"
+#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
 #include "ngraph/runtime/cpu/ops/matmul_bias.hpp"
 #include "ngraph/runtime/cpu/pass/cpu_fusion.hpp"
-#include "ngraph/runtime/host_tensor_view.hpp"
+#include "ngraph/runtime/cpu/pass/cpu_layout.hpp"
 using namespace std;
 using namespace ngraph;
 static const string s_output_dir = "cpu_codegen";
+// Temporary Memory Pool alignment
+static const size_t s_memory_pool_alignment = 4096;
 class StaticInitializers
 {
 public:
@@ -228,12 +231,13 @@ void runtime::cpu::CPU_ExternalFunction::compile()
    m_mkldnn_emitter.reset(new MKLDNNEmitter(shared_from_this()));
-    pass::Manager pass_manager;
+    ngraph::pass::Manager pass_manager;
-    // For now, just make everyone row-major.
-    pass_manager.register_pass<pass::CPUFusion>();
+    pass_manager.register_pass<runtime::cpu::pass::CPUFusion>();
-    pass_manager.register_pass<pass::AssignLayout<descriptor::layout::DenseTensorViewLayout>>();
+    pass_manager.register_pass<runtime::cpu::pass::CPULayout>();
-    pass_manager.register_pass<pass::Liveness>();
+    pass_manager.register_pass<ngraph::pass::Liveness>();
-    pass_manager.register_pass<pass::MemoryLayout>(64);
+    pass_manager.register_pass<ngraph::pass::MemoryLayout>(s_memory_pool_alignment);
    pass_manager.run_passes(m_function);
    codegen::CodeWriter writer;
@@ -243,11 +247,7 @@ void runtime::cpu::CPU_ExternalFunction::compile()
    {
        for (shared_ptr<Node> node : current_function->get_ordered_ops())
        {
-            if (dynamic_cast<op::Convolution*>(node.get()) ||
+            if (ngraph::runtime::cpu::mkldnn_utils::IsMKLDNNOp(*node))
-                dynamic_cast<op::ConvolutionBackpropData*>(node.get()) ||
-                dynamic_cast<op::ConvolutionBackpropFilters*>(node.get()) ||
-                dynamic_cast<op::AvgPool*>(node.get()) || dynamic_cast<op::MaxPool*>(node.get()) ||
-                dynamic_cast<op::AvgPoolBackprop*>(node.get()))
            {
                include_mkldnn_headers = true;
            }
@@ -522,7 +522,7 @@ using namespace ngraph::runtime;
            writer << "// Memory pool size is " << temp_pool_size << " bytes\n";
            writer << "// Worst case size is " << worst_case_tmp_size << " bytes\n";
            writer << "ngraph::runtime::AlignedBuffer memory_handler(" << temp_pool_size << ", "
-                   << ngraph::runtime::alignment << ");\n";
+                   << s_memory_pool_alignment << ");\n";
            writer << "size_t pool_base_ptr = (size_t)memory_handler.get_ptr();\n";
            writer << "\n";
@@ -679,7 +679,7 @@ using namespace ngraph::runtime;
            }
            if (func_name.empty())
            {
-                handler->second(writer, node.get(), in, out);
+                handler->second(this, writer, node.get(), in, out);
            }
            else
            {
@@ -764,6 +764,41 @@ using namespace ngraph::runtime;
        writer += "}\n\n";
    }
+    // Store layouts assigned for arguments
+    for (const auto& parameter : m_function->get_parameters())
+    {
+        for (size_t i = 0; i < parameter->get_output_size(); ++i)
+        {
+            auto tv = parameter->get_output_tensor_view(i);
+            if (tv->get_tensor_view_layout() == nullptr)
+            {
+                throw ngraph_error("layout missing on function parameter's tensor view: " +
+                                   tv->get_name());
+            }
+            parameter_layout_descriptors.emplace_back(
+                static_pointer_cast<runtime::cpu::LayoutDescriptor>(tv->get_tensor_view_layout()));
+        }
+    }
+    // Store layouts assigned for results
+    if (!result_layout_descriptors.empty())
+    {
+        throw ngraph_error("Function output layouts should not be pre-assigned");
+    }
+    for (size_t i = 0; i < m_function->get_output_size(); ++i)
+    {
+        const auto& output = m_function->get_output_op(i);
+        for (size_t j = 0; j < output->get_output_size(); ++j)
+        {
+            auto tv = output->get_output_tensor_view(j);
+            if (tv->get_tensor_view_layout() == nullptr)
+            {
+                throw ngraph_error("layout missing on function output tensor: " + tv->get_name());
+            }
+            result_layout_descriptors.emplace_back(
+                static_pointer_cast<runtime::cpu::LayoutDescriptor>(tv->get_tensor_view_layout()));
+        }
+    }
    // TODO: Cleanup and make this a utility function
    file_util::make_directory(s_output_dir);
@@ -836,6 +871,18 @@ shared_ptr<ngraph::runtime::CallFrame> runtime::cpu::CPU_ExternalFunction::make_
                                                            m_compiled_function);
 }
+const runtime::cpu::LayoutDescriptorPtrs&
+    runtime::cpu::CPU_ExternalFunction::get_parameter_layout_descriptors()
+{
+    return parameter_layout_descriptors;
+}
+const runtime::cpu::LayoutDescriptorPtrs&
+    runtime::cpu::CPU_ExternalFunction::get_result_layout_descriptors()
+{
+    return result_layout_descriptors;
+}
 void runtime::cpu::CPU_ExternalFunction::emit_debug_function_entry(
    codegen::CodeWriter& writer,
    Node* node,
@@ -907,7 +954,7 @@ string runtime::cpu::CPU_ExternalFunction::emit_op_as_function(const Node& node,
    writer << "\n)\n";
    writer << "{\n";
    writer.indent++;
-    handler->second(writer, &node, in, out);
+    handler->second(this, writer, &node, in, out);
    writer.indent--;
    writer << "}\n";

--- a/src/ngraph/runtime/cpu/cpu_external_function.hpp
+++ b/src/ngraph/runtime/cpu/cpu_external_function.hpp
@@ -29,6 +29,7 @@
 #include "ngraph/codegen/execution_engine.hpp"
 #include "ngraph/function.hpp"
 #include "ngraph/runtime/cpu/cpu_call_frame.hpp"
+#include "ngraph/runtime/cpu/cpu_layout_descriptor.hpp"
 #include "ngraph/runtime/cpu/cpu_tensor_view_wrapper.hpp"
 #include "ngraph/runtime/cpu/mkldnn_emitter.hpp"
 #include "ngraph/runtime/external_function.hpp"
@@ -43,7 +44,8 @@ namespace ngraph
            class CPU_Emitter;
            class CPU_CallFrame;
-            using OpFunction = std::function<void(codegen::CodeWriter&,
+            using OpFunction = std::function<void(CPU_ExternalFunction* external_function,
+                                                  codegen::CodeWriter&,
                                                  const ngraph::Node*,
                                                  const std::vector<TensorViewWrapper>& inputs,
                                                  const std::vector<TensorViewWrapper>& outputs)>;
@@ -75,6 +77,9 @@ namespace ngraph
                                     bool release_function = true);
                std::shared_ptr<ngraph::runtime::CallFrame> make_call_frame();
+                const LayoutDescriptorPtrs& get_parameter_layout_descriptors();
+                const LayoutDescriptorPtrs& get_result_layout_descriptors();
                const std::vector<OpAttributes>& get_op_attrs() const { return m_op_attrs; }
                const std::unique_ptr<MKLDNNEmitter>& get_mkldnn_emitter() const
                {
@@ -112,6 +117,9 @@ namespace ngraph
                bool m_emit_timing;
                bool m_use_tbb;
                std::unordered_map<std::string, std::string> m_variable_name_map;
+                LayoutDescriptorPtrs parameter_layout_descriptors;
+                LayoutDescriptorPtrs result_layout_descriptors;
                std::vector<OpAttributes> m_op_attrs;
                std::unique_ptr<MKLDNNEmitter> m_mkldnn_emitter;

--- a/src/ngraph/runtime/cpu/cpu_layout_descriptor.cpp
+++ b/src/ngraph/runtime/cpu/cpu_layout_descriptor.cpp
+/*******************************************************************************
+* Copyright 2017-2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+#include <algorithm>
+#include "cpu_layout_descriptor.hpp"
+namespace ngraph
+{
+    namespace runtime
+    {
+        namespace cpu
+        {
+            const AxisVector LayoutDescriptor::Native2DAxisOrder{0, 1};
+            const AxisVector LayoutDescriptor::Native4DAxisOrder{0, 1, 2, 3};
+            const AxisVector LayoutDescriptor::CHWNAxisOrder{1, 2, 3, 0};
+            AxisVector LayoutDescriptor::create_native_axis_order(size_t rank)
+            {
+                AxisVector native_axis_order(rank);
+                std::iota(native_axis_order.begin(), native_axis_order.end(), 0);
+                return native_axis_order;
+            }
+            LayoutDescriptor::LayoutDescriptor(const ngraph::descriptor::TensorView& tv,
+                                               const AxisVector& tv_axis_order)
+                : TensorViewLayout(tv)
+                , axis_order(tv_axis_order)
+                , offset(0)
+                , size(ngraph::shape_size(tv.get_tensor_view_type()->get_shape()))
+                , mkldnn_format(mkldnn::memory::format::format_undef)
+            {
+                auto shape = get_shape();
+                size_t s = 1;
+                if (tv_axis_order.size() != shape.size())
+                {
+                    throw ngraph_error("Axis order is incomplete");
+                }
+                for (auto it = tv_axis_order.crbegin(); it != tv_axis_order.crend(); it++)
+                {
+                    if (*it >= shape.size())
+                    {
+                        throw ngraph_error("Axis is out of bounds");
+                    }
+                    strides.emplace_back(s);
+                    s *= shape[*it];
+                }
+                std::reverse(strides.begin(), strides.end());
+            }
+            size_t LayoutDescriptor::get_index_offset(const std::vector<size_t>& indices)
+            {
+                if (indices.size() != strides.size())
+                {
+                    throw ngraph_error("Indices have incorrect rank");
+                }
+                size_t result = 0;
+                for (int i = 0; i < indices.size(); i++)
+                {
+                    result += strides[i] + indices[i];
+                }
+                return result;
+            }
+            bool LayoutDescriptor::
+                operator==(const ngraph::descriptor::layout::TensorViewLayout& other) const
+            {
+                const LayoutDescriptor* p_other = dynamic_cast<const LayoutDescriptor*>(&other);
+                if (!p_other)
+                {
+                    return false;
+                }
+                if (get_element_type() != p_other->get_element_type())
+                {
+                    return false;
+                }
+                if (strides != p_other->strides)
+                {
+                    return false;
+                }
+                if (offset != p_other->offset)
+                {
+                    return false;
+                }
+                return true;
+            }
+        }
+    }
+}
--- a/src/ngraph/runtime/cpu/cpu_layout_descriptor.hpp
+++ b/src/ngraph/runtime/cpu/cpu_layout_descriptor.hpp
+/*******************************************************************************
+* Copyright 2017-2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+#pragma once
+#include <cstdint>
+#include <memory>
+#include <vector>
+#include <mkldnn.hpp>
+#include "ngraph/common.hpp"
+#include "ngraph/descriptor/layout/tensor_view_layout.hpp"
+#include "ngraph/shape.hpp"
+#include "ngraph/types/type.hpp"
+namespace ngraph
+{
+    namespace runtime
+    {
+        namespace cpu
+        {
+            class LayoutDescriptor : public ngraph::descriptor::layout::TensorViewLayout
+            {
+            public:
+                LayoutDescriptor(const ngraph::descriptor::TensorView& tv,
+                                 const AxisVector& tv_axis_order);
+                ~LayoutDescriptor() {}
+                size_t get_size() override { return size; }
+                size_t get_offset() const { return offset; }
+                size_t get_index_offset(const std::vector<size_t>& indices) override;
+                const Strides& get_strides() const override { return strides; }
+                bool operator==(const TensorViewLayout& other) const override;
+                void set_mkldnn_format(const mkldnn::memory::format& format)
+                {
+                    mkldnn_format = format;
+                }
+                mkldnn::memory::format get_mkldnn_format() const { return mkldnn_format; }
+                const AxisVector& get_axis_order() const { return axis_order; }
+                static const AxisVector Native2DAxisOrder;
+                static const AxisVector Native4DAxisOrder;
+                static const AxisVector CHWNAxisOrder;
+                static AxisVector create_native_axis_order(size_t rank);
+            private:
+                AxisVector axis_order;
+                Strides strides;
+                size_t offset;
+                size_t size;
+                // Numeric backend-specific fields
+                mkldnn::memory::format mkldnn_format;
+            };
+            typedef std::vector<std::shared_ptr<ngraph::runtime::cpu::LayoutDescriptor>>
+                LayoutDescriptorPtrs;
+        }
+    }
+}
--- a/src/ngraph/runtime/cpu/cpu_tensor_view.cpp
+++ b/src/ngraph/runtime/cpu/cpu_tensor_view.cpp
+/*******************************************************************************
+* Copyright 2017-2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+#include <cstring>
+#include <memory>
+#include "cpu_tensor_view.hpp"
+#include "ngraph/descriptor/layout/tensor_view_layout.hpp"
+#include "ngraph/descriptor/primary_tensor_view.hpp"
+#include "ngraph/except.hpp"
+#include "ngraph/runtime/cpu/cpu_layout_descriptor.hpp"
+#include "ngraph/shape.hpp"
+using namespace ngraph;
+using namespace std;
+// TODO(jmenon): Refactor all the alignment specifications into
+// a single place and allow lower or no alignment when possible
+const size_t runtime::cpu::CPUTensorView::BufferAlignment = 64;
+runtime::cpu::CPUTensorView::CPUTensorView(const ngraph::element::Type& element_type,
+                                           const Shape& shape,
+                                           const string& name)
+    : runtime::TensorView(std::make_shared<ngraph::descriptor::PrimaryTensorView>(
+          std::make_shared<ngraph::TensorViewType>(element_type, shape), name, true, true, false))
+    , buffer(nullptr)
+    , aligned_buffer(nullptr)
+{
+    // TODO(jmenon): A fallback layout should not be needed but is required
+    // because of how some unit test functionality is written (ex. 'backprop_derivative')
+    // This needs to be removed
+    m_descriptor->set_tensor_view_layout(std::make_shared<runtime::cpu::LayoutDescriptor>(
+        *m_descriptor, runtime::cpu::LayoutDescriptor::create_native_axis_order(shape.size())));
+    buffer_size = shape_size(shape) * element_type.size();
+    if (buffer_size)
+    {
+        size_t allocation_size = buffer_size + BufferAlignment;
+        auto ptr = malloc(allocation_size);
+        if (!ptr)
+        {
+            throw ngraph_error("Error allocating CPU Tensor View memory");
+        }
+        buffer = static_cast<char*>(ptr);
+        std::align(BufferAlignment, buffer_size, ptr, allocation_size);
+        aligned_buffer = static_cast<char*>(ptr);
+    }
+}
+runtime::cpu::CPUTensorView::~CPUTensorView()
+{
+    free(buffer);
+}
+char* runtime::cpu::CPUTensorView::get_data_ptr()
+{
+    return aligned_buffer;
+}
+const char* runtime::cpu::CPUTensorView::get_data_ptr() const
+{
+    return aligned_buffer;
+}
+void runtime::cpu::CPUTensorView::write(const void* source, size_t tensor_offset, size_t n)
+{
+    if (tensor_offset + n > buffer_size)
+    {
+        throw out_of_range("write access past end of tensor");
+    }
+    char* target = get_data_ptr();
+    memcpy(&target[tensor_offset], source, n);
+}
+void runtime::cpu::CPUTensorView::read(void* target, size_t tensor_offset, size_t n) const
+{
+    if (tensor_offset + n > buffer_size)
+    {
+        throw out_of_range("read access past end of tensor");
+    }
+    const char* source = get_data_ptr();
+    memcpy(target, &source[tensor_offset], n);
+}
+size_t runtime::cpu::CPUTensorView::get_size() const
+{
+    return get_element_count();
+}
--- a/src/ngraph/runtime/cpu/cpu_tensor_view.hpp
+++ b/src/ngraph/runtime/cpu/cpu_tensor_view.hpp
+/*******************************************************************************
+* Copyright 2017-2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+#pragma once
+#include <string>
+#include "ngraph/runtime/tensor_view.hpp"
+#include "ngraph/types/element_type.hpp"
+namespace ngraph
+{
+    namespace runtime
+    {
+        namespace cpu
+        {
+            class CPUTensorView : public ngraph::runtime::TensorView
+            {
+            public:
+                CPUTensorView(const ngraph::element::Type& element_type,
+                              const Shape& shape,
+                              const std::string& name = "external");
+                virtual ~CPUTensorView();
+                char* get_data_ptr();
+                const char* get_data_ptr() const;
+                size_t get_size() const;
+                const element::Type& get_element_type() const;
+                /// @brief Write bytes directly into the tensor
+                /// @param p Pointer to source of data
+                /// @param tensor_offset Offset into tensor storage to begin writing. Must be element-aligned.
+                /// @param n Number of bytes to write, must be integral number of elements.
+                void write(const void* p, size_t tensor_offset, size_t n) override;
+                /// @brief Read bytes directly from the tensor
+                /// @param p Pointer to destination for data
+                /// @param tensor_offset Offset into tensor storage to begin reading. Must be element-aligned.
+                /// @param n Number of bytes to read, must be integral number of elements.
+                void read(void* p, size_t tensor_offset, size_t n) const override;
+            private:
+                static const size_t BufferAlignment;
+                char* buffer;
+                char* aligned_buffer;
+                size_t buffer_size;
+            };
+        }
+    }
+}
--- a/src/ngraph/runtime/cpu/cpu_tracing.cpp
+++ b/src/ngraph/runtime/cpu/cpu_tracing.cpp
-// ----------------------------------------------------------------------------
+/*******************************************************************************
-// Copyright 2018 Nervana Systems Inc.
+* Copyright 2017-2018 Intel Corporation
-// Licensed under the Apache License, Version 2.0 (the "License");
+*
-// you may not use this file except in compliance with the License.
+* Licensed under the Apache License, Version 2.0 (the "License");
-// You may obtain a copy of the License at
+* you may not use this file except in compliance with the License.
-//
+* You may obtain a copy of the License at
-//      http://www.apache.org/licenses/LICENSE-2.0
+*
-//
+*     http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
+*
-// distributed under the License is distributed on an "AS IS" BASIS,
+* Unless required by applicable law or agreed to in writing, software
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* distributed under the License is distributed on an "AS IS" BASIS,
-// See the License for the specific language governing permissions and
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// ----------------------------------------------------------------------------
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
 #include <fstream>
 #include <map>

--- a/src/ngraph/runtime/cpu/cpu_tracing.hpp
+++ b/src/ngraph/runtime/cpu/cpu_tracing.hpp
-// ----------------------------------------------------------------------------
+/*******************************************************************************
-// Copyright 2018 Nervana Systems Inc.
+* Copyright 2017-2018 Intel Corporation
-// Licensed under the Apache License, Version 2.0 (the "License");
+*
-// you may not use this file except in compliance with the License.
+* Licensed under the Apache License, Version 2.0 (the "License");
-// You may obtain a copy of the License at
+* you may not use this file except in compliance with the License.
-//
+* You may obtain a copy of the License at
-//      http://www.apache.org/licenses/LICENSE-2.0
+*
-//
+*     http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
+*
-// distributed under the License is distributed on an "AS IS" BASIS,
+* Unless required by applicable law or agreed to in writing, software
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* distributed under the License is distributed on an "AS IS" BASIS,
-// See the License for the specific language governing permissions and
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// ----------------------------------------------------------------------------
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
 #pragma once

--- a/src/ngraph/runtime/cpu/mkldnn_utils.cpp
+++ b/src/ngraph/runtime/cpu/mkldnn_utils.cpp
+/*******************************************************************************
+* Copyright 2017-2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+#include <typeindex>
+#include <typeinfo>
+#include <unordered_set>
+#include "ngraph/node.hpp"
+#include "ngraph/ops/avg_pool.hpp"
+#include "ngraph/ops/convolution.hpp"
+#include "ngraph/ops/max_pool.hpp"
+#include "mkldnn_utils.hpp"
+namespace ngraph
+{
+    namespace runtime
+    {
+        namespace cpu
+        {
+            namespace mkldnn_utils
+            {
+#define TI(x) std::type_index(typeid(x))
+                const std::unordered_set<std::type_index> s_op_registry{
+                    TI(ngraph::op::AvgPool),
+                    TI(ngraph::op::AvgPoolBackprop),
+                    TI(ngraph::op::Convolution),
+                    TI(ngraph::op::ConvolutionBackpropData),
+                    TI(ngraph::op::ConvolutionBackpropFilters),
+                    TI(ngraph::op::MaxPool)};
+                bool IsMKLDNNOp(ngraph::Node& op)
+                {
+                    return (s_op_registry.find(TI(op)) != s_op_registry.end());
+                }
+                mkldnn::memory::format
+                    CreateNativeDataFormat(const ngraph::runtime::cpu::LayoutDescriptor& layout)
+                {
+                    switch (layout.get_shape().size())
+                    {
+                    case 1: return mkldnn::memory::format::x;
+                    case 2: return mkldnn::memory::format::nc;
+                    case 4: return mkldnn::memory::format::nchw;
+                    default: return mkldnn::memory::format::format_undef;
+                    }
+                }
+            }
+        }
+    }
+}
--- a/src/ngraph/runtime/cpu/mkldnn_utils.hpp
+++ b/src/ngraph/runtime/cpu/mkldnn_utils.hpp
-// ----------------------------------------------------------------------------
+/*******************************************************************************
-// Copyright 2018 Nervana Systems Inc.
+* Copyright 2017-2018 Intel Corporation
-// Licensed under the Apache License, Version 2.0 (the "License");
+*
-// you may not use this file except in compliance with the License.
+* Licensed under the Apache License, Version 2.0 (the "License");
-// You may obtain a copy of the License at
+* you may not use this file except in compliance with the License.
-//
+* You may obtain a copy of the License at
-//      http://www.apache.org/licenses/LICENSE-2.0
+*
-//
+*     http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
+*
-// distributed under the License is distributed on an "AS IS" BASIS,
+* Unless required by applicable law or agreed to in writing, software
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* distributed under the License is distributed on an "AS IS" BASIS,
-// See the License for the specific language governing permissions and
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// ----------------------------------------------------------------------------
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
 #pragma once
+#include <typeindex>
+#include <typeinfo>
+#include <unordered_set>
 #include <mkldnn.hpp>
+#include "ngraph/node.hpp"
+#include "ngraph/runtime/cpu/cpu_layout_descriptor.hpp"
 namespace ngraph
 {
    namespace runtime
@@ -25,6 +34,11 @@ namespace ngraph
            namespace mkldnn_utils
            {
                extern mkldnn::engine global_cpu_engine;
+                bool IsMKLDNNOp(ngraph::Node& op);
+                mkldnn::memory::format
+                    CreateNativeDataFormat(const ngraph::runtime::cpu::LayoutDescriptor& layout);
            }
        }
    }

--- a/src/ngraph/runtime/cpu/ops/convert_layout.cpp
+++ b/src/ngraph/runtime/cpu/ops/convert_layout.cpp
+/*******************************************************************************
+* Copyright 2017-2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+#include "ngraph/runtime/cpu/ops/convert_layout.hpp"
+#include "ngraph/runtime/cpu/cpu_layout_descriptor.hpp"
+using namespace std;
+using namespace ngraph;
+runtime::cpu::op::ConvertLayout::ConvertLayout(
+    const shared_ptr<Node>& arg, const shared_ptr<runtime::cpu::LayoutDescriptor>& layout)
+    : ConvertLayout(arg, 0, layout)
+{
+}
+runtime::cpu::op::ConvertLayout::ConvertLayout(
+    const shared_ptr<Node>& arg,
+    size_t output_index,
+    const shared_ptr<runtime::cpu::LayoutDescriptor>& layout)
+    : RequiresTensorViewArgs("ConvertLayout", {arg})
+    , arg_output_index(output_index)
+    , output_layout(layout)
+{
+    const auto& arg_tensor_view = arg->get_output_tensor_view(arg_output_index);
+    const auto& arg_layout = arg_tensor_view->get_tensor_view_layout();
+    if (!arg_layout)
+    {
+        throw ngraph_error("Layout conversion input tensor is missing layout information");
+    }
+    add_output(layout->get_element_type(), layout->get_shape());
+    get_output_tensor_view()->set_tensor_view_layout(layout);
+}
--- a/src/ngraph/runtime/cpu/ops/convert_layout.hpp
+++ b/src/ngraph/runtime/cpu/ops/convert_layout.hpp
+/*******************************************************************************
+* Copyright 2017-2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+#pragma once
+#include "ngraph/ops/op.hpp"
+namespace ngraph
+{
+    namespace runtime
+    {
+        namespace cpu
+        {
+            class LayoutDescriptor;
+            namespace op
+            {
+                /// \brief Layout Conversion
+                ///
+                /// Converts an input tensor to a tensor with the given layout descriptor
+                class ConvertLayout : public ngraph::op::RequiresTensorViewArgs
+                {
+                public:
+                    ConvertLayout(
+                        const std::shared_ptr<Node>& arg,
+                        const std::shared_ptr<ngraph::runtime::cpu::LayoutDescriptor>& layout);
+                    ConvertLayout(
+                        const std::shared_ptr<Node>& arg,
+                        size_t output_index,
+                        const std::shared_ptr<ngraph::runtime::cpu::LayoutDescriptor>& layout);
+                    virtual std::shared_ptr<Node> copy_with_new_args(
+                        const std::vector<std::shared_ptr<Node>>& new_args) const override
+                    {
+                        if (new_args.size() != 1)
+                            throw ngraph_error("Incorrect number of new arguments");
+                        return std::make_shared<ConvertLayout>(new_args.at(0), output_layout);
+                    }
+                protected:
+                    size_t arg_output_index;
+                    std::shared_ptr<ngraph::runtime::cpu::LayoutDescriptor> output_layout;
+                };
+            }
+        }
+    }
+}
--- a/src/ngraph/runtime/cpu/pass/cpu_fusion.cpp
+++ b/src/ngraph/runtime/cpu/pass/cpu_fusion.cpp
@@ -97,7 +97,7 @@ static std::vector<T> apply_permutation(std::vector<T> input, ngraph::AxisVector
    return output;
 }
-void ngraph::pass::CPUFusion::construct_gemm_pattern()
+void ngraph::runtime::cpu::pass::CPUFusion::construct_gemm_pattern()
 {
    Shape shape_w{2, 4};
    Shape shape_x{4, 1};

--- a/src/ngraph/runtime/cpu/pass/cpu_fusion.hpp
+++ b/src/ngraph/runtime/cpu/pass/cpu_fusion.hpp
@@ -20,13 +20,19 @@
 namespace ngraph
 {
-    namespace pass
+    namespace runtime
    {
-        class CPUFusion;
+        namespace cpu
+        {
+            namespace pass
+            {
+                class CPUFusion;
+            }
+        }
    }
 }
-class ngraph::pass::CPUFusion : public ngraph::pass::GraphRewrite
+class ngraph::runtime::cpu::pass::CPUFusion : public ngraph::pass::GraphRewrite
 {
 public:
    CPUFusion()

--- a/src/ngraph/runtime/cpu/pass/cpu_layout.cpp
+++ b/src/ngraph/runtime/cpu/pass/cpu_layout.cpp
+/*******************************************************************************
+* Copyright 2017-2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+#include <algorithm>
+#include <memory>
+#include <mkldnn.hpp>
+#include "cpu_layout.hpp"
+#include "ngraph/descriptor/output.hpp"
+#include "ngraph/runtime/cpu/cpu_layout_descriptor.hpp"
+#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
+//using namespace ngraph::runtime::cpu::pass;
+using namespace ngraph;
+bool runtime::cpu::pass::CPULayout::run_on_call_graph(const std::list<std::shared_ptr<Node>>& nodes)
+{
+    for (const auto& node : nodes)
+    {
+        for (size_t i = 0; i < node->get_output_size(); ++i)
+        {
+            auto tv = node->get_output_tensor_view(i);
+            if (tv->get_tensor_view_layout())
+            {
+                continue;
+            }
+            auto tvt = tv->get_tensor_view_type();
+            auto& tensor = tv->get_tensor();
+            auto rank = tvt->get_shape().size();
+            auto native_axis_order =
+                ngraph::runtime::cpu::LayoutDescriptor::create_native_axis_order(rank);
+            auto layout =
+                std::make_shared<ngraph::runtime::cpu::LayoutDescriptor>(*tv, native_axis_order);
+            if (tensor.is_output() || tensor.is_input() || tensor.is_constant())
+            {
+                // Set the MKLDNN format to native row-major variants
+                layout->set_mkldnn_format(mkldnn_utils::CreateNativeDataFormat(*layout));
+            }
+            else
+            {
+                if (ngraph::runtime::cpu::mkldnn_utils::IsMKLDNNOp(*node))
+                {
+                    // TODO(jmenon): get_inputs is marked as to-be-deprecated
+                    // but get_input_ops isn't a suitable API so this needs to be
+                    // reworked
+                    for (const descriptor::Input& input : node->get_inputs())
+                    {
+                        const auto& output = input.get_output();
+                        auto output_tv = output.get_tensor_view();
+                        auto output_tvl = output_tv->get_tensor_view_layout();
+                        // TODO(jmenon): Propagate layout based on inputs
+                        // TODO(jmenon): Insert layout conversions when needed
+                    }
+                }
+                else
+                {
+                    layout->set_mkldnn_format(mkldnn::memory::format::format_undef);
+                }
+            }
+            tv->set_tensor_view_layout(layout);
+        }
+    }
+    return false;
+}
--- a/src/ngraph/runtime/cpu/pass/cpu_layout.hpp
+++ b/src/ngraph/runtime/cpu/pass/cpu_layout.hpp
+/*******************************************************************************
+* Copyright 2017-2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+#pragma once
+#include "ngraph/pass/pass.hpp"
+namespace ngraph
+{
+    namespace runtime
+    {
+        namespace cpu
+        {
+            namespace pass
+            {
+                class CPULayout : public ngraph::pass::CallGraphPass
+                {
+                public:
+                    virtual bool
+                        run_on_call_graph(const std::list<std::shared_ptr<Node>>& nodes) override;
+                };
+            }
+        }
+    }
+}
--- a/src/ngraph/runtime/tensor_view.hpp
+++ b/src/ngraph/runtime/tensor_view.hpp
@@ -21,6 +21,7 @@
 #include "ngraph/descriptor/tensor_view.hpp"
 #include "ngraph/shape.hpp"
+#include "ngraph/types/element_type.hpp"
 #include "ngraph/util.hpp"
 namespace ngraph

--- a/test/cpu_fusion.cpp
+++ b/test/cpu_fusion.cpp
@@ -144,7 +144,7 @@ TEST(cpu_fusion, cpu_fusion_pass_basic)
    auto add = dot + broadcast;
    auto graph = make_shared<op::Abs>(add);
    pass::Manager pass_manager;
-    pass_manager.register_pass<pass::CPUFusion>();
+    pass_manager.register_pass<runtime::cpu::pass::CPUFusion>();
    auto func = make_shared<Function>(graph, op::Parameters{A, B, C});
    pass_manager.run_passes(func);
    ASSERT_NE(std::dynamic_pointer_cast<op::MatmulBias>(graph->get_input_op(0)), nullptr);
@@ -157,7 +157,7 @@ TEST(cpu_fusion, gemm_mlp)
    stringstream ss(json_string);
    shared_ptr<Function> func = ngraph::deserialize(ss);
    pass::Manager pass_manager;
-    pass_manager.register_pass<pass::CPUFusion>();
+    pass_manager.register_pass<runtime::cpu::pass::CPUFusion>();
    pass_manager.run_passes(func);
    size_t ccg = count_ops_of_type<op::MatmulBias>(func);
    ASSERT_EQ(ccg, 3);