Commit eabfebe5 authored by pthoreho's avatar pthoreho

Merge remote-tracking branch 'origin/master' into pruthvi/mkldnn_elementwise_add

parents 16a18d29 d0f8dff2
...@@ -183,6 +183,7 @@ if (NGRAPH_CPU_ENABLE AND LLVM_INCLUDE_DIR AND ...@@ -183,6 +183,7 @@ if (NGRAPH_CPU_ENABLE AND LLVM_INCLUDE_DIR AND
runtime/cpu/mkldnn_utils.cpp runtime/cpu/mkldnn_utils.cpp
runtime/cpu/ops/convert_layout.cpp runtime/cpu/ops/convert_layout.cpp
runtime/cpu/ops/matmul_bias.cpp runtime/cpu/ops/matmul_bias.cpp
runtime/cpu/pass/cpu_assignment.cpp
runtime/cpu/pass/cpu_fusion.cpp runtime/cpu/pass/cpu_fusion.cpp
runtime/cpu/pass/cpu_layout.cpp runtime/cpu/pass/cpu_layout.cpp
) )
......
...@@ -14,8 +14,10 @@ ...@@ -14,8 +14,10 @@
* limitations under the License. * limitations under the License.
*******************************************************************************/ *******************************************************************************/
#include "ngraph/builder/reduce_ops.hpp" #include <numeric>
#include "ngraph/builder/autobroadcast.hpp" #include "ngraph/builder/autobroadcast.hpp"
#include "ngraph/builder/reduce_ops.hpp"
#include "ngraph/ops/add.hpp" #include "ngraph/ops/add.hpp"
#include "ngraph/ops/divide.hpp" #include "ngraph/ops/divide.hpp"
#include "ngraph/ops/multiply.hpp" #include "ngraph/ops/multiply.hpp"
......
...@@ -40,7 +40,7 @@ std::string codegen::CodeWriter::generate_temporary_name(std::string prefix) ...@@ -40,7 +40,7 @@ std::string codegen::CodeWriter::generate_temporary_name(std::string prefix)
{ {
std::stringstream ss; std::stringstream ss;
ss << prefix << "__" << m_temporary_name_count; ss << prefix << m_temporary_name_count;
m_temporary_name_count++; m_temporary_name_count++;
return ss.str(); return ss.str();
......
...@@ -24,6 +24,6 @@ void ngraph::op::Divide::generate_adjoints(autodiff::Adjoints& adjoints, ...@@ -24,6 +24,6 @@ void ngraph::op::Divide::generate_adjoints(autodiff::Adjoints& adjoints,
auto x = get_input_op(0); auto x = get_input_op(0);
auto y = get_input_op(1); auto y = get_input_op(1);
adjoints.add_delta(x, delta * shared_from_this() / x); adjoints.add_delta(x, delta / y);
adjoints.add_delta(y, -delta * shared_from_this() / y); adjoints.add_delta(y, -delta * shared_from_this() / y);
} }
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
#include "ngraph/common.hpp" #include "ngraph/common.hpp"
#include "ngraph/node.hpp" #include "ngraph/node.hpp"
#include "ngraph/ops/util/op_annotations.hpp"
namespace ngraph namespace ngraph
{ {
...@@ -28,8 +29,21 @@ namespace ngraph ...@@ -28,8 +29,21 @@ namespace ngraph
/// Root of all actual ops /// Root of all actual ops
class Op : public Node class Op : public Node
{ {
public:
void set_op_annotations(std::shared_ptr<ngraph::op::util::OpAnnotations> op_annotations)
{
m_op_annotations = op_annotations;
}
std::shared_ptr<ngraph::op::util::OpAnnotations> get_op_annotations() const
{
return m_op_annotations;
}
protected: protected:
Op(const std::string& node_type, const Nodes& arguments); Op(const std::string& node_type, const Nodes& arguments);
private:
std::shared_ptr<ngraph::op::util::OpAnnotations> m_op_annotations;
}; };
} }
} }
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
namespace ngraph
{
namespace op
{
namespace util
{
/// \brief Abstract base class for annotations added to graph ops
class OpAnnotations
{
public:
OpAnnotations() {}
};
}
}
}
...@@ -15,13 +15,13 @@ ...@@ -15,13 +15,13 @@
*******************************************************************************/ *******************************************************************************/
#include <algorithm> #include <algorithm>
#include <iostream>
#include <unordered_set> #include <unordered_set>
#include "ngraph/pattern/core_fusion.hpp" #include "ngraph/pattern/core_fusion.hpp"
#include "ngraph/graph_util.hpp" #include "ngraph/graph_util.hpp"
#include "ngraph/log.hpp" #include "ngraph/log.hpp"
#include "ngraph/ops/broadcast.hpp"
#include "ngraph/ops/constant.hpp" #include "ngraph/ops/constant.hpp"
#include "ngraph/ops/maximum.hpp" #include "ngraph/ops/maximum.hpp"
#include "ngraph/ops/parameter.hpp" #include "ngraph/ops/parameter.hpp"
...@@ -51,7 +51,12 @@ void pass::CoreFusion::construct_relu_pattern() ...@@ -51,7 +51,12 @@ void pass::CoreFusion::construct_relu_pattern()
auto iconst0 = construct_constant_node(0); auto iconst0 = construct_constant_node(0);
auto val = make_shared<pattern::op::Label>(iconst0); auto val = make_shared<pattern::op::Label>(iconst0);
auto zero = make_shared<pattern::op::Label>(iconst0, nullptr, Nodes{iconst0}); auto zero = make_shared<pattern::op::Label>(iconst0, nullptr, Nodes{iconst0});
auto max = make_shared<op::Maximum>(zero, val);
auto broadcast_pred = [](std::shared_ptr<Node> n) {
return static_cast<bool>(std::dynamic_pointer_cast<op::Broadcast>(n));
};
auto skip_broadcast = std::make_shared<pattern::op::Any>(zero, broadcast_pred);
auto max = make_shared<op::Maximum>(skip_broadcast, val);
pattern::gr_callback_fn callback = [val, zero](pattern::Matcher& m) { pattern::gr_callback_fn callback = [val, zero](pattern::Matcher& m) {
NGRAPH_DEBUG << "In a callback for construct_relu_pattern against " NGRAPH_DEBUG << "In a callback for construct_relu_pattern against "
......
...@@ -87,6 +87,8 @@ ...@@ -87,6 +87,8 @@
#include "ngraph/ops/tanh.hpp" #include "ngraph/ops/tanh.hpp"
#include "ngraph/runtime/cpu/cpu_emitter.hpp" #include "ngraph/runtime/cpu/cpu_emitter.hpp"
#include "ngraph/runtime/cpu/cpu_kernel_emitters.hpp" #include "ngraph/runtime/cpu/cpu_kernel_emitters.hpp"
#include "ngraph/runtime/cpu/cpu_op_annotations.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#include "ngraph/runtime/cpu/ops/convert_layout.hpp" #include "ngraph/runtime/cpu/ops/convert_layout.hpp"
#include "ngraph/runtime/cpu/ops/matmul_bias.hpp" #include "ngraph/runtime/cpu/ops/matmul_bias.hpp"
#include "ngraph/types/element_type.hpp" #include "ngraph/types/element_type.hpp"
...@@ -116,32 +118,6 @@ static string eigen_matrix_format(const ngraph::Shape& shape, const ngraph::Stri ...@@ -116,32 +118,6 @@ static string eigen_matrix_format(const ngraph::Shape& shape, const ngraph::Stri
return ss.str(); return ss.str();
} }
// Mapping from POD types to MKLDNN data types
// An empty string implies the corresponding MKLDNN data type
// is not supported
static const unordered_map<string, const string> mkldnn_data_type_map{
{"char", "memory::data_type::s8"},
{"float", "memory::data_type::f32"},
{"double", ""},
{"int8_t", "memory::data_type::s8"},
{"int16_t", "memory::data_type::s16"},
{"int32_t", "memory::data_type::s32"},
{"int64_t", ""},
{"uint8_t", "memory::data_type::u8"},
{"uint16_t", ""},
{"uint32_t", ""},
{"uint64_t", ""}};
static const string& get_mkldnn_data_type(const string& type)
{
auto it = mkldnn_data_type_map.find(type);
if (it == mkldnn_data_type_map.end() || it->second.empty())
{
throw ngraph_error("No MKLDNN data type exists for the given element type");
}
return it->second;
}
void runtime::cpu::CPU_Emitter::emit_mkldnn_preamble(codegen::CodeWriter& writer) void runtime::cpu::CPU_Emitter::emit_mkldnn_preamble(codegen::CodeWriter& writer)
{ {
writer << "// MKLDNN Preamble\n"; writer << "// MKLDNN Preamble\n";
...@@ -340,7 +316,9 @@ namespace ngraph ...@@ -340,7 +316,9 @@ namespace ngraph
auto result_shape = out[0].get_shape(); auto result_shape = out[0].get_shape();
// get input element type // get input element type
const string& et = get_mkldnn_data_type(args[2].get_element_type().c_type_string()); const string& et = runtime::cpu::mkldnn_utils::get_mkldnn_data_type_string(
args[2].get_element_type());
writer << "{\n"; writer << "{\n";
writer.indent++; writer.indent++;
...@@ -2081,53 +2059,45 @@ namespace ngraph ...@@ -2081,53 +2059,45 @@ namespace ngraph
auto arg0_shape = args[0].get_shape(); auto arg0_shape = args[0].get_shape();
auto arg1_shape = args[1].get_shape(); auto arg1_shape = args[1].get_shape();
auto result_shape = out[0].get_shape(); auto result_shape = out[0].get_shape();
auto arg0_rank = arg0_shape.size();
auto arg1_rank = arg1_shape.size();
bool filter_dilated = false;
for (size_t s : convolution->get_window_dilation_strides())
{
filter_dilated = filter_dilated || (s != 1);
}
bool data_dilated = false;
for (size_t s : convolution->get_data_dilation_strides())
{
data_dilated = data_dilated || (s != 1);
}
if (!data_dilated && arg0_rank == 4 && arg1_rank == 4 &&
args[0].get_element_type() == element::f32)
{
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto input_data_desc = mkldnn_emitter->build_memory_descriptor(
args[0], mkldnn::memory::format::nchw);
auto weights_desc = mkldnn_emitter->build_memory_descriptor(
args[1], mkldnn::memory::format::oihw);
auto result_desc = mkldnn_emitter->build_memory_descriptor(
out[0], mkldnn::memory::format::nchw);
size_t conv_index = 0;
if (!filter_dilated) auto op_annotations =
{ static_cast<const ngraph::op::Op*>(node)->get_op_annotations();
conv_index = mkldnn_emitter->build_convolution_forward( if (op_annotations &&
input_data_desc, static_pointer_cast<ngraph::runtime::cpu::CPUOpAnnotations>(op_annotations)
weights_desc, ->is_mkldnn_op())
result_desc,
convolution->get_window_movement_strides(),
convolution->get_padding_below(),
convolution->get_padding_above());
}
else
{ {
// For dilation, MKLDNN wants to know how many elements to insert between, not how far // For dilation, MKLDNN wants to know how many elements to insert between, not how far
// apart to space the elements like nGraph. So we have to subtract 1 from each pos. // apart to space the elements like nGraph. So we have to subtract 1 from each pos.
Strides window_dilation_strides_adjusted; Strides window_dilation_strides_adjusted;
for (size_t s : convolution->get_window_dilation_strides()) for (size_t s : convolution->get_window_dilation_strides())
{ {
window_dilation_strides_adjusted.push_back(s - 1); window_dilation_strides_adjusted.push_back(s - 1);
} }
auto input_tvl = node->get_inputs()[0]
.get_output()
.get_tensor_view()
->get_tensor_view_layout();
auto weights_tvl = node->get_inputs()[1]
.get_output()
.get_tensor_view()
->get_tensor_view_layout();
auto output_tvl = node->get_output_tensor_view(0)->get_tensor_view_layout();
auto input_format = dynamic_cast<runtime::cpu::LayoutDescriptor&>(*input_tvl)
.get_mkldnn_format();
auto weights_format =
dynamic_cast<runtime::cpu::LayoutDescriptor&>(*weights_tvl)
.get_mkldnn_format();
auto output_format = dynamic_cast<runtime::cpu::LayoutDescriptor&>(*output_tvl)
.get_mkldnn_format();
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto input_data_desc =
mkldnn_emitter->build_memory_descriptor(args[0], input_format);
auto weights_desc =
mkldnn_emitter->build_memory_descriptor(args[1], weights_format);
auto result_desc =
mkldnn_emitter->build_memory_descriptor(out[0], output_format);
size_t conv_index = 0;
conv_index = mkldnn_emitter->build_convolution_forward( conv_index = mkldnn_emitter->build_convolution_forward(
input_data_desc, input_data_desc,
...@@ -2137,7 +2107,6 @@ namespace ngraph ...@@ -2137,7 +2107,6 @@ namespace ngraph
window_dilation_strides_adjusted, window_dilation_strides_adjusted,
convolution->get_padding_below(), convolution->get_padding_below(),
convolution->get_padding_above()); convolution->get_padding_above());
}
auto& deps = mkldnn_emitter->get_primitive_deps(conv_index); auto& deps = mkldnn_emitter->get_primitive_deps(conv_index);
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[0]) writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[0])
...@@ -2194,7 +2163,8 @@ namespace ngraph ...@@ -2194,7 +2163,8 @@ namespace ngraph
args[0].get_element_type() == element::f32) args[0].get_element_type() == element::f32)
{ {
const string& elem_type = const string& elem_type =
get_mkldnn_data_type(args[0].get_element_type().c_type_string()); runtime::cpu::mkldnn_utils::get_mkldnn_data_type_string(
args[0].get_element_type());
Strides window_dilation_strides_adjusted; Strides window_dilation_strides_adjusted;
for (size_t s : convolution->get_window_dilation_strides_forward()) for (size_t s : convolution->get_window_dilation_strides_forward())
...@@ -2304,7 +2274,8 @@ namespace ngraph ...@@ -2304,7 +2274,8 @@ namespace ngraph
args[0].get_element_type() == element::f32) args[0].get_element_type() == element::f32)
{ {
const string& elem_type = const string& elem_type =
get_mkldnn_data_type(args[0].get_element_type().c_type_string()); runtime::cpu::mkldnn_utils::get_mkldnn_data_type_string(
args[0].get_element_type());
Strides window_dilation_strides_adjusted; Strides window_dilation_strides_adjusted;
for (size_t s : convolution->get_window_dilation_strides_forward()) for (size_t s : convolution->get_window_dilation_strides_forward())
...@@ -2418,8 +2389,8 @@ namespace ngraph ...@@ -2418,8 +2389,8 @@ namespace ngraph
if (arg_rank == 4 && max_pool->get_window_shape().size() == 2 && if (arg_rank == 4 && max_pool->get_window_shape().size() == 2 &&
args[0].get_element_type() == element::f32) args[0].get_element_type() == element::f32)
{ {
const string& et = const string& et = runtime::cpu::mkldnn_utils::get_mkldnn_data_type_string(
get_mkldnn_data_type(args[0].get_element_type().c_type_string()); args[0].get_element_type());
writer << "{\n"; writer << "{\n";
writer.indent++; writer.indent++;
...@@ -2601,8 +2572,8 @@ namespace ngraph ...@@ -2601,8 +2572,8 @@ namespace ngraph
if (arg_rank == 4 && avg_pool->get_window_shape().size() == 2 && if (arg_rank == 4 && avg_pool->get_window_shape().size() == 2 &&
args[0].get_element_type() == element::f32) args[0].get_element_type() == element::f32)
{ {
const string& et = const string& et = runtime::cpu::mkldnn_utils::get_mkldnn_data_type_string(
get_mkldnn_data_type(args[0].get_element_type().c_type_string()); args[0].get_element_type());
writer << "{\n"; writer << "{\n";
writer.indent++; writer.indent++;
...@@ -2681,8 +2652,8 @@ namespace ngraph ...@@ -2681,8 +2652,8 @@ namespace ngraph
if (delta_rank == 4 && apb->get_window_shape().size() == 2 && if (delta_rank == 4 && apb->get_window_shape().size() == 2 &&
args[0].get_element_type() == element::f32) args[0].get_element_type() == element::f32)
{ {
const string& et = const string& et = runtime::cpu::mkldnn_utils::get_mkldnn_data_type_string(
get_mkldnn_data_type(args[0].get_element_type().c_type_string()); args[0].get_element_type());
writer << "{\n"; writer << "{\n";
writer.indent++; writer.indent++;
...@@ -2749,8 +2720,8 @@ namespace ngraph ...@@ -2749,8 +2720,8 @@ namespace ngraph
if (delta_rank == 4 && mpb->get_window_shape().size() == 2 && if (delta_rank == 4 && mpb->get_window_shape().size() == 2 &&
args[0].get_element_type() == element::f32) args[0].get_element_type() == element::f32)
{ {
const string& et = const string& et = runtime::cpu::mkldnn_utils::get_mkldnn_data_type_string(
get_mkldnn_data_type(args[1].get_element_type().c_type_string()); args[1].get_element_type());
writer << "{\n"; writer << "{\n";
writer.indent++; writer.indent++;
...@@ -3068,13 +3039,51 @@ namespace ngraph ...@@ -3068,13 +3039,51 @@ namespace ngraph
writer << "}\n"; writer << "}\n";
} }
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::runtime::cpu::op::ConvertLayout)
{
auto input_tvl =
node->get_inputs()[0].get_output().get_tensor_view()->get_tensor_view_layout();
auto output_tvl = node->get_output_tensor_view(0)->get_tensor_view_layout();
auto input_format =
dynamic_cast<runtime::cpu::LayoutDescriptor&>(*input_tvl).get_mkldnn_format();
auto output_format =
dynamic_cast<runtime::cpu::LayoutDescriptor&>(*output_tvl).get_mkldnn_format();
const string& et = runtime::cpu::mkldnn_utils::get_mkldnn_data_type_string(
args[0].get_element_type());
writer << "{\n";
writer.indent++;
writer << "engine cpu_engine = engine(engine::cpu, 0);\n";
writer << "memory::desc input_desc = memory::desc({" << join(args[0].get_shape())
<< "}, " << et << ", "
<< runtime::cpu::mkldnn_utils::get_mkldnn_format_string(input_format)
<< ");\n";
writer << "memory::desc output_desc = memory::desc({" << join(out[0].get_shape())
<< "}, " << et << ", "
<< runtime::cpu::mkldnn_utils::get_mkldnn_format_string(output_format)
<< ");\n";
writer << "memory input = memory({input_desc, cpu_engine}, " << args[0].get_name()
<< ");\n";
writer << "memory output = memory({output_desc, cpu_engine}, " << out[0].get_name()
<< ");\n";
writer << "reorder prim = reorder(input, output);\n";
writer << "stream s = stream(stream::kind::eager);\n"
<< "s.submit({prim}).wait();\n";
writer.indent--;
writer << "}\n";
}
template <> template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::ReluBackprop) void CPU_Emitter::EMITTER_DECL(ngraph::op::ReluBackprop)
{ {
const auto& arg_shape = args[0].get_shape(); const auto& arg_shape = args[0].get_shape();
const size_t arg_rank = arg_shape.size(); const size_t arg_rank = arg_shape.size();
const auto& result_shape = out[0].get_shape(); const auto& result_shape = out[0].get_shape();
const string& et = get_mkldnn_data_type(args[0].get_element_type().c_type_string()); const string& et = runtime::cpu::mkldnn_utils::get_mkldnn_data_type_string(
args[0].get_element_type());
if (arg_rank == 4 && args[0].get_element_type() == element::f32) if (arg_rank == 4 && args[0].get_element_type() == element::f32)
{ {
writer << "{\n"; writer << "{\n";
...@@ -3139,7 +3148,8 @@ namespace ngraph ...@@ -3139,7 +3148,8 @@ namespace ngraph
const auto& arg_shape = args[0].get_shape(); const auto& arg_shape = args[0].get_shape();
const size_t arg_rank = arg_shape.size(); const size_t arg_rank = arg_shape.size();
const auto& result_shape = out[0].get_shape(); const auto& result_shape = out[0].get_shape();
const string& et = get_mkldnn_data_type(args[0].get_element_type().c_type_string()); const string& et = runtime::cpu::mkldnn_utils::get_mkldnn_data_type_string(
args[0].get_element_type());
if (arg_rank == 4 && args[0].get_element_type() == element::f32) if (arg_rank == 4 && args[0].get_element_type() == element::f32)
{ {
writer << "{\n"; writer << "{\n";
......
...@@ -106,7 +106,9 @@ ...@@ -106,7 +106,9 @@
#include "ngraph/runtime/cpu/cpu_tensor_view.hpp" #include "ngraph/runtime/cpu/cpu_tensor_view.hpp"
#include "ngraph/runtime/cpu/cpu_tracing.hpp" #include "ngraph/runtime/cpu/cpu_tracing.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp" #include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#include "ngraph/runtime/cpu/ops/convert_layout.hpp"
#include "ngraph/runtime/cpu/ops/matmul_bias.hpp" #include "ngraph/runtime/cpu/ops/matmul_bias.hpp"
#include "ngraph/runtime/cpu/pass/cpu_assignment.hpp"
#include "ngraph/runtime/cpu/pass/cpu_fusion.hpp" #include "ngraph/runtime/cpu/pass/cpu_fusion.hpp"
#include "ngraph/runtime/cpu/pass/cpu_layout.hpp" #include "ngraph/runtime/cpu/pass/cpu_layout.hpp"
...@@ -218,6 +220,8 @@ static const runtime::cpu::OpMap dispatcher{ ...@@ -218,6 +220,8 @@ static const runtime::cpu::OpMap dispatcher{
&runtime::cpu::CPU_Emitter::emit<op::ConvolutionBackpropFilters>}, &runtime::cpu::CPU_Emitter::emit<op::ConvolutionBackpropFilters>},
{TI(ngraph::op::ConvolutionBackpropData), {TI(ngraph::op::ConvolutionBackpropData),
&runtime::cpu::CPU_Emitter::emit<op::ConvolutionBackpropData>}, &runtime::cpu::CPU_Emitter::emit<op::ConvolutionBackpropData>},
{TI(ngraph::runtime::cpu::op::ConvertLayout),
&runtime::cpu::CPU_Emitter::emit<runtime::cpu::op::ConvertLayout>},
{TI(ngraph::op::Not), &runtime::cpu::CPU_Emitter::emit<op::Not>}, {TI(ngraph::op::Not), &runtime::cpu::CPU_Emitter::emit<op::Not>},
{TI(ngraph::op::MaxPool), &runtime::cpu::CPU_Emitter::emit<op::MaxPool>}, {TI(ngraph::op::MaxPool), &runtime::cpu::CPU_Emitter::emit<op::MaxPool>},
{TI(ngraph::op::Reverse), &runtime::cpu::CPU_Emitter::emit<op::Reverse>}, {TI(ngraph::op::Reverse), &runtime::cpu::CPU_Emitter::emit<op::Reverse>},
...@@ -259,7 +263,8 @@ void runtime::cpu::CPU_ExternalFunction::compile() ...@@ -259,7 +263,8 @@ void runtime::cpu::CPU_ExternalFunction::compile()
pass_manager.register_pass<ngraph::pass::CoreFusion>(); pass_manager.register_pass<ngraph::pass::CoreFusion>();
pass_manager.register_pass<runtime::cpu::pass::CPUFusion>(); pass_manager.register_pass<runtime::cpu::pass::CPUFusion>();
pass_manager.register_pass<runtime::cpu::pass::CPULayout>(); pass_manager.register_pass<runtime::cpu::pass::CPUAssignment>(shared_from_this());
pass_manager.register_pass<runtime::cpu::pass::CPULayout>(shared_from_this());
pass_manager.register_pass<ngraph::pass::Liveness>(); pass_manager.register_pass<ngraph::pass::Liveness>();
pass_manager.register_pass<ngraph::pass::MemoryLayout>(s_memory_pool_alignment); pass_manager.register_pass<ngraph::pass::MemoryLayout>(s_memory_pool_alignment);
...@@ -279,13 +284,8 @@ void runtime::cpu::CPU_ExternalFunction::compile() ...@@ -279,13 +284,8 @@ void runtime::cpu::CPU_ExternalFunction::compile()
} }
writer += writer +=
R"(// Generated by the NGraph CPU backend R"(// Generated by the nGraph CPU backend
#include <cmath> #include <cmath>
)";
writer +=
R"(#include <Eigen/Dense>
#include "ngraph/except.hpp" #include "ngraph/except.hpp"
#include "ngraph/runtime/aligned_buffer.hpp" #include "ngraph/runtime/aligned_buffer.hpp"
#include "ngraph/runtime/cpu/cpu_eigen_utils.hpp" #include "ngraph/runtime/cpu/cpu_eigen_utils.hpp"
...@@ -417,7 +417,7 @@ using namespace ngraph::runtime; ...@@ -417,7 +417,7 @@ using namespace ngraph::runtime;
{ {
for (shared_ptr<Node> node : current_function->get_ordered_ops()) for (shared_ptr<Node> node : current_function->get_ordered_ops())
{ {
const op::Constant* c = dynamic_cast<op::Constant*>(node.get()); const ngraph::op::Constant* c = dynamic_cast<ngraph::op::Constant*>(node.get());
if (c) if (c)
{ {
shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view(); shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
...@@ -462,7 +462,14 @@ using namespace ngraph::runtime; ...@@ -462,7 +462,14 @@ using namespace ngraph::runtime;
{ {
continue; continue;
} }
Node& node = *op_list[i]; Node& node = *op_list[i];
auto handler = dispatcher.find(type_index(typeid(node)));
if (handler == dispatcher.end())
{
throw ngraph_error("Unhandled op during code generation : " + node.description());
}
string s = emit_op_as_function(node, "f"); string s = emit_op_as_function(node, "f");
node_cache.insert({&node, s}); node_cache.insert({&node, s});
} }
...@@ -509,7 +516,7 @@ using namespace ngraph::runtime; ...@@ -509,7 +516,7 @@ using namespace ngraph::runtime;
set<descriptor::TensorView*> constants; set<descriptor::TensorView*> constants;
for (shared_ptr<Node> node : current_function->get_ordered_ops()) for (shared_ptr<Node> node : current_function->get_ordered_ops())
{ {
if (dynamic_cast<op::Constant*>(node.get())) if (dynamic_cast<ngraph::op::Constant*>(node.get()))
{ {
shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view(); shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
constants.insert(tv.get()); constants.insert(tv.get());
...@@ -573,7 +580,7 @@ using namespace ngraph::runtime; ...@@ -573,7 +580,7 @@ using namespace ngraph::runtime;
// Add inputs to the variable name map // Add inputs to the variable name map
size_t arg_index = 0; size_t arg_index = 0;
for (shared_ptr<op::Parameter> param : current_function->get_parameters()) for (shared_ptr<ngraph::op::Parameter> param : current_function->get_parameters())
{ {
for (size_t i = 0; i < param->get_output_size(); ++i) for (size_t i = 0; i < param->get_output_size(); ++i)
{ {
...@@ -612,7 +619,7 @@ using namespace ngraph::runtime; ...@@ -612,7 +619,7 @@ using namespace ngraph::runtime;
shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view(); shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
const element::Type& et = tv->get_tensor_view_type()->get_element_type(); const element::Type& et = tv->get_tensor_view_type()->get_element_type();
bool parameter_as_output = false; bool parameter_as_output = false;
for (shared_ptr<op::Parameter> param : current_function->get_parameters()) for (shared_ptr<ngraph::op::Parameter> param : current_function->get_parameters())
{ {
for (const descriptor::Output& pout : param->get_outputs()) for (const descriptor::Output& pout : param->get_outputs())
{ {
...@@ -657,7 +664,8 @@ using namespace ngraph::runtime; ...@@ -657,7 +664,8 @@ using namespace ngraph::runtime;
throw ngraph_error("Unhandled op during code generation : " + node->description()); throw ngraph_error("Unhandled op during code generation : " + node->description());
} }
vector<TensorViewWrapper> in; vector<TensorViewWrapper> in;
vector<string> node_input_names, node_output_names; vector<string> node_input_names;
vector<string> node_output_names;
for (const descriptor::Input& input : node->get_inputs()) for (const descriptor::Input& input : node->get_inputs())
{ {
const descriptor::Output& output = input.get_output(); const descriptor::Output& output = input.get_output();
...@@ -702,19 +710,23 @@ using namespace ngraph::runtime; ...@@ -702,19 +710,23 @@ using namespace ngraph::runtime;
} }
} }
writer << "\n// " << node->get_name() << "(";
vector<string> parameter_nodes = node_input_names;
parameter_nodes.insert(
parameter_nodes.end(), node_output_names.begin(), node_output_names.end());
writer << join(parameter_nodes);
writer << ")\n";
// Emit operation body // Emit operation body
string func_name; string func_name;
auto it = match_functions.find(node.get()); auto it = match_functions.find(node.get());
if (it != match_functions.end()) if (it == match_functions.end())
{
func_name = it->second;
}
if (func_name.empty())
{ {
handler->second(this, writer, node.get(), in, out); handler->second(this, writer, node.get(), in, out);
} }
else else
{ {
func_name = it->second;
vector<string> names; vector<string> names;
for (const TensorViewWrapper& tv : in) for (const TensorViewWrapper& tv : in)
{ {
......
...@@ -30,9 +30,6 @@ string emit_bracketed_string(vector<T> data) ...@@ -30,9 +30,6 @@ string emit_bracketed_string(vector<T> data)
{ {
stringstream ss; stringstream ss;
if (data.size() == 0)
return "";
for (auto s : data) for (auto s : data)
{ {
ss << "[" << s << "]"; ss << "[" << s << "]";
...@@ -75,7 +72,7 @@ vector<string> ...@@ -75,7 +72,7 @@ vector<string>
vector<string> index_vars; vector<string> index_vars;
for (size_t i = 0; i < top.size(); i++) for (size_t i = 0; i < top.size(); i++)
{ {
string index_var = writer.generate_temporary_name("i"); string index_var = writer.generate_temporary_name("_i");
writer << runtime::cpu::kernel::start_index_loop(index_var, new_bottom[i], top[i], i == 0); writer << runtime::cpu::kernel::start_index_loop(index_var, new_bottom[i], top[i], i == 0);
writer.indent++; writer.indent++;
......
...@@ -149,7 +149,7 @@ string ngraph::runtime::cpu::kernel::end_index_loop(const string& index_var) ...@@ -149,7 +149,7 @@ string ngraph::runtime::cpu::kernel::end_index_loop(const string& index_var)
{ {
stringstream ss; stringstream ss;
ss << "} // end for(" << index_var << ")\n"; ss << "}\n";
return ss.str(); return ss.str();
} }
...@@ -209,7 +209,7 @@ void ngraph::runtime::cpu::kernel::emit_pointwise_copy(codegen::CodeWriter& writ ...@@ -209,7 +209,7 @@ void ngraph::runtime::cpu::kernel::emit_pointwise_copy(codegen::CodeWriter& writ
for (size_t i = 0; i < n_axes; i++) for (size_t i = 0; i < n_axes; i++)
{ {
string index_var = writer.generate_temporary_name("i"); string index_var = writer.generate_temporary_name("_j");
writer << start_index_loop(index_var, source_start_corner[i], source_end_corner[i], i == 0); writer << start_index_loop(index_var, source_start_corner[i], source_end_corner[i], i == 0);
writer.indent++; writer.indent++;
......
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include "ngraph/ops/util/op_annotations.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
/// \brief Annotations added to graph ops by CPU backend passes
class CPUOpAnnotations : public ngraph::op::util::OpAnnotations
{
public:
CPUOpAnnotations() { m_mkldnn_op = false; }
bool is_mkldnn_op() { return m_mkldnn_op; }
void set_mkldnn_op(bool val) { m_mkldnn_op = val; }
private:
bool m_mkldnn_op;
};
}
}
}
...@@ -22,8 +22,10 @@ ...@@ -22,8 +22,10 @@
#include "ngraph/descriptor/primary_tensor_view.hpp" #include "ngraph/descriptor/primary_tensor_view.hpp"
#include "ngraph/except.hpp" #include "ngraph/except.hpp"
#include "ngraph/runtime/cpu/cpu_layout_descriptor.hpp" #include "ngraph/runtime/cpu/cpu_layout_descriptor.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#include "ngraph/shape.hpp" #include "ngraph/shape.hpp"
using namespace mkldnn;
using namespace ngraph; using namespace ngraph;
using namespace std; using namespace std;
...@@ -101,8 +103,34 @@ void runtime::cpu::CPUTensorView::read(void* target, size_t tensor_offset, size_ ...@@ -101,8 +103,34 @@ void runtime::cpu::CPUTensorView::read(void* target, size_t tensor_offset, size_
{ {
throw out_of_range("read access past end of tensor"); throw out_of_range("read access past end of tensor");
} }
auto tvl = this->get_tensor_view_layout();
auto cpu_tvl = dynamic_cast<runtime::cpu::LayoutDescriptor*>(tvl.get());
if (cpu_tvl && cpu_tvl->get_mkldnn_format() != memory::format::format_undef &&
cpu_tvl->get_mkldnn_format() !=
runtime::cpu::mkldnn_utils::CreateNativeDataFormat(*cpu_tvl))
{
auto tensor_shape = this->get_shape();
auto input_format = cpu_tvl->get_mkldnn_format();
auto output_format = runtime::cpu::mkldnn_utils::CreateNativeDataFormat(*cpu_tvl);
memory::data_type et = runtime::cpu::mkldnn_utils::get_mkldnn_data_type(
this->get_descriptor()->get_tensor_view_type()->get_element_type());
engine cpu_engine{engine::cpu, 0};
memory::dims mkldnn_shape{tensor_shape.begin(), tensor_shape.end()};
memory::desc input_desc{mkldnn_shape, et, input_format};
memory::desc output_desc{mkldnn_shape, et, output_format};
memory input{{input_desc, cpu_engine}, aligned_buffer};
memory output{{output_desc, cpu_engine}, target};
reorder prim{input, output};
mkldnn::stream s(mkldnn::stream::kind::eager);
s.submit({prim}).wait();
}
else
{
const char* source = get_data_ptr(); const char* source = get_data_ptr();
memcpy(target, &source[tensor_offset], n); memcpy(target, &source[tensor_offset], n);
}
} }
size_t runtime::cpu::CPUTensorView::get_size() const size_t runtime::cpu::CPUTensorView::get_size() const
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
*******************************************************************************/ *******************************************************************************/
#include <memory> #include <memory>
#include <string>
#include "mkldnn_emitter.hpp" #include "mkldnn_emitter.hpp"
...@@ -45,7 +46,7 @@ mkldnn::memory::desc MKLDNNEmitter::build_memory_descriptor(const TensorViewWrap ...@@ -45,7 +46,7 @@ mkldnn::memory::desc MKLDNNEmitter::build_memory_descriptor(const TensorViewWrap
{ {
return mkldnn::memory::desc( return mkldnn::memory::desc(
mkldnn::memory::dims(tvw.get_shape().begin(), tvw.get_shape().end()), mkldnn::memory::dims(tvw.get_shape().begin(), tvw.get_shape().end()),
mkldnn_utils::GetDataType(tvw.get_element_type()), mkldnn_utils::get_mkldnn_data_type(tvw.get_element_type()),
fmt); fmt);
} }
......
...@@ -14,6 +14,8 @@ ...@@ -14,6 +14,8 @@
* limitations under the License. * limitations under the License.
*******************************************************************************/ *******************************************************************************/
#include <string>
#include <mkldnn.hpp> #include <mkldnn.hpp>
#include "mkldnn_invoke.hpp" #include "mkldnn_invoke.hpp"
......
...@@ -17,9 +17,9 @@ ...@@ -17,9 +17,9 @@
#include <string> #include <string>
#include <typeindex> #include <typeindex>
#include <typeinfo> #include <typeinfo>
#include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include "ngraph/types/element_type.hpp"
#include "ngraph/node.hpp" #include "ngraph/node.hpp"
#include "ngraph/ops/add.hpp" #include "ngraph/ops/add.hpp"
#include "ngraph/ops/avg_pool.hpp" #include "ngraph/ops/avg_pool.hpp"
...@@ -30,17 +30,12 @@ ...@@ -30,17 +30,12 @@
#include "mkldnn_utils.hpp" #include "mkldnn_utils.hpp"
namespace ngraph using namespace mkldnn;
{ using namespace ngraph;
namespace runtime
{
namespace cpu
{
namespace mkldnn_utils
{
#define TI(x) std::type_index(typeid(x)) #define TI(x) std::type_index(typeid(x))
static const std::unordered_set<std::type_index> s_op_registry{ static const std::unordered_set<std::type_index> s_op_registry{
TI(ngraph::op::Add), TI(ngraph::op::Add),
TI(ngraph::op::AvgPool), TI(ngraph::op::AvgPool),
TI(ngraph::op::AvgPoolBackprop), TI(ngraph::op::AvgPoolBackprop),
...@@ -53,36 +48,71 @@ namespace ngraph ...@@ -53,36 +48,71 @@ namespace ngraph
TI(ngraph::op::Relu), TI(ngraph::op::Relu),
TI(ngraph::op::ReluBackprop)}; TI(ngraph::op::ReluBackprop)};
static const std::unordered_map<std::string, const mkldnn::memory::data_type> // Mapping from POD types to MKLDNN data types
s_data_type_map{{"char", mkldnn::memory::data_type::s8}, static const std::map<element::Type, const mkldnn::memory::data_type> s_mkldnn_data_type_map{
{"float", mkldnn::memory::data_type::f32}, {element::boolean, mkldnn::memory::data_type::s8},
{"double", mkldnn::memory::data_type::data_undef}, {element::f32, mkldnn::memory::data_type::f32},
{"int8_t", mkldnn::memory::data_type::s8}, {element::f64, mkldnn::memory::data_type::data_undef},
{"int16_t", mkldnn::memory::data_type::s16}, {element::i8, mkldnn::memory::data_type::s8},
{"int32_t", mkldnn::memory::data_type::s32}, {element::i16, mkldnn::memory::data_type::s16},
{"int64_t", mkldnn::memory::data_type::data_undef}, {element::i32, mkldnn::memory::data_type::s32},
{"uint8_t", mkldnn::memory::data_type::u8}, {element::i64, mkldnn::memory::data_type::data_undef},
{"uint16_t", mkldnn::memory::data_type::data_undef}, {element::u8, mkldnn::memory::data_type::u8},
{"uint32_t", mkldnn::memory::data_type::data_undef}, {element::u16, mkldnn::memory::data_type::data_undef},
{"uint64_t", mkldnn::memory::data_type::data_undef}}; {element::u32, mkldnn::memory::data_type::data_undef},
{element::u64, mkldnn::memory::data_type::data_undef}};
mkldnn::memory::data_type GetDataType(const ngraph::element::Type& et) static const std::map<element::Type, const std::string> s_mkldnn_data_type_string_map{
{ {element::boolean, "mkldnn::memory::data_type::s8"},
auto it = s_data_type_map.find(et.c_type_string()); {element::f32, "mkldnn::memory::data_type::f32"},
if (it == s_data_type_map.end() || {element::f64, "mkldnn::memory::data_type::data_undef"},
it->second == mkldnn::memory::data_type::data_undef) {element::i8, "mkldnn::memory::data_type::s8"},
throw ngraph_error("No MKLDNN data type exists for the given element type"); {element::i16, "mkldnn::memory::data_type::s16"},
return it->second; {element::i32, "mkldnn::memory::data_type::s32"},
} {element::i64, "mkldnn::memory::data_type::data_undef"},
{element::u8, "mkldnn::memory::data_type::u8"},
{element::u16, "mkldnn::memory::data_type::data_undef"},
{element::u32, "mkldnn::memory::data_type::data_undef"},
{element::u64, "mkldnn::memory::data_type::data_undef"}};
bool IsMKLDNNOp(ngraph::Node& op) // TODO (jbobba): Add the rest of memory formats to this map as well
{ static const std::map<memory::format, const std::string> s_mkldnn_format_string_map{
{memory::format::format_undef, "memory::format::format_undef"},
{memory::format::any, "memory::format::any"},
{memory::format::blocked, "memory::format::blocked"},
{memory::format::x, "memory::format::x"},
{memory::format::nc, "memory::format::nc"},
{memory::format::nchw, "memory::format::nchw"},
{memory::format::nhwc, "memory::format::nhwc"},
{memory::format::chwn, "memory::format::chwn"},
{memory::format::nChw8c, "memory::format::nChw8c"},
{memory::format::nChw16c, "memory::format::nChw16c"},
{memory::format::oi, "memory::format::oi"},
{memory::format::io, "memory::format::io"},
{memory::format::oihw, "memory::format::oihw"},
{memory::format::ihwo, "memory::format::ihwo"},
{memory::format::hwio, "memory::format::hwio"},
{memory::format::oIhw8i, "memory::format::oIhw8i"},
{memory::format::oIhw16i, "memory::format::oIhw16i"},
{memory::format::OIhw8i8o, "memory::format::OIhw8i8o"},
{memory::format::OIhw16i16o, "memory::format::OIhw16i16o"},
{memory::format::OIhw8o8i, "memory::format::OIhw8o8i"},
{memory::format::OIhw16o16i, "memory::format::OIhw16o16i"},
{memory::format::Oihw8o, "memory::format::Oihw8o"},
{memory::format::Oihw16o, "memory::format::Oihw16o"},
{memory::format::Ohwi8o, "memory::format::Ohwi8o"},
{memory::format::Ohwi16o, "memory::format::Ohwi16o"},
{memory::format::OhIw16o4i, "memory::format::OhIw16o4i"},
};
bool runtime::cpu::mkldnn_utils::IsMKLDNNOp(ngraph::Node& op)
{
return (s_op_registry.find(TI(op)) != s_op_registry.end()); return (s_op_registry.find(TI(op)) != s_op_registry.end());
} }
mkldnn::memory::format mkldnn::memory::format runtime::cpu::mkldnn_utils::CreateNativeDataFormat(
CreateNativeDataFormat(const ngraph::runtime::cpu::LayoutDescriptor& layout) const ngraph::runtime::cpu::LayoutDescriptor& layout)
{ {
switch (layout.get_shape().size()) switch (layout.get_shape().size())
{ {
case 1: return mkldnn::memory::format::x; case 1: return mkldnn::memory::format::x;
...@@ -90,8 +120,31 @@ namespace ngraph ...@@ -90,8 +120,31 @@ namespace ngraph
case 4: return mkldnn::memory::format::nchw; case 4: return mkldnn::memory::format::nchw;
default: return mkldnn::memory::format::format_undef; default: return mkldnn::memory::format::format_undef;
} }
}
const std::string& runtime::cpu::mkldnn_utils::get_mkldnn_data_type_string(const ngraph::element::Type& type)
{
auto it = s_mkldnn_data_type_string_map.find(type);
if (it == s_mkldnn_data_type_string_map.end() || it->second.empty())
throw ngraph_error("No MKLDNN data type exists for the given element type");
return it->second;
}
mkldnn::memory::data_type runtime::cpu::mkldnn_utils::get_mkldnn_data_type(const ngraph::element::Type& type)
{
auto it = s_mkldnn_data_type_map.find(type);
if (it == s_mkldnn_data_type_map.end() || it->second == memory::data_type::data_undef)
{
throw ngraph_error("No MKLDNN data type exists for the given element type");
} }
} return it->second;
} }
}
const std::string& runtime::cpu::mkldnn_utils::get_mkldnn_format_string(memory::format fmt)
{
auto it = s_mkldnn_format_string_map.find(fmt);
if (it == s_mkldnn_format_string_map.end())
throw ngraph_error("No MKLDNN format exists for the given format type " +
std::to_string(fmt));
return it->second;
} }
...@@ -32,12 +32,12 @@ namespace ngraph ...@@ -32,12 +32,12 @@ namespace ngraph
{ {
extern mkldnn::engine global_cpu_engine; extern mkldnn::engine global_cpu_engine;
mkldnn::memory::data_type GetDataType(const ngraph::element::Type& et);
bool IsMKLDNNOp(ngraph::Node& op); bool IsMKLDNNOp(ngraph::Node& op);
mkldnn::memory::format mkldnn::memory::format
CreateNativeDataFormat(const ngraph::runtime::cpu::LayoutDescriptor& layout); CreateNativeDataFormat(const ngraph::runtime::cpu::LayoutDescriptor& layout);
const std::string& get_mkldnn_data_type_string(const ngraph::element::Type& type);
mkldnn::memory::data_type get_mkldnn_data_type(const ngraph::element::Type& type);
const std::string& get_mkldnn_format_string(mkldnn::memory::format fmt);
} }
} }
} }
......
...@@ -39,7 +39,7 @@ runtime::cpu::op::ConvertLayout::ConvertLayout( ...@@ -39,7 +39,7 @@ runtime::cpu::op::ConvertLayout::ConvertLayout(
if (!arg_layout) if (!arg_layout)
{ {
throw ngraph_error("Layout conversion input tensor is missing layout information"); //throw ngraph_error("Layout conversion input tensor is missing layout information");
} }
add_output(layout->get_element_type(), layout->get_shape()); add_output(layout->get_element_type(), layout->get_shape());
......
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "ngraph/runtime/cpu/pass/cpu_assignment.hpp"
#include <algorithm>
#include <cassert>
#include <memory>
#include <typeindex>
#include <typeinfo>
#include <unordered_map>
#include <mkldnn.hpp>
#include "ngraph/descriptor/output.hpp"
#include "ngraph/ops/convolution.hpp"
#include "ngraph/runtime/cpu/cpu_op_annotations.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
using namespace std;
using namespace ngraph;
namespace ngraph
{
    namespace runtime
    {
        namespace cpu
        {
            namespace pass
            {
                /// Assignment handler for Convolution.
                ///
                /// Marks the node as MKLDNN-dispatched (via CPUOpAnnotations) when
                /// MKLDNN can execute it directly: 4-D data and filter tensors,
                /// f32 element type, and no data dilation. Otherwise the node is
                /// left unannotated and falls back to the default emitter.
                template <>
                void CPUAssignment::ASSIGN_DECL(ngraph::op::Convolution)
                {
                    auto convolution = static_cast<op::Convolution*>(node);

                    const auto arg0_rank = node->get_input_shape(0).size();
                    const auto arg1_rank = node->get_input_shape(1).size();

                    // Any data-dilation stride != 1 disqualifies the op from MKLDNN.
                    const auto& data_dilation = convolution->get_data_dilation_strides();
                    const bool data_dilated = std::any_of(data_dilation.begin(),
                                                          data_dilation.end(),
                                                          [](size_t s) { return s != 1; });

                    if (!data_dilated && arg0_rank == 4 && arg1_rank == 4 &&
                        node->get_input_element_type(0) == element::f32)
                    {
                        auto op_annotations =
                            std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
                        op_annotations->set_mkldnn_op(true);
                        convolution->set_op_annotations(op_annotations);
                    }
                }
            }
        }
    }
}
// Shorthand: RTTI type of x as a std::type_index (map key).
#define TI(x) type_index(typeid(x))

// Dispatch table mapping an op's concrete type to the specialized
// assignment handler that decides whether it runs through MKLDNN.
// Ops absent from this table are skipped by run_on_call_graph.
static const runtime::cpu::pass::AssignOpMap s_dispatcher{
    {TI(ngraph::op::Convolution),
     &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Convolution>},
};
bool runtime::cpu::pass::CPUAssignment::run_on_call_graph(
const std::list<std::shared_ptr<Node>>& nodes)
{
for (const auto& node : nodes)
{
auto& n = *node;
auto handler = s_dispatcher.find(TI(n));
if (handler != s_dispatcher.end())
{
handler->second(m_external_function.get(), node.get());
}
}
return false;
}
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include "ngraph/pass/pass.hpp"
#include "ngraph/runtime/cpu/cpu_external_function.hpp"
#include "ngraph/runtime/cpu/cpu_tensor_view.hpp"
#define ASSIGN_DECL(op_name) \
assign<op_name>(ngraph::runtime::cpu::CPU_ExternalFunction * external_function, \
ngraph::Node * node)
namespace ngraph
{
    namespace runtime
    {
        namespace cpu
        {
            namespace pass
            {
                // Signature of a per-op assignment handler.
                using AssignFunction = std::function<void(CPU_ExternalFunction*, ngraph::Node*)>;
                // Dispatch table type: concrete op type -> handler.
                using AssignOpMap = std::unordered_map<std::type_index, AssignFunction>;

                /// Call-graph pass that decides, per op, how the CPU backend should
                /// execute it (e.g. via MKLDNN), recording the decision as an op
                /// annotation rather than modifying the graph.
                class CPUAssignment : public ngraph::pass::CallGraphPass
                {
                public:
                    CPUAssignment(std::shared_ptr<CPU_ExternalFunction> external_function)
                        : m_external_function(external_function)
                    {
                    }

                    /// Dispatches each node to its type-specific assign<OP> handler.
                    virtual bool
                        run_on_call_graph(const std::list<std::shared_ptr<Node>>& nodes) override;

                    /// Primary template: only explicitly specialized ops are
                    /// supported; any other instantiation throws at runtime.
                    template <typename OP>
                    static void
                        assign(ngraph::runtime::cpu::CPU_ExternalFunction* external_function,
                               ngraph::Node* node)
                    {
                        throw std::runtime_error("Unimplemented op in CPU assignment");
                    }

                private:
                    std::shared_ptr<CPU_ExternalFunction> m_external_function;
                };
            }
        }
    }
}
...@@ -17,21 +17,82 @@ ...@@ -17,21 +17,82 @@
#include <algorithm> #include <algorithm>
#include <memory> #include <memory>
#include <string> #include <string>
#include <typeindex>
#include <typeinfo>
#include <mkldnn.hpp> #include <mkldnn.hpp>
#include "cpu_layout.hpp" #include "cpu_layout.hpp"
#include "ngraph/descriptor/output.hpp" #include "ngraph/descriptor/output.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/ops/convolution.hpp"
#include "ngraph/ops/op.hpp"
#include "ngraph/runtime/cpu/cpu_layout_descriptor.hpp" #include "ngraph/runtime/cpu/cpu_layout_descriptor.hpp"
#include "ngraph/runtime/cpu/cpu_op_annotations.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp" #include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#include "ngraph/runtime/cpu/ops/convert_layout.hpp"
//using namespace ngraph::runtime::cpu::pass; using namespace std;
using namespace mkldnn;
using namespace ngraph; using namespace ngraph;
bool runtime::cpu::pass::CPULayout::run_on_call_graph(const std::list<std::shared_ptr<Node>>& nodes) void runtime::cpu::pass::CPULayout::set_default_layouts(
runtime::cpu::CPU_ExternalFunction* external_function, std::shared_ptr<Node> node)
{ {
for (const auto& node : nodes) std::vector<shared_ptr<Node>> new_args;
bool replace_node = false;
uint index = 0;
for (const descriptor::Input& input : node->get_inputs())
{
const auto& output = input.get_output();
auto tv = output.get_tensor_view();
auto tvt = tv->get_tensor_view_type();
auto rank = tvt->get_shape().size();
auto tvl = tv->get_tensor_view_layout();
auto cpu_tvl = dynamic_cast<runtime::cpu::LayoutDescriptor*>(tvl.get());
if (cpu_tvl && cpu_tvl->get_mkldnn_format() != memory::format::format_undef &&
cpu_tvl->get_mkldnn_format() !=
runtime::cpu::mkldnn_utils::CreateNativeDataFormat(*cpu_tvl))
{
auto native_axis_order =
ngraph::runtime::cpu::LayoutDescriptor::create_native_axis_order(rank);
auto layout =
std::make_shared<ngraph::runtime::cpu::LayoutDescriptor>(*tv, native_axis_order);
layout->set_mkldnn_format(runtime::cpu::mkldnn_utils::CreateNativeDataFormat(*cpu_tvl));
auto new_node = std::shared_ptr<Node>(
new runtime::cpu::op::ConvertLayout(output.get_node(), output.get_index(), layout));
new_args.push_back(new_node);
replace_node = true;
NGRAPH_DEBUG << "Inserted conversion node " << new_node->get_name() << " between "
<< output.get_node()->get_name()
<< "(layout: " << cpu_tvl->get_mkldnn_format() << ") and "
<< node->get_name() << "(layout: default)";
}
else
{
new_args.push_back(node->get_input_op(index));
}
index++;
}
shared_ptr<Node> new_node;
if (replace_node)
{
new_node = node->copy_with_new_args(new_args);
if (node->is_output())
{
external_function->get_function()->replace_node(node, new_node);
}
else
{ {
ngraph::replace_node(node, new_node);
}
NGRAPH_DEBUG << "Replaced " << node->get_name() << " with " << new_node->get_name();
auto old_op_annotations = static_pointer_cast<ngraph::op::Op>(node)->get_op_annotations();
static_pointer_cast<ngraph::op::Op>(new_node)->set_op_annotations(old_op_annotations);
node = new_node;
}
for (size_t i = 0; i < node->get_output_size(); ++i) for (size_t i = 0; i < node->get_output_size(); ++i)
{ {
auto tv = node->get_output_tensor_view(i); auto tv = node->get_output_tensor_view(i);
...@@ -41,7 +102,6 @@ bool runtime::cpu::pass::CPULayout::run_on_call_graph(const std::list<std::share ...@@ -41,7 +102,6 @@ bool runtime::cpu::pass::CPULayout::run_on_call_graph(const std::list<std::share
} }
auto tvt = tv->get_tensor_view_type(); auto tvt = tv->get_tensor_view_type();
auto& tensor = tv->get_tensor();
auto rank = tvt->get_shape().size(); auto rank = tvt->get_shape().size();
auto native_axis_order = auto native_axis_order =
...@@ -50,34 +110,201 @@ bool runtime::cpu::pass::CPULayout::run_on_call_graph(const std::list<std::share ...@@ -50,34 +110,201 @@ bool runtime::cpu::pass::CPULayout::run_on_call_graph(const std::list<std::share
auto layout = auto layout =
std::make_shared<ngraph::runtime::cpu::LayoutDescriptor>(*tv, native_axis_order); std::make_shared<ngraph::runtime::cpu::LayoutDescriptor>(*tv, native_axis_order);
if (tensor.is_output() || tensor.is_input() || tensor.is_constant())
{
// Set the MKLDNN format to native row-major variants // Set the MKLDNN format to native row-major variants
layout->set_mkldnn_format(mkldnn_utils::CreateNativeDataFormat(*layout)); layout->set_mkldnn_format(mkldnn_utils::CreateNativeDataFormat(*layout));
tv->set_tensor_view_layout(layout);
} }
else }
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace pass
{ {
if (ngraph::runtime::cpu::mkldnn_utils::IsMKLDNNOp(*node)) template <>
void CPULayout::LAYOUT_DECL(ngraph::op::Convolution)
{ {
// TODO(jmenon): get_inputs is marked as to-be-deprecated auto op_annotations =
// but get_input_ops isn't a suitable API so this needs to be static_pointer_cast<ngraph::op::Op>(node)->get_op_annotations();
// reworked if (op_annotations &&
static_pointer_cast<ngraph::runtime::cpu::CPUOpAnnotations>(op_annotations)
->is_mkldnn_op())
{
auto convolution = static_cast<const ngraph::op::Convolution*>(node.get());
auto arg0_shape = node->get_input_shape(0);
auto arg1_shape = node->get_input_shape(1);
auto result_shape = node->get_output_shape(0);
auto filter_strides = convolution->get_window_movement_strides();
auto padding_below = convolution->get_padding_below();
auto padding_above = convolution->get_padding_above();
Strides window_dilation_strides_adjusted;
for (size_t s : convolution->get_window_dilation_strides())
{
window_dilation_strides_adjusted.push_back(s - 1);
}
memory::data_type et = runtime::cpu::mkldnn_utils::get_mkldnn_data_type(
node->get_input_element_type(0));
engine cpu_engine(engine::cpu, 0);
memory::dims mkldnn_arg0_shape(arg0_shape.begin(), arg0_shape.end());
memory::dims mkldnn_arg1_shape(arg1_shape.begin(), arg1_shape.end());
memory::dims mkldnn_result_shape(result_shape.begin(), result_shape.end());
memory::dims mkldnn_filter_strides(filter_strides.begin(),
filter_strides.end());
memory::dims mkldnn_dilated_strides(
window_dilation_strides_adjusted.begin(),
window_dilation_strides_adjusted.end());
memory::dims mkldnn_padding_below(padding_below.begin(),
padding_below.end());
memory::dims mkldnn_padding_above(padding_above.begin(),
padding_above.end());
const memory::desc input_data_desc(
mkldnn_arg0_shape, et, memory::format::any);
const memory::desc weights_desc(mkldnn_arg1_shape, et, memory::format::any);
const memory::desc result_desc(
mkldnn_result_shape, et, memory::format::any);
convolution_forward::desc fwd_desc(prop_kind::forward,
algorithm::convolution_direct,
input_data_desc,
weights_desc,
result_desc,
mkldnn_filter_strides,
mkldnn_dilated_strides,
mkldnn_padding_below,
mkldnn_padding_above,
padding_kind::zero);
convolution_forward::primitive_desc prim_desc(fwd_desc, cpu_engine);
memory::format prim_input_formats[2];
memory::format prim_output_formats[1];
prim_input_formats[0] = static_cast<memory::format>(
prim_desc.src_primitive_desc().desc().data.format);
prim_output_formats[0] = static_cast<memory::format>(
prim_desc.dst_primitive_desc().desc().data.format);
prim_input_formats[1] = static_cast<memory::format>(
prim_desc.weights_primitive_desc().desc().data.format);
std::vector<shared_ptr<Node>> new_args;
bool replace_node = false;
uint index = 0;
for (const descriptor::Input& input : node->get_inputs()) for (const descriptor::Input& input : node->get_inputs())
{ {
const auto& output = input.get_output(); const auto& output = input.get_output();
auto output_tv = output.get_tensor_view(); auto tv = output.get_tensor_view();
auto output_tvl = output_tv->get_tensor_view_layout(); auto tvt = tv->get_tensor_view_type();
auto rank = tvt->get_shape().size();
// TODO(jmenon): Propagate layout based on inputs auto tvl = tv->get_tensor_view_layout();
// TODO(jmenon): Insert layout conversions when needed auto mkldnn_tvl =
dynamic_cast<runtime::cpu::LayoutDescriptor*>(tvl.get());
if (!mkldnn_tvl ||
mkldnn_tvl->get_mkldnn_format() != prim_input_formats[index])
{
auto native_axis_order = ngraph::runtime::cpu::LayoutDescriptor::
create_native_axis_order(rank);
auto layout =
std::make_shared<ngraph::runtime::cpu::LayoutDescriptor>(
*tv, native_axis_order);
layout->set_mkldnn_format(prim_input_formats[index]);
auto new_node =
std::shared_ptr<Node>(new runtime::cpu::op::ConvertLayout(
output.get_node(), output.get_index(), layout));
new_args.push_back(new_node);
replace_node = true;
NGRAPH_DEBUG << "Inserted conversion node " << new_node->get_name()
<< " between " << output.get_node()->get_name()
<< "(layout: " << mkldnn_tvl->get_mkldnn_format()
<< ") and " << node->get_name()
<< "(layout: " << prim_input_formats[index] << ")";
}
else
{
new_args.push_back(node->get_input_op(index));
} }
index++;
}
shared_ptr<Node> new_node;
if (replace_node)
{
new_node = node->copy_with_new_args(new_args);
if (node->is_output())
{
external_function->get_function()->replace_node(node, new_node);
} }
else else
{ {
layout->set_mkldnn_format(mkldnn::memory::format::format_undef); ngraph::replace_node(node, new_node);
}
NGRAPH_DEBUG << "Replaced " << node->get_name() << " with "
<< new_node->get_name();
auto old_op_annotations =
static_pointer_cast<ngraph::op::Op>(node)->get_op_annotations();
static_pointer_cast<ngraph::op::Op>(new_node)->set_op_annotations(
old_op_annotations);
node = new_node;
} }
// Set convolution output format
for (size_t i = 0; i < node->get_output_size(); ++i)
{
auto tv = node->get_output_tensor_view(i);
auto tvt = tv->get_tensor_view_type();
auto rank = tvt->get_shape().size();
auto tvl = tv->get_tensor_view_layout();
if (tvl)
{
throw ngraph_error("Convolution output layout already set");
} }
auto native_axis_order =
ngraph::runtime::cpu::LayoutDescriptor::create_native_axis_order(
rank);
auto layout = std::make_shared<ngraph::runtime::cpu::LayoutDescriptor>(
*tv, native_axis_order);
layout->set_mkldnn_format(prim_output_formats[i]);
tv->set_tensor_view_layout(layout); tv->set_tensor_view_layout(layout);
NGRAPH_DEBUG << "Setting Node: " << node->get_name()
<< " output layout: " << prim_output_formats[i] << endl;
}
}
else
{
set_default_layouts(external_function, node);
}
}
}
}
}
}
#define TI(x) type_index(typeid(x))
static const runtime::cpu::pass::LayoutOpMap s_dispatcher{
{TI(ngraph::op::Convolution), &runtime::cpu::pass::CPULayout::layout<ngraph::op::Convolution>},
};
bool runtime::cpu::pass::CPULayout::run_on_call_graph(const std::list<std::shared_ptr<Node>>& nodes)
{
for (const auto& node : nodes)
{
auto& n = *node;
auto handler = s_dispatcher.find(TI(n));
if (handler != s_dispatcher.end())
{
handler->second(m_external_function.get(), node);
}
else
{
set_default_layouts(m_external_function.get(), node);
} }
} }
......
...@@ -17,6 +17,11 @@ ...@@ -17,6 +17,11 @@
#pragma once #pragma once
#include "ngraph/pass/pass.hpp" #include "ngraph/pass/pass.hpp"
#include "ngraph/runtime/cpu/cpu_external_function.hpp"
#define LAYOUT_DECL(op_type) \
layout<op_type>(ngraph::runtime::cpu::CPU_ExternalFunction * external_function, \
std::shared_ptr<ngraph::Node> node)
namespace ngraph namespace ngraph
{ {
...@@ -26,11 +31,30 @@ namespace ngraph ...@@ -26,11 +31,30 @@ namespace ngraph
{ {
namespace pass namespace pass
{ {
using LayoutFunction =
std::function<void(CPU_ExternalFunction*, std::shared_ptr<ngraph::Node>)>;
using LayoutOpMap = std::unordered_map<std::type_index, LayoutFunction>;
class CPULayout : public ngraph::pass::CallGraphPass class CPULayout : public ngraph::pass::CallGraphPass
{ {
public: public:
CPULayout(std::shared_ptr<CPU_ExternalFunction> external_function)
: m_external_function(external_function)
{
}
virtual bool virtual bool
run_on_call_graph(const std::list<std::shared_ptr<Node>>& nodes) override; run_on_call_graph(const std::list<std::shared_ptr<Node>>& nodes) override;
template <typename OP>
static void
layout(ngraph::runtime::cpu::CPU_ExternalFunction* external_function,
std::shared_ptr<ngraph::Node> node);
private:
std::shared_ptr<CPU_ExternalFunction> m_external_function;
static void set_default_layouts(CPU_ExternalFunction* external_function,
std::shared_ptr<Node> node);
}; };
} }
} }
......
...@@ -46,6 +46,7 @@ namespace ngraph ...@@ -46,6 +46,7 @@ namespace ngraph
virtual ~ExternalFunction() {} virtual ~ExternalFunction() {}
virtual std::shared_ptr<CallFrame> make_call_frame() = 0; virtual std::shared_ptr<CallFrame> make_call_frame() = 0;
const std::shared_ptr<ngraph::Function> get_function() { return m_function; }
protected: protected:
std::shared_ptr<ngraph::Function> m_function; std::shared_ptr<ngraph::Function> m_function;
bool m_release_function; bool m_release_function;
......
...@@ -555,6 +555,53 @@ TEST(${BACKEND_NAME}, divide) ...@@ -555,6 +555,53 @@ TEST(${BACKEND_NAME}, divide)
EXPECT_EQ((vector<float>{2, 2, 2, 2}), read_vector<float>(result)); EXPECT_EQ((vector<float>{2, 2, 2, 2}), read_vector<float>(result));
} }
// Checks numerical stability of the autodiff adjoints for Divide.
// For Y = A / B with output adjoint C: dY/dA = C / B and dY/dB = -C * A / B^2.
// The expected -0.0 entries (where A == 0) pin down both the sign and the
// finiteness of the result — an unstable formulation could yield NaN here.
TEST(${BACKEND_NAME}, divide_adjoint_stability)
{
    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
    auto manager = runtime::Manager::get("${BACKEND_NAME}");
    auto backend = manager->allocate_backend();

    Shape shape{2, 2};

    // Build the backprop function: outputs dY/dA and dY/dB, taking A, B and
    // the output adjoint C as parameters.
    auto make_external = [&]() {
        auto A = make_shared<op::Parameter>(element::f32, shape);
        auto B = make_shared<op::Parameter>(element::f32, shape);
        auto f = make_shared<Function>(make_shared<op::Divide>(A, B), op::Parameters{A, B});

        auto Y_out = f->get_output_op(0);
        auto Xs = f->get_parameters();
        auto C = std::make_shared<op::Parameter>(Y_out->get_element_type(), Y_out->get_shape());
        // One backprop output per input parameter.
        std::vector<std::shared_ptr<Node>> dYdXs(Xs.size());
        transform(Xs.begin(), Xs.end(), dYdXs.begin(), [C, Y_out](const std::shared_ptr<Node>& X) {
            return Y_out->backprop_node(X, C);
        });
        std::vector<std::shared_ptr<op::Parameter>> params(Xs);
        params.push_back(C);

        auto bf = std::make_shared<Function>(dYdXs, params);
        auto external = manager->compile(bf);
        return external;
    };

    auto cf = backend->make_call_frame(make_external());

    // Create some tensors for input/output
    auto a = backend->make_primary_tensor_view(element::f32, shape);
    copy_data(a, vector<float>{0, 0, 1, 1});
    auto b = backend->make_primary_tensor_view(element::f32, shape);
    copy_data(b, vector<float>{2, 2, 2, 2});
    auto c = backend->make_primary_tensor_view(element::f32, shape);
    copy_data(c, vector<float>{1, 1, 1, 1});

    auto resulta = backend->make_primary_tensor_view(element::f32, shape);
    auto resultb = backend->make_primary_tensor_view(element::f32, shape);

    cf->call({a, b, c}, {resulta, resultb});
    // dY/dA = C / B = 0.5 everywhere.
    EXPECT_EQ((vector<float>{0.5, 0.5, 0.5, 0.5}), read_vector<float>(resulta));
    // dY/dB = -C * A / B^2: -0.0 (not NaN) where A == 0, -0.25 where A == 1.
    EXPECT_EQ((vector<float>{-0.0, -0.0, -0.25, -0.25}), read_vector<float>(resultb));
}
TEST(${BACKEND_NAME}, divide_by_zero_float32) TEST(${BACKEND_NAME}, divide_by_zero_float32)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}"); SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
...@@ -6009,6 +6056,42 @@ TEST(${BACKEND_NAME}, convolution_outlining) ...@@ -6009,6 +6056,42 @@ TEST(${BACKEND_NAME}, convolution_outlining)
EXPECT_EQ(vector<float>{expected_result}, read_vector<float>(result)); EXPECT_EQ(vector<float>{expected_result}, read_vector<float>(result));
} }
// 1x1 convolution, 16 input channels -> 32 output channels over a 2x2 image.
// With all-ones data and weights each output element is the sum of 16 ones,
// exercising the backend's layout assignment/conversion path end to end.
TEST(${BACKEND_NAME}, convolution_layout)
{
    Shape data_shape{1, 16, 2, 2};
    Shape filter_shape{32, 16, 1, 1};
    Shape out_shape{1, 32, 2, 2};
    auto data = make_shared<op::Parameter>(element::f32, data_shape);
    auto filters = make_shared<op::Parameter>(element::f32, filter_shape);
    auto conv = make_shared<op::Convolution>(data,
                                             filters,
                                             Strides{1, 1},
                                             Strides{1, 1},
                                             CoordinateDiff{0, 0},
                                             CoordinateDiff{0, 0},
                                             Strides{1, 1});
    auto f = make_shared<Function>(conv, op::Parameters{data, filters});

    auto manager = runtime::Manager::get("${BACKEND_NAME}");
    auto backend = manager->allocate_backend();
    auto cf = backend->make_call_frame(manager->compile(f));

    // Input/output tensors: 64 data elements, 512 weights, 128 results.
    auto a = backend->make_primary_tensor_view(element::f32, data_shape);
    copy_data(a, vector<float>(64, 1.0f));
    auto b = backend->make_primary_tensor_view(element::f32, filter_shape);
    copy_data(b, vector<float>(512, 1.0f));
    auto result = backend->make_primary_tensor_view(element::f32, out_shape);

    cf->call({a, b}, {result});
    EXPECT_EQ(vector<float>(128, 16.0f), read_vector<float>(result));
}
TEST(${BACKEND_NAME}, avg_pool_1d_1channel_1image) TEST(${BACKEND_NAME}, avg_pool_1d_1channel_1image)
{ {
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}"); SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
......
...@@ -288,3 +288,23 @@ TEST(cpu_fusion, fuse_fprop_bn) ...@@ -288,3 +288,23 @@ TEST(cpu_fusion, fuse_fprop_bn)
size_t ccg = count_ops_of_type<op::BatchNorm>(func); size_t ccg = count_ops_of_type<op::BatchNorm>(func);
ASSERT_EQ(ccg, 1); ASSERT_EQ(ccg, 1);
} }
// Minimal op type with no CPU-backend handler registered for it: its RTTI
// type differs from Abs, so type-index dispatch tables will not find it.
class UnhandledOp : public ngraph::op::Abs
{
public:
    // explicit: a test helper should never be created by implicit
    // conversion from a Node pointer.
    explicit UnhandledOp(const std::shared_ptr<Node>& arg)
        : Abs(arg)
    {
    }
};
// Compiling a graph containing an op the CPU backend does not implement
// must surface an ngraph_error rather than silently misbehaving.
TEST(cpu_fusion, unhandled_op)
{
    auto param = make_shared<op::Parameter>(element::f32, Shape{});
    auto func = make_shared<Function>(make_shared<UnhandledOp>(param), op::Parameters{param});

    auto manager = runtime::Manager::get("CPU");
    auto backend = manager->allocate_backend();
    auto compiled = manager->compile(func);
    ASSERT_THROW(backend->make_call_frame(compiled), ngraph_error);
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment