Commit eabfebe5 authored by pthoreho

Merge remote-tracking branch 'origin/master' into pruthvi/mkldnn_elementwise_add

parents 16a18d29 d0f8dff2
@@ -183,6 +183,7 @@ if (NGRAPH_CPU_ENABLE AND LLVM_INCLUDE_DIR AND
    runtime/cpu/mkldnn_utils.cpp
    runtime/cpu/ops/convert_layout.cpp
    runtime/cpu/ops/matmul_bias.cpp
+   runtime/cpu/pass/cpu_assignment.cpp
    runtime/cpu/pass/cpu_fusion.cpp
    runtime/cpu/pass/cpu_layout.cpp
)
...
@@ -14,8 +14,10 @@
 * limitations under the License.
 *******************************************************************************/
-#include "ngraph/builder/reduce_ops.hpp"
+#include <numeric>
#include "ngraph/builder/autobroadcast.hpp"
+#include "ngraph/builder/reduce_ops.hpp"
#include "ngraph/ops/add.hpp"
#include "ngraph/ops/divide.hpp"
#include "ngraph/ops/multiply.hpp"
...
@@ -40,7 +40,7 @@ std::string codegen::CodeWriter::generate_temporary_name(std::string prefix)
{
    std::stringstream ss;
-   ss << prefix << "__" << m_temporary_name_count;
+   ss << prefix << m_temporary_name_count;
    m_temporary_name_count++;
    return ss.str();
...
@@ -24,6 +24,6 @@ void ngraph::op::Divide::generate_adjoints(autodiff::Adjoints& adjoints,
    auto x = get_input_op(0);
    auto y = get_input_op(1);

-   adjoints.add_delta(x, delta * shared_from_this() / x);
+   adjoints.add_delta(x, delta / y);
    adjoints.add_delta(y, -delta * shared_from_this() / y);
}
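Reviewer note, not part of the patch: the rewritten x-adjoint is a numerical-stability fix. For z = x / y the exact partials are

    dz/dx = 1/y
    dz/dy = -x/y^2 = -z/y

so delta / y computes the x-adjoint directly, while the old expression delta * z / x = delta * (x/y) / x turns into 0/0 whenever x is 0, even though the true derivative 1/y is finite. The divide_adjoint_stability test added later in this diff exercises exactly that case.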
@@ -20,6 +20,7 @@
#include "ngraph/common.hpp"
#include "ngraph/node.hpp"
+#include "ngraph/ops/util/op_annotations.hpp"

namespace ngraph
{
@@ -28,8 +29,21 @@ namespace ngraph
        /// Root of all actual ops
        class Op : public Node
        {
+       public:
+           void set_op_annotations(std::shared_ptr<ngraph::op::util::OpAnnotations> op_annotations)
+           {
+               m_op_annotations = op_annotations;
+           }
+
+           std::shared_ptr<ngraph::op::util::OpAnnotations> get_op_annotations() const
+           {
+               return m_op_annotations;
+           }
+
        protected:
            Op(const std::string& node_type, const Nodes& arguments);
+
+       private:
+           std::shared_ptr<ngraph::op::util::OpAnnotations> m_op_annotations;
        };
    }
}
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
namespace ngraph
{
    namespace op
    {
        namespace util
        {
            /// \brief Abstract base class for annotations added to graph ops
            class OpAnnotations
            {
            public:
                OpAnnotations() {}
            };
        }
    }
}
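For context, a sketch (not part of the patch) of how the new annotation hooks are meant to be used together with the CPU-specific subclass added later in this diff (cpu_op_annotations.hpp). The two helper functions below are hypothetical; only set_op_annotations/get_op_annotations and CPUOpAnnotations come from the patch:

// Hypothetical helpers illustrating the intended flow.
// A backend pass tags an op that MKLDNN should handle:
void tag_as_mkldnn(const std::shared_ptr<ngraph::op::Op>& op)
{
    auto annotations = std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
    annotations->set_mkldnn_op(true);
    op->set_op_annotations(annotations);
}

// Later stages (layout pass, emitter) read the tag back through the base-class pointer:
bool is_tagged_mkldnn(const std::shared_ptr<ngraph::op::Op>& op)
{
    auto annotations = std::dynamic_pointer_cast<ngraph::runtime::cpu::CPUOpAnnotations>(
        op->get_op_annotations());
    return annotations && annotations->is_mkldnn_op();
}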
@@ -15,13 +15,13 @@
 *******************************************************************************/
#include <algorithm>
-#include <iostream>
#include <unordered_set>

#include "ngraph/pattern/core_fusion.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/log.hpp"
+#include "ngraph/ops/broadcast.hpp"
#include "ngraph/ops/constant.hpp"
#include "ngraph/ops/maximum.hpp"
#include "ngraph/ops/parameter.hpp"

@@ -51,7 +51,12 @@ void pass::CoreFusion::construct_relu_pattern()
    auto iconst0 = construct_constant_node(0);
    auto val = make_shared<pattern::op::Label>(iconst0);
    auto zero = make_shared<pattern::op::Label>(iconst0, nullptr, Nodes{iconst0});
-   auto max = make_shared<op::Maximum>(zero, val);
+
+   auto broadcast_pred = [](std::shared_ptr<Node> n) {
+       return static_cast<bool>(std::dynamic_pointer_cast<op::Broadcast>(n));
+   };
+   auto skip_broadcast = std::make_shared<pattern::op::Any>(zero, broadcast_pred);
+   auto max = make_shared<op::Maximum>(skip_broadcast, val);

    pattern::gr_callback_fn callback = [val, zero](pattern::Matcher& m) {
        NGRAPH_DEBUG << "In a callback for construct_relu_pattern against "
...
[collapsed file diff not shown]
@@ -106,7 +106,9 @@
#include "ngraph/runtime/cpu/cpu_tensor_view.hpp"
#include "ngraph/runtime/cpu/cpu_tracing.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
+#include "ngraph/runtime/cpu/ops/convert_layout.hpp"
#include "ngraph/runtime/cpu/ops/matmul_bias.hpp"
+#include "ngraph/runtime/cpu/pass/cpu_assignment.hpp"
#include "ngraph/runtime/cpu/pass/cpu_fusion.hpp"
#include "ngraph/runtime/cpu/pass/cpu_layout.hpp"

@@ -218,6 +220,8 @@ static const runtime::cpu::OpMap dispatcher{
     &runtime::cpu::CPU_Emitter::emit<op::ConvolutionBackpropFilters>},
    {TI(ngraph::op::ConvolutionBackpropData),
     &runtime::cpu::CPU_Emitter::emit<op::ConvolutionBackpropData>},
+   {TI(ngraph::runtime::cpu::op::ConvertLayout),
+    &runtime::cpu::CPU_Emitter::emit<runtime::cpu::op::ConvertLayout>},
    {TI(ngraph::op::Not), &runtime::cpu::CPU_Emitter::emit<op::Not>},
    {TI(ngraph::op::MaxPool), &runtime::cpu::CPU_Emitter::emit<op::MaxPool>},
    {TI(ngraph::op::Reverse), &runtime::cpu::CPU_Emitter::emit<op::Reverse>},

@@ -259,7 +263,8 @@ void runtime::cpu::CPU_ExternalFunction::compile()
    pass_manager.register_pass<ngraph::pass::CoreFusion>();
    pass_manager.register_pass<runtime::cpu::pass::CPUFusion>();
-   pass_manager.register_pass<runtime::cpu::pass::CPULayout>();
+   pass_manager.register_pass<runtime::cpu::pass::CPUAssignment>(shared_from_this());
+   pass_manager.register_pass<runtime::cpu::pass::CPULayout>(shared_from_this());
    pass_manager.register_pass<ngraph::pass::Liveness>();
    pass_manager.register_pass<ngraph::pass::MemoryLayout>(s_memory_pool_alignment);

@@ -279,13 +284,8 @@
    }

    writer +=
-       R"(// Generated by the NGraph CPU backend
+       R"(// Generated by the nGraph CPU backend
#include <cmath>
-)";
-
-   writer +=
-       R"(#include <Eigen/Dense>
#include "ngraph/except.hpp"
#include "ngraph/runtime/aligned_buffer.hpp"
#include "ngraph/runtime/cpu/cpu_eigen_utils.hpp"

@@ -417,7 +417,7 @@ using namespace ngraph::runtime;
    {
        for (shared_ptr<Node> node : current_function->get_ordered_ops())
        {
-           const op::Constant* c = dynamic_cast<op::Constant*>(node.get());
+           const ngraph::op::Constant* c = dynamic_cast<ngraph::op::Constant*>(node.get());
            if (c)
            {
                shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();

@@ -462,7 +462,14 @@ using namespace ngraph::runtime;
            {
                continue;
            }
            Node& node = *op_list[i];
+           auto handler = dispatcher.find(type_index(typeid(node)));
+           if (handler == dispatcher.end())
+           {
+               throw ngraph_error("Unhandled op during code generation : " + node.description());
+           }
            string s = emit_op_as_function(node, "f");
            node_cache.insert({&node, s});
        }

@@ -509,7 +516,7 @@ using namespace ngraph::runtime;
        set<descriptor::TensorView*> constants;
        for (shared_ptr<Node> node : current_function->get_ordered_ops())
        {
-           if (dynamic_cast<op::Constant*>(node.get()))
+           if (dynamic_cast<ngraph::op::Constant*>(node.get()))
            {
                shared_ptr<descriptor::TensorView> tv = node->get_outputs()[0].get_tensor_view();
                constants.insert(tv.get());

@@ -573,7 +580,7 @@ using namespace ngraph::runtime;
        // Add inputs to the variable name map
        size_t arg_index = 0;
-       for (shared_ptr<op::Parameter> param : current_function->get_parameters())
+       for (shared_ptr<ngraph::op::Parameter> param : current_function->get_parameters())
        {
            for (size_t i = 0; i < param->get_output_size(); ++i)
            {

@@ -612,7 +619,7 @@ using namespace ngraph::runtime;
            shared_ptr<descriptor::TensorView> tv = op->get_output_tensor_view();
            const element::Type& et = tv->get_tensor_view_type()->get_element_type();
            bool parameter_as_output = false;
-           for (shared_ptr<op::Parameter> param : current_function->get_parameters())
+           for (shared_ptr<ngraph::op::Parameter> param : current_function->get_parameters())
            {
                for (const descriptor::Output& pout : param->get_outputs())
                {

@@ -657,7 +664,8 @@ using namespace ngraph::runtime;
                throw ngraph_error("Unhandled op during code generation : " + node->description());
            }
            vector<TensorViewWrapper> in;
-           vector<string> node_input_names, node_output_names;
+           vector<string> node_input_names;
+           vector<string> node_output_names;
            for (const descriptor::Input& input : node->get_inputs())
            {
                const descriptor::Output& output = input.get_output();

@@ -702,19 +710,23 @@ using namespace ngraph::runtime;
                }
            }

+           writer << "\n// " << node->get_name() << "(";
+           vector<string> parameter_nodes = node_input_names;
+           parameter_nodes.insert(
+               parameter_nodes.end(), node_output_names.begin(), node_output_names.end());
+           writer << join(parameter_nodes);
+           writer << ")\n";
+
            // Emit operation body
            string func_name;
            auto it = match_functions.find(node.get());
-           if (it != match_functions.end())
-           {
-               func_name = it->second;
-           }
-           if (func_name.empty())
+           if (it == match_functions.end())
            {
                handler->second(this, writer, node.get(), in, out);
            }
            else
            {
+               func_name = it->second;
                vector<string> names;
                for (const TensorViewWrapper& tv : in)
                {
...
@@ -30,9 +30,6 @@ string emit_bracketed_string(vector<T> data)
{
    stringstream ss;
-   if (data.size() == 0)
-       return "";
-
    for (auto s : data)
    {
        ss << "[" << s << "]";

@@ -75,7 +72,7 @@ vector<string>
    vector<string> index_vars;
    for (size_t i = 0; i < top.size(); i++)
    {
-       string index_var = writer.generate_temporary_name("i");
+       string index_var = writer.generate_temporary_name("_i");
        writer << runtime::cpu::kernel::start_index_loop(index_var, new_bottom[i], top[i], i == 0);
        writer.indent++;
...
@@ -149,7 +149,7 @@ string ngraph::runtime::cpu::kernel::end_index_loop(const string& index_var)
{
    stringstream ss;
-   ss << "} // end for(" << index_var << ")\n";
+   ss << "}\n";
    return ss.str();
}

@@ -209,7 +209,7 @@ void ngraph::runtime::cpu::kernel::emit_pointwise_copy(codegen::CodeWriter& writ
    for (size_t i = 0; i < n_axes; i++)
    {
-       string index_var = writer.generate_temporary_name("i");
+       string index_var = writer.generate_temporary_name("_j");
        writer << start_index_loop(index_var, source_start_corner[i], source_end_corner[i], i == 0);
        writer.indent++;
...
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include "ngraph/ops/util/op_annotations.hpp"
namespace ngraph
{
    namespace runtime
    {
        namespace cpu
        {
            /// \brief Annotations added to graph ops by CPU backend passes
            class CPUOpAnnotations : public ngraph::op::util::OpAnnotations
            {
            public:
                CPUOpAnnotations() { m_mkldnn_op = false; }
                bool is_mkldnn_op() { return m_mkldnn_op; }
                void set_mkldnn_op(bool val) { m_mkldnn_op = val; }
            private:
                bool m_mkldnn_op;
            };
        }
    }
}
@@ -22,8 +22,10 @@
#include "ngraph/descriptor/primary_tensor_view.hpp"
#include "ngraph/except.hpp"
#include "ngraph/runtime/cpu/cpu_layout_descriptor.hpp"
+#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#include "ngraph/shape.hpp"

+using namespace mkldnn;
using namespace ngraph;
using namespace std;

@@ -101,8 +103,34 @@ void runtime::cpu::CPUTensorView::read(void* target, size_t tensor_offset, size_
    {
        throw out_of_range("read access past end of tensor");
    }
+
+   auto tvl = this->get_tensor_view_layout();
+   auto cpu_tvl = dynamic_cast<runtime::cpu::LayoutDescriptor*>(tvl.get());
+   if (cpu_tvl && cpu_tvl->get_mkldnn_format() != memory::format::format_undef &&
+       cpu_tvl->get_mkldnn_format() !=
+           runtime::cpu::mkldnn_utils::CreateNativeDataFormat(*cpu_tvl))
+   {
+       auto tensor_shape = this->get_shape();
+       auto input_format = cpu_tvl->get_mkldnn_format();
+       auto output_format = runtime::cpu::mkldnn_utils::CreateNativeDataFormat(*cpu_tvl);
+       memory::data_type et = runtime::cpu::mkldnn_utils::get_mkldnn_data_type(
+           this->get_descriptor()->get_tensor_view_type()->get_element_type());
+       engine cpu_engine{engine::cpu, 0};
+       memory::dims mkldnn_shape{tensor_shape.begin(), tensor_shape.end()};
+       memory::desc input_desc{mkldnn_shape, et, input_format};
+       memory::desc output_desc{mkldnn_shape, et, output_format};
+       memory input{{input_desc, cpu_engine}, aligned_buffer};
+       memory output{{output_desc, cpu_engine}, target};
+       reorder prim{input, output};
+       mkldnn::stream s(mkldnn::stream::kind::eager);
+       s.submit({prim}).wait();
+   }
+   else
+   {
        const char* source = get_data_ptr();
        memcpy(target, &source[tensor_offset], n);
+   }
}

size_t runtime::cpu::CPUTensorView::get_size() const
...
@@ -15,6 +15,7 @@
 *******************************************************************************/
#include <memory>
+#include <string>

#include "mkldnn_emitter.hpp"

@@ -45,7 +46,7 @@ mkldnn::memory::desc MKLDNNEmitter::build_memory_descriptor(const TensorViewWrap
{
    return mkldnn::memory::desc(
        mkldnn::memory::dims(tvw.get_shape().begin(), tvw.get_shape().end()),
-       mkldnn_utils::GetDataType(tvw.get_element_type()),
+       mkldnn_utils::get_mkldnn_data_type(tvw.get_element_type()),
        fmt);
}
...
@@ -14,6 +14,8 @@
 * limitations under the License.
 *******************************************************************************/
+#include <string>
+
#include <mkldnn.hpp>

#include "mkldnn_invoke.hpp"
...
@@ -17,9 +17,9 @@
#include <string>
#include <typeindex>
#include <typeinfo>
-#include <unordered_map>
#include <unordered_set>

+#include "ngraph/types/element_type.hpp"
#include "ngraph/node.hpp"
#include "ngraph/ops/add.hpp"
#include "ngraph/ops/avg_pool.hpp"
@@ -30,17 +30,12 @@
#include "mkldnn_utils.hpp"

-namespace ngraph
-{
-    namespace runtime
-    {
-        namespace cpu
-        {
-            namespace mkldnn_utils
-            {
+using namespace mkldnn;
+using namespace ngraph;

#define TI(x) std::type_index(typeid(x))

static const std::unordered_set<std::type_index> s_op_registry{
    TI(ngraph::op::Add),
    TI(ngraph::op::AvgPool),
    TI(ngraph::op::AvgPoolBackprop),

@@ -53,36 +48,71 @@
    TI(ngraph::op::Relu),
    TI(ngraph::op::ReluBackprop)};

-static const std::unordered_map<std::string, const mkldnn::memory::data_type>
-    s_data_type_map{{"char", mkldnn::memory::data_type::s8},
-                    {"float", mkldnn::memory::data_type::f32},
-                    {"double", mkldnn::memory::data_type::data_undef},
-                    {"int8_t", mkldnn::memory::data_type::s8},
-                    {"int16_t", mkldnn::memory::data_type::s16},
-                    {"int32_t", mkldnn::memory::data_type::s32},
-                    {"int64_t", mkldnn::memory::data_type::data_undef},
-                    {"uint8_t", mkldnn::memory::data_type::u8},
-                    {"uint16_t", mkldnn::memory::data_type::data_undef},
-                    {"uint32_t", mkldnn::memory::data_type::data_undef},
-                    {"uint64_t", mkldnn::memory::data_type::data_undef}};
-
-mkldnn::memory::data_type GetDataType(const ngraph::element::Type& et)
-{
-    auto it = s_data_type_map.find(et.c_type_string());
-    if (it == s_data_type_map.end() ||
-        it->second == mkldnn::memory::data_type::data_undef)
-        throw ngraph_error("No MKLDNN data type exists for the given element type");
-    return it->second;
-}
-
-bool IsMKLDNNOp(ngraph::Node& op)
-{
+// Mapping from POD types to MKLDNN data types
+static const std::map<element::Type, const mkldnn::memory::data_type> s_mkldnn_data_type_map{
+    {element::boolean, mkldnn::memory::data_type::s8},
+    {element::f32, mkldnn::memory::data_type::f32},
+    {element::f64, mkldnn::memory::data_type::data_undef},
+    {element::i8, mkldnn::memory::data_type::s8},
+    {element::i16, mkldnn::memory::data_type::s16},
+    {element::i32, mkldnn::memory::data_type::s32},
+    {element::i64, mkldnn::memory::data_type::data_undef},
+    {element::u8, mkldnn::memory::data_type::u8},
+    {element::u16, mkldnn::memory::data_type::data_undef},
+    {element::u32, mkldnn::memory::data_type::data_undef},
+    {element::u64, mkldnn::memory::data_type::data_undef}};
+
+static const std::map<element::Type, const std::string> s_mkldnn_data_type_string_map{
+    {element::boolean, "mkldnn::memory::data_type::s8"},
+    {element::f32, "mkldnn::memory::data_type::f32"},
+    {element::f64, "mkldnn::memory::data_type::data_undef"},
+    {element::i8, "mkldnn::memory::data_type::s8"},
+    {element::i16, "mkldnn::memory::data_type::s16"},
+    {element::i32, "mkldnn::memory::data_type::s32"},
+    {element::i64, "mkldnn::memory::data_type::data_undef"},
+    {element::u8, "mkldnn::memory::data_type::u8"},
+    {element::u16, "mkldnn::memory::data_type::data_undef"},
+    {element::u32, "mkldnn::memory::data_type::data_undef"},
+    {element::u64, "mkldnn::memory::data_type::data_undef"}};
+
+// TODO (jbobba): Add the rest of memory formats to this map as well
+static const std::map<memory::format, const std::string> s_mkldnn_format_string_map{
+    {memory::format::format_undef, "memory::format::format_undef"},
+    {memory::format::any, "memory::format::any"},
+    {memory::format::blocked, "memory::format::blocked"},
+    {memory::format::x, "memory::format::x"},
+    {memory::format::nc, "memory::format::nc"},
+    {memory::format::nchw, "memory::format::nchw"},
+    {memory::format::nhwc, "memory::format::nhwc"},
+    {memory::format::chwn, "memory::format::chwn"},
+    {memory::format::nChw8c, "memory::format::nChw8c"},
+    {memory::format::nChw16c, "memory::format::nChw16c"},
+    {memory::format::oi, "memory::format::oi"},
+    {memory::format::io, "memory::format::io"},
+    {memory::format::oihw, "memory::format::oihw"},
+    {memory::format::ihwo, "memory::format::ihwo"},
+    {memory::format::hwio, "memory::format::hwio"},
+    {memory::format::oIhw8i, "memory::format::oIhw8i"},
+    {memory::format::oIhw16i, "memory::format::oIhw16i"},
+    {memory::format::OIhw8i8o, "memory::format::OIhw8i8o"},
+    {memory::format::OIhw16i16o, "memory::format::OIhw16i16o"},
+    {memory::format::OIhw8o8i, "memory::format::OIhw8o8i"},
+    {memory::format::OIhw16o16i, "memory::format::OIhw16o16i"},
+    {memory::format::Oihw8o, "memory::format::Oihw8o"},
+    {memory::format::Oihw16o, "memory::format::Oihw16o"},
+    {memory::format::Ohwi8o, "memory::format::Ohwi8o"},
+    {memory::format::Ohwi16o, "memory::format::Ohwi16o"},
+    {memory::format::OhIw16o4i, "memory::format::OhIw16o4i"},
+};
+
+bool runtime::cpu::mkldnn_utils::IsMKLDNNOp(ngraph::Node& op)
+{
    return (s_op_registry.find(TI(op)) != s_op_registry.end());
}

-mkldnn::memory::format
-    CreateNativeDataFormat(const ngraph::runtime::cpu::LayoutDescriptor& layout)
+mkldnn::memory::format runtime::cpu::mkldnn_utils::CreateNativeDataFormat(
+    const ngraph::runtime::cpu::LayoutDescriptor& layout)
{
    switch (layout.get_shape().size())
    {
        case 1: return mkldnn::memory::format::x;

@@ -90,8 +120,31 @@ namespace ngraph
        case 4: return mkldnn::memory::format::nchw;
        default: return mkldnn::memory::format::format_undef;
    }
}
-            }
-        }
-    }
-}
+
+const std::string&
+    runtime::cpu::mkldnn_utils::get_mkldnn_data_type_string(const ngraph::element::Type& type)
+{
+    auto it = s_mkldnn_data_type_string_map.find(type);
+    if (it == s_mkldnn_data_type_string_map.end() || it->second.empty())
+        throw ngraph_error("No MKLDNN data type exists for the given element type");
+    return it->second;
+}
+
+mkldnn::memory::data_type
+    runtime::cpu::mkldnn_utils::get_mkldnn_data_type(const ngraph::element::Type& type)
+{
+    auto it = s_mkldnn_data_type_map.find(type);
+    if (it == s_mkldnn_data_type_map.end() || it->second == memory::data_type::data_undef)
+    {
+        throw ngraph_error("No MKLDNN data type exists for the given element type");
+    }
+    return it->second;
+}
+
+const std::string& runtime::cpu::mkldnn_utils::get_mkldnn_format_string(memory::format fmt)
+{
+    auto it = s_mkldnn_format_string_map.find(fmt);
+    if (it == s_mkldnn_format_string_map.end())
+        throw ngraph_error("No MKLDNN format exists for the given format type " +
+                           std::to_string(fmt));
+    return it->second;
+}
@@ -32,12 +32,12 @@ namespace ngraph
        {
            extern mkldnn::engine global_cpu_engine;

-           mkldnn::memory::data_type GetDataType(const ngraph::element::Type& et);
            bool IsMKLDNNOp(ngraph::Node& op);
            mkldnn::memory::format
                CreateNativeDataFormat(const ngraph::runtime::cpu::LayoutDescriptor& layout);
+           const std::string& get_mkldnn_data_type_string(const ngraph::element::Type& type);
+           mkldnn::memory::data_type get_mkldnn_data_type(const ngraph::element::Type& type);
+           const std::string& get_mkldnn_format_string(mkldnn::memory::format fmt);
        }
    }
}
...
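Reviewer sketch, not part of the patch: how the reworked element::Type-keyed helpers behave. This is a fragment (it assumes the ngraph and mkldnn headers used above); the values follow from the maps and the throw logic in mkldnn_utils.cpp:

// The old string-keyed GetDataType(et.c_type_string()) lookup becomes a typed lookup:
mkldnn::memory::data_type dt =
    ngraph::runtime::cpu::mkldnn_utils::get_mkldnn_data_type(ngraph::element::f32);
// dt == mkldnn::memory::data_type::f32

// Codegen uses the string variants to emit MKLDNN literals into generated source:
const std::string& fmt_str =
    ngraph::runtime::cpu::mkldnn_utils::get_mkldnn_format_string(mkldnn::memory::format::nchw);
// fmt_str == "memory::format::nchw"

// Element types that map to data_undef (f64, i64, u16, u32, u64) have no MKLDNN
// equivalent, so the lookup throws ngraph_error instead of returning data_undef:
// ngraph::runtime::cpu::mkldnn_utils::get_mkldnn_data_type(ngraph::element::f64);  // throws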
@@ -39,7 +39,7 @@ runtime::cpu::op::ConvertLayout::ConvertLayout(
    if (!arg_layout)
    {
-       throw ngraph_error("Layout conversion input tensor is missing layout information");
+       //throw ngraph_error("Layout conversion input tensor is missing layout information");
    }

    add_output(layout->get_element_type(), layout->get_shape());
...
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "ngraph/runtime/cpu/pass/cpu_assignment.hpp"
#include <algorithm>
#include <cassert>
#include <memory>
#include <typeindex>
#include <typeinfo>
#include <unordered_map>
#include <mkldnn.hpp>
#include "ngraph/descriptor/output.hpp"
#include "ngraph/ops/convolution.hpp"
#include "ngraph/runtime/cpu/cpu_op_annotations.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
using namespace std;
using namespace ngraph;
namespace ngraph
{
    namespace runtime
    {
        namespace cpu
        {
            namespace pass
            {
                template <>
                void CPUAssignment::ASSIGN_DECL(ngraph::op::Convolution)
                {
                    auto convolution = static_cast<op::Convolution*>(node);

                    auto arg0_shape = node->get_input_shape(0);
                    auto arg1_shape = node->get_input_shape(1);
                    auto result_shape = node->get_output_shape(0);
                    auto arg0_rank = arg0_shape.size();
                    auto arg1_rank = arg1_shape.size();

                    bool data_dilated = false;
                    for (size_t s : convolution->get_data_dilation_strides())
                    {
                        data_dilated = data_dilated || (s != 1);
                    }

                    if (!data_dilated && arg0_rank == 4 && arg1_rank == 4 &&
                        node->get_input_element_type(0) == element::f32)
                    {
                        auto op_annotations =
                            std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
                        op_annotations->set_mkldnn_op(true);
                        convolution->set_op_annotations(op_annotations);
                    }
                }
            }
        }
    }
}

#define TI(x) type_index(typeid(x))

static const runtime::cpu::pass::AssignOpMap s_dispatcher{
    {TI(ngraph::op::Convolution),
     &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Convolution>},
};

bool runtime::cpu::pass::CPUAssignment::run_on_call_graph(
    const std::list<std::shared_ptr<Node>>& nodes)
{
    for (const auto& node : nodes)
    {
        auto& n = *node;
        auto handler = s_dispatcher.find(TI(n));
        if (handler != s_dispatcher.end())
        {
            handler->second(m_external_function.get(), node.get());
        }
    }

    return false;
}
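Reviewer sketch, not part of the patch: extending the pass to another op would mirror the Convolution handler above. A hypothetical op::AvgPool handler, which would sit next to the Convolution specialization inside the same namespace block and get its own s_dispatcher entry, could look like this:

// Hypothetical handler; the real patch only registers Convolution.
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::AvgPool)
{
    auto avg_pool = static_cast<op::AvgPool*>(node);

    // Same style of eligibility check as the Convolution case: 4-D f32 inputs only.
    if (node->get_input_shape(0).size() == 4 &&
        node->get_input_element_type(0) == element::f32)
    {
        auto op_annotations = std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
        op_annotations->set_mkldnn_op(true);
        avg_pool->set_op_annotations(op_annotations);
    }
}

// ...plus a matching dispatcher entry:
// {TI(ngraph::op::AvgPool),
//  &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::AvgPool>},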
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include "ngraph/pass/pass.hpp"
#include "ngraph/runtime/cpu/cpu_external_function.hpp"
#include "ngraph/runtime/cpu/cpu_tensor_view.hpp"
#define ASSIGN_DECL(op_name) \
assign<op_name>(ngraph::runtime::cpu::CPU_ExternalFunction * external_function, \
ngraph::Node * node)
namespace ngraph
{
    namespace runtime
    {
        namespace cpu
        {
            namespace pass
            {
                using AssignFunction = std::function<void(CPU_ExternalFunction*, ngraph::Node*)>;
                using AssignOpMap = std::unordered_map<std::type_index, AssignFunction>;

                class CPUAssignment : public ngraph::pass::CallGraphPass
                {
                public:
                    CPUAssignment(std::shared_ptr<CPU_ExternalFunction> external_function)
                        : m_external_function(external_function)
                    {
                    }

                    virtual bool
                        run_on_call_graph(const std::list<std::shared_ptr<Node>>& nodes) override;

                    template <typename OP>
                    static void
                        assign(ngraph::runtime::cpu::CPU_ExternalFunction* external_function,
                               ngraph::Node* node)
                    {
                        throw std::runtime_error("Unimplemented op in CPU assignment");
                    }

                private:
                    std::shared_ptr<CPU_ExternalFunction> m_external_function;
                };
            }
        }
    }
}
[collapsed file diff not shown]
@@ -17,6 +17,11 @@
#pragma once

#include "ngraph/pass/pass.hpp"
+#include "ngraph/runtime/cpu/cpu_external_function.hpp"
+
+#define LAYOUT_DECL(op_type) \
+    layout<op_type>(ngraph::runtime::cpu::CPU_ExternalFunction * external_function, \
+                    std::shared_ptr<ngraph::Node> node)

namespace ngraph
{
@@ -26,11 +31,30 @@ namespace ngraph
    {
        namespace pass
        {
+           using LayoutFunction =
+               std::function<void(CPU_ExternalFunction*, std::shared_ptr<ngraph::Node>)>;
+           using LayoutOpMap = std::unordered_map<std::type_index, LayoutFunction>;
+
            class CPULayout : public ngraph::pass::CallGraphPass
            {
            public:
+               CPULayout(std::shared_ptr<CPU_ExternalFunction> external_function)
+                   : m_external_function(external_function)
+               {
+               }
+
                virtual bool
                    run_on_call_graph(const std::list<std::shared_ptr<Node>>& nodes) override;
+
+               template <typename OP>
+               static void
+                   layout(ngraph::runtime::cpu::CPU_ExternalFunction* external_function,
+                          std::shared_ptr<ngraph::Node> node);
+
+           private:
+               std::shared_ptr<CPU_ExternalFunction> m_external_function;
+
+               static void set_default_layouts(CPU_ExternalFunction* external_function,
+                                               std::shared_ptr<Node> node);
            };
        }
    }
...
@@ -46,6 +46,7 @@ namespace ngraph
            virtual ~ExternalFunction() {}
            virtual std::shared_ptr<CallFrame> make_call_frame() = 0;
+           const std::shared_ptr<ngraph::Function> get_function() { return m_function; }
        protected:
            std::shared_ptr<ngraph::Function> m_function;
            bool m_release_function;
...
@@ -555,6 +555,53 @@ TEST(${BACKEND_NAME}, divide)
    EXPECT_EQ((vector<float>{2, 2, 2, 2}), read_vector<float>(result));
}
TEST(${BACKEND_NAME}, divide_adjoint_stability)
{
SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto backend = manager->allocate_backend();
Shape shape{2, 2};
auto make_external = [&]() {
auto A = make_shared<op::Parameter>(element::f32, shape);
auto B = make_shared<op::Parameter>(element::f32, shape);
auto f = make_shared<Function>(make_shared<op::Divide>(A, B), op::Parameters{A, B});
auto Y_out = f->get_output_op(0);
auto Xs = f->get_parameters();
auto C = std::make_shared<op::Parameter>(Y_out->get_element_type(), Y_out->get_shape());
std::vector<std::shared_ptr<Node>> dYdXs(Xs.size());
transform(Xs.begin(), Xs.end(), dYdXs.begin(), [C, Y_out](const std::shared_ptr<Node>& X) {
return Y_out->backprop_node(X, C);
});
std::vector<std::shared_ptr<op::Parameter>> params(Xs);
params.push_back(C);
auto bf = std::make_shared<Function>(dYdXs, params);
auto external = manager->compile(bf);
return external;
};
auto cf = backend->make_call_frame(make_external());
// Create some tensors for input/output
auto a = backend->make_primary_tensor_view(element::f32, shape);
copy_data(a, vector<float>{0, 0, 1, 1});
auto b = backend->make_primary_tensor_view(element::f32, shape);
copy_data(b, vector<float>{2, 2, 2, 2});
auto c = backend->make_primary_tensor_view(element::f32, shape);
copy_data(c, vector<float>{1, 1, 1, 1});
auto resulta = backend->make_primary_tensor_view(element::f32, shape);
auto resultb = backend->make_primary_tensor_view(element::f32, shape);
cf->call({a, b, c}, {resulta, resultb});
EXPECT_EQ((vector<float>{0.5, 0.5, 0.5, 0.5}), read_vector<float>(resulta));
EXPECT_EQ((vector<float>{-0.0, -0.0, -0.25, -0.25}), read_vector<float>(resultb));
}
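Reviewer note, not part of the patch: the expected values follow from the adjoints in the divide.cpp change earlier in this diff. With y = a / b, the backprop outputs are da = c * (1/b) and db = -c * (a/b) / b = -c * a / b^2. For a = {0, 0, 1, 1}, b = {2, 2, 2, 2}, c = {1, 1, 1, 1} that gives da = {0.5, 0.5, 0.5, 0.5} and db = {-0.0, -0.0, -0.25, -0.25}, matching the assertions above. Under the old x-adjoint, delta * (a/b) / a, the first two entries of da would have been 0/0 = NaN instead of 0.5.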
TEST(${BACKEND_NAME}, divide_by_zero_float32)
{
    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");

@@ -6009,6 +6056,42 @@ TEST(${BACKEND_NAME}, convolution_outlining)
    EXPECT_EQ(vector<float>{expected_result}, read_vector<float>(result));
}
TEST(${BACKEND_NAME}, convolution_layout)
{
Shape shape_a{1, 16, 2, 2};
auto A = make_shared<op::Parameter>(element::f32, shape_a);
Shape shape_b{32, 16, 1, 1};
auto B = make_shared<op::Parameter>(element::f32, shape_b);
Shape shape_r{1, 32, 2, 2};
auto conv1 = make_shared<op::Convolution>(A,
B,
Strides{1, 1},
Strides{1, 1},
CoordinateDiff{0, 0},
CoordinateDiff{0, 0},
Strides{1, 1});
auto f = make_shared<Function>(conv1, op::Parameters{A, B});
auto manager = runtime::Manager::get("${BACKEND_NAME}");
auto external = manager->compile(f);
auto backend = manager->allocate_backend();
auto cf = backend->make_call_frame(external);
// Create some tensors for input/output
auto a = backend->make_primary_tensor_view(element::f32, shape_a);
vector<float> input(64, 1.0f);
copy_data(a, input);
auto b = backend->make_primary_tensor_view(element::f32, shape_b);
vector<float> weights(512, 1.0f);
copy_data(b, weights);
auto result = backend->make_primary_tensor_view(element::f32, shape_r);
vector<float> expected_result(128, 16.0f);
cf->call({a, b}, {result});
EXPECT_EQ(vector<float>{expected_result}, read_vector<float>(result));
}
TEST(${BACKEND_NAME}, avg_pool_1d_1channel_1image)
{
    SKIP_TEST_FOR("GPU", "${BACKEND_NAME}");
...
@@ -288,3 +288,23 @@ TEST(cpu_fusion, fuse_fprop_bn)
    size_t ccg = count_ops_of_type<op::BatchNorm>(func);
    ASSERT_EQ(ccg, 1);
}
class UnhandledOp : public ngraph::op::Abs
{
public:
UnhandledOp(const std::shared_ptr<Node>& arg)
: Abs(arg)
{
}
};
TEST(cpu_fusion, unhandled_op)
{
auto A = make_shared<op::Parameter>(element::f32, Shape{});
auto unhandled = make_shared<UnhandledOp>(A);
auto f = make_shared<Function>(unhandled, op::Parameters{A});
auto manager = runtime::Manager::get("CPU");
auto backend = manager->allocate_backend();
auto external = manager->compile(f);
ASSERT_THROW(backend->make_call_frame(external), ngraph_error);
}