Merge branch 'master' into tfl/gpu_dot_back

8f5b3e2e · Fenglei · GitHub · 9fd64b6f · 95312b8e · 8f5b3e2e
Unverified Commit 8f5b3e2e authored Mar 09, 2018 by Fenglei Committed by GitHub Mar 09, 2018
24 changed files
--- a/src/ngraph/CMakeLists.txt
+++ b/src/ngraph/CMakeLists.txt
@@ -97,6 +97,7 @@ set (SRC
    pass/memory_visualize.cpp
    pass/pass.cpp
    pass/reshape_elimination.cpp
+    pass/result_copy_elimination.cpp
    pass/visualize_tree.cpp
    pattern/matcher.cpp
    pattern/core_fusion.cpp
@@ -186,6 +187,7 @@ if (NGRAPH_CPU_ENABLE AND LLVM_INCLUDE_DIR AND
        runtime/cpu/mkldnn_invoke.cpp
        runtime/cpu/mkldnn_utils.cpp
        runtime/cpu/ops/convert_layout.cpp
+        runtime/cpu/ops/sigmoid.cpp
        runtime/cpu/ops/matmul_bias.cpp
        runtime/cpu/pass/cpu_assignment.cpp
        runtime/cpu/pass/cpu_fusion.cpp

--- a/src/ngraph/ops/result.cpp
+++ b/src/ngraph/ops/result.cpp
@@ -49,5 +49,7 @@ std::shared_ptr<Node> op::Result::copy_with_new_args(const NodeVector& new_args)
        throw ngraph_error("Expected a single-output argument");
    }

-    return std::make_shared<Result>(new_args.at(0));
+    auto res = std::make_shared<Result>(new_args.at(0));
+    res->set_needs_copy(res->needs_copy());
+    return res;
 }
--- a/src/ngraph/ops/result.hpp
+++ b/src/ngraph/ops/result.hpp
@@ -36,12 +36,17 @@ namespace ngraph
                copy_with_new_args(const NodeVector& new_args) const override;

            virtual bool is_output() const override { return true; }
+            void set_needs_copy(bool val) { m_needs_copy = val; }
+            bool needs_copy() const { return m_needs_copy; }
        protected:
            virtual void generate_adjoints(autodiff::Adjoints& adjoints,
                                           const std::shared_ptr<Node>& delta) override
            {
                adjoints.add_delta(get_input_op(0), delta);
            }
+
+        private:
+            bool m_needs_copy{true};
        };
    }
 }
--- a/src/ngraph/pass/result_copy_elimination.cpp
+++ b/src/ngraph/pass/result_copy_elimination.cpp
+/*******************************************************************************
+* Copyright 2017-2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "result_copy_elimination.hpp"
+#include "ngraph/node.hpp"
+#include "ngraph/ops/parameter.hpp"
+#include "ngraph/ops/result.hpp"
+#include "ngraph/util.hpp"
+
+bool ngraph::pass::ResultCopyElimination::run_on_function(std::shared_ptr<ngraph::Function> f)
+{
+    std::set<std::shared_ptr<Node>> seen;
+    for (auto res : f->get_results())
+    {
+        auto arg = res->get_input_op(0);
+        //we need a copy
+        if (arg->is_parameter() || arg->is_constant())
+        {
+            continue;
+        }
+
+        //TODO: check if broadcast replace op::Result w/ a copy of broadcast node
+
+        //TODO: consider other cases where it's easier to recompute than make a copy
+
+        //we will compute the result directly into output[]
+        if (seen.count(arg) == 0)
+        {
+            res->set_needs_copy(false);
+            seen.insert(arg);
+        }
+    }
+
+    return true;
+}
--- a/src/ngraph/pass/result_copy_elimination.hpp
+++ b/src/ngraph/pass/result_copy_elimination.hpp
+/*******************************************************************************
+* Copyright 2017-2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#pragma once
+
+#include "ngraph/pass/pass.hpp"
+
+namespace ngraph
+{
+    namespace pass
+    {
+        class ResultCopyElimination;
+    }
+}
+
+class ngraph::pass::ResultCopyElimination : public ngraph::pass::FunctionPass
+{
+public:
+    ResultCopyElimination()
+        : FunctionPass()
+    {
+    }
+
+    virtual bool run_on_function(std::shared_ptr<ngraph::Function> f) override;
+};
--- a/src/ngraph/runtime/cpu/cpu_emitter.cpp
+++ b/src/ngraph/runtime/cpu/cpu_emitter.cpp
@@ -92,6 +92,7 @@
 #include "ngraph/runtime/cpu/mkldnn_utils.hpp"
 #include "ngraph/runtime/cpu/ops/convert_layout.hpp"
 #include "ngraph/runtime/cpu/ops/matmul_bias.hpp"
+#include "ngraph/runtime/cpu/ops/sigmoid.hpp"
 #include "ngraph/types/element_type.hpp"
 #include "ngraph/util.hpp"

@@ -272,13 +273,6 @@ namespace ngraph

                const char* cbeta = "0.0f";

-                if (args.size() > 2)
-                {
-                    writer << "memcpy(" << out[0].get_name() << ", " << args[2].get_name() << ", "
-                           << out[0].get_size() * out[0].get_element_type().size() << ");\n";
-                    cbeta = "1.0f";
-                }
-
                writer << "cblas::cblas_sgemm("
                       << "cblas::Layout::RowMajor, " << tranpose_a << tranpose_b << m << ", " << n
                       << ", " << k << ",\n"
@@ -287,6 +281,101 @@ namespace ngraph
                       << "        " << out[0].get_name() << ", " << max(1UL, arg2_shape[1])
                       << ");\n";

+                if (args.size() > 2)
+                {
+                    auto axes = cg->get_broadcast_axes();
+                    if (axes.size() == 1)
+                    {
+                        if (*(axes.begin()) == 0)
+                        {
+                            writer << "static " << out[0].get_element_type().c_type_string()
+                                   << " ones_row[" << arg2_shape[0] << "]"
+                                   << " = { 1.0f";
+                            for (size_t i = 1; i < arg2_shape[0]; ++i)
+                            {
+                                writer << ", 1.0f";
+                            }
+                            writer << "};\n";
+
+                            writer << "cblas::cblas_sgemm("
+                                   << "cblas::Layout::RowMajor, " << cnotranspose << cnotranspose
+                                   << arg2_shape[0] << ", " << arg2_shape[1] << ", 1"
+                                   << ",\n"
+                                   << "        1.0f, ones_row, "
+                                   << "1"
+                                   << ", " << args[2].get_name() << ", " << max(1UL, arg2_shape[1])
+                                   << ", "
+                                   << "1.0f"
+                                   << ",\n"
+                                   << "        " << out[0].get_name() << ", "
+                                   << max(1UL, arg2_shape[1]) << ");\n";
+                        }
+                        else
+                        {
+                            writer << "static " << out[0].get_element_type().c_type_string()
+                                   << " ones_col[" << arg2_shape[1] << "]"
+                                   << " = { 1.0f";
+                            for (size_t i = 1; i < arg2_shape[1]; ++i)
+                            {
+                                writer << ", 1.0f";
+                            }
+                            writer << "};\n";
+
+                            writer << "cblas::cblas_sgemm("
+                                   << "cblas::Layout::RowMajor, " << cnotranspose << ctranspose
+                                   << arg2_shape[0] << ", " << arg2_shape[1] << ", 1"
+                                   << ",\n"
+                                   << "        1.0f, ones_col," << max(1UL, arg2_shape[1]) << ", "
+                                   << args[2].get_name() << ", "
+                                   << "1"
+                                   << ", "
+                                   << "1.0f"
+                                   << ",\n"
+                                   << "        " << out[0].get_name() << ", "
+                                   << max(1UL, arg2_shape[1]) << ");\n";
+                        }
+                    }
+                    else
+                    {
+                        if (axes.size() != 2)
+                        {
+                            throw ngraph_error("unexpected broadcast rank");
+                        }
+
+                        writer << out[0].get_element_type().c_type_string() << " bias["
+                               << arg2_shape[1] << "]"
+                               << " = { " << args[2].get_name() << "[0]";
+                        for (size_t i = 1; i < arg2_shape[1]; ++i)
+                        {
+                            writer << "," << args[2].get_name() << "[0]";
+                        }
+                        writer << "};\n";
+
+                        writer << "static " << out[0].get_element_type().c_type_string()
+                               << " ones_scalar[" << arg2_shape[0] << "]"
+                               << " = { 1.0f";
+                        for (size_t i = 1; i < arg2_shape[0]; ++i)
+                        {
+                            writer << ", 1.0f";
+                        }
+                        writer << "};\n";
+
+                        writer << "cblas::cblas_sgemm("
+                               << "cblas::Layout::RowMajor, " << cnotranspose << cnotranspose
+                               << arg2_shape[0] << ", " << arg2_shape[1] << ", 1"
+                               << ",\n"
+                               << "        1.0f, ones_scalar, "
+                               << "1"
+                               << ", "
+                               << "bias"
+                               << ", " << max(1UL, arg2_shape[1]) << ", "
+                               << "1.0f"
+                               << ",\n"
+                               << "        " << out[0].get_name() << ", " << max(1UL, arg2_shape[1])
+                               << ");\n";
+                    }
+                }
+
                writer.indent--;
                writer << "}\n";
            }
@@ -3081,6 +3170,19 @@ namespace ngraph
                auto output_format =
                    dynamic_cast<runtime::cpu::LayoutDescriptor&>(*output_tvl).get_mkldnn_format();

+                // MKLDNN relies on format names for selecting optimized kernel implementations
+                // Hacky way to deal with this until they move to using canonicalized layouts
+                if (input_format == mkldnn::memory::format::nchw &&
+                    runtime::cpu::mkldnn_utils::is_mkldnn_filter_format(output_format))
+                {
+                    input_format = mkldnn::memory::format::oihw;
+                }
+                if (output_format == mkldnn::memory::format::nchw &&
+                    runtime::cpu::mkldnn_utils::is_mkldnn_filter_format(input_format))
+                {
+                    output_format = mkldnn::memory::format::oihw;
+                }
+
                auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
                auto input_desc = mkldnn_emitter->build_memory_descriptor(args[0], input_format);
                auto result_desc = mkldnn_emitter->build_memory_descriptor(out[0], output_format);
@@ -3217,6 +3319,37 @@ namespace ngraph
                }
            }

+            template <>
+            void CPU_Emitter::EMITTER_DECL(ngraph::op::Sigmoid)
+            {
+                auto input_shape = args[0].get_shape();
+                auto result_shape = out[0].get_shape();
+                int input_1d_size = static_cast<int>(shape_size(input_shape));
+                int result_1d_size = static_cast<int>(shape_size(result_shape));
+
+                auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
+                auto input_desc = mkldnn::memory::desc(
+                    {input_1d_size},
+                    mkldnn_utils::get_mkldnn_data_type(args[0].get_element_type()),
+                    mkldnn::memory::format::x);
+                auto result_desc = mkldnn::memory::desc(
+                    {result_1d_size},
+                    mkldnn_utils::get_mkldnn_data_type(out[0].get_element_type()),
+                    mkldnn::memory::format::x);
+
+                size_t sigmoid_index =
+                    mkldnn_emitter->build_sigmoid_forward(input_desc, result_desc);
+
+                auto& deps = mkldnn_emitter->get_primitive_deps(sigmoid_index);
+                writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[0]) << ", "
+                       << args[0].get_name() << ");\n";
+                writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[1]) << ", "
+                       << out[0].get_name() << ");\n";
+
+                writer << "cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, "
+                       << to_string(sigmoid_index) << ");\n";
+            }
+
            template <>
            void CPU_Emitter::EMITTER_DECL(ngraph::op::Softmax)
            {
@@ -3417,6 +3550,13 @@ namespace ngraph
            template <>
            void CPU_Emitter::EMITTER_DECL(ngraph::op::Result)
            {
+                const ngraph::op::Result* result = static_cast<const ngraph::op::Result*>(node);
+
+                if (!result->needs_copy())
+                {
+                    return;
+                }
+
                writer << "kernel::result<" << out[0].get_type() << ">(" << args[0].get_name()
                       << ",\n";
                writer << "               " << out[0].get_name() << ",\n";

--- a/src/ngraph/runtime/cpu/cpu_external_function.cpp
+++ b/src/ngraph/runtime/cpu/cpu_external_function.cpp
@@ -100,6 +100,7 @@
 #include "ngraph/pass/liveness.hpp"
 #include "ngraph/pass/manager.hpp"
 #include "ngraph/pass/memory_layout.hpp"
+#include "ngraph/pass/result_copy_elimination.hpp"
 #include "ngraph/pattern/core_fusion.hpp"
 #include "ngraph/runtime/cpu/cpu_backend.hpp"
 #include "ngraph/runtime/cpu/cpu_call_frame.hpp"
@@ -110,6 +111,7 @@
 #include "ngraph/runtime/cpu/mkldnn_utils.hpp"
 #include "ngraph/runtime/cpu/ops/convert_layout.hpp"
 #include "ngraph/runtime/cpu/ops/matmul_bias.hpp"
+#include "ngraph/runtime/cpu/ops/sigmoid.hpp"
 #include "ngraph/runtime/cpu/pass/cpu_assignment.hpp"
 #include "ngraph/runtime/cpu/pass/cpu_fusion.hpp"
 #include "ngraph/runtime/cpu/pass/cpu_layout.hpp"
@@ -243,6 +245,7 @@ static const runtime::cpu::OpMap dispatcher{
    {TI(ngraph::op::Min), &runtime::cpu::CPU_Emitter::emit<op::Min>},
    {TI(ngraph::op::Relu), &runtime::cpu::CPU_Emitter::emit<op::Relu>},
    {TI(ngraph::op::ReluBackprop), &runtime::cpu::CPU_Emitter::emit<op::ReluBackprop>},
+    {TI(ngraph::op::Sigmoid), &runtime::cpu::CPU_Emitter::emit<op::Sigmoid>},
    {TI(ngraph::op::Softmax), &runtime::cpu::CPU_Emitter::emit<op::Softmax>},
 };

@@ -276,10 +279,11 @@ void runtime::cpu::CPU_ExternalFunction::compile()
    pass_manager.register_pass<runtime::cpu::pass::CPUFusion>();
    pass_manager.register_pass<runtime::cpu::pass::CPUAssignment>(this);
    pass_manager.register_pass<runtime::cpu::pass::CPULayout>(this);
+    pass_manager.register_pass<ngraph::pass::ResultCopyElimination>();
    pass_manager.register_pass<ngraph::pass::Liveness>();
    pass_manager.register_pass<ngraph::pass::MemoryLayout>(s_memory_pool_alignment);
-
    pass_manager.run_passes(m_function);
+
    codegen::CodeWriter writer;

    bool include_mkldnn_headers = false;
@@ -636,6 +640,16 @@ using namespace ngraph::runtime;
            stringstream ss;
            ss << "((" << type << "*)(outputs[" << i << "]))";
            m_variable_name_map[tv->get_tensor().get_name()] = ss.str();
+
+            //it should be safe to assign both descriptors to one output*
+            //since needs_copy == false makes `op::Result` an nop
+            auto res = std::dynamic_pointer_cast<ngraph::op::Result>(op);
+            if (!res->needs_copy())
+            {
+                shared_ptr<descriptor::TensorView> itv =
+                    res->get_input_op(0)->get_output_tensor_view();
+                m_variable_name_map[itv->get_tensor().get_name()] = ss.str();
+            }
        }

        for (shared_ptr<Node> node : current_function->get_ordered_ops())
@@ -827,7 +841,6 @@ using namespace ngraph::runtime;
    }

    // TODO: Cleanup and make this a utility function
-
    file_util::make_directory(s_output_dir);
    string filename = file_util::path_join(s_output_dir, m_function_name + "_codegen.cpp");
    ofstream out(filename);

--- a/src/ngraph/runtime/cpu/mkldnn_emitter.cpp
+++ b/src/ngraph/runtime/cpu/mkldnn_emitter.cpp
@@ -280,6 +280,26 @@ size_t MKLDNNEmitter::build_relu_forward(const mkldnn::memory::desc& input_desc,
    return primitive_index;
 }

+size_t MKLDNNEmitter::build_sigmoid_forward(const mkldnn::memory::desc& input_desc,
+                                            const mkldnn::memory::desc& result_desc)
+{
+    size_t input_index = build_memory_primitive(input_desc);
+    size_t result_index = build_memory_primitive(result_desc);
+
+    size_t primitive_index =
+        insert_primitive(new mkldnn::eltwise_forward({{mkldnn::prop_kind::forward_training,
+                                                       mkldnn::algorithm::eltwise_logistic,
+                                                       input_desc,
+                                                       0,
+                                                       0},
+                                                      mkldnn_utils::global_cpu_engine},
+                                                     *m_mkldnn_primitives[input_index],
+                                                     *m_mkldnn_primitives[result_index]));
+
+    m_primitive_deps[primitive_index] = {input_index, result_index};
+    return primitive_index;
+}
+
 size_t MKLDNNEmitter::build_elementwise_add(
    const mkldnn::memory::desc& input0_data_desc,
    const mkldnn::memory::desc& input1_data_desc,

--- a/src/ngraph/runtime/cpu/mkldnn_emitter.hpp
+++ b/src/ngraph/runtime/cpu/mkldnn_emitter.hpp
@@ -97,6 +97,9 @@ namespace ngraph
                size_t build_relu_forward(const mkldnn::memory::desc& input_desc,
                                          const mkldnn::memory::desc& result_desc);

+                size_t build_sigmoid_forward(const mkldnn::memory::desc& input_desc,
+                                             const mkldnn::memory::desc& result_desc);
+
                size_t build_elementwise_add(
                    const mkldnn::memory::desc& input0_data_desc,
                    const mkldnn::memory::desc& input1_data_desc,

--- a/src/ngraph/runtime/cpu/mkldnn_utils.cpp
+++ b/src/ngraph/runtime/cpu/mkldnn_utils.cpp
@@ -110,6 +110,23 @@ static const std::map<memory::format, const std::string> s_mkldnn_format_string_
    {memory::format::OhIw16o4i, "memory::format::OhIw16o4i"},
 };

+static const std::set<memory::format> s_filter_formats{
+    memory::format::oihw,
+    memory::format::ihwo,
+    memory::format::hwio,
+    // memory::format::oIhw8i,             // These currently map to nChw8c and nChw16c
+    // memory::format::oIhw16i,
+    memory::format::OIhw8i8o,
+    memory::format::OIhw16i16o,
+    memory::format::IOhw16o16i,
+    memory::format::OIhw8o8i,
+    memory::format::OIhw16o16i,
+    memory::format::Oihw8o,
+    memory::format::Oihw16o,
+    memory::format::Ohwi8o,
+    memory::format::Ohwi16o,
+    memory::format::OhIw16o4i};
+
 bool runtime::cpu::mkldnn_utils::IsMKLDNNOp(ngraph::Node& op)
 {
    return (s_op_registry.find(TI(op)) != s_op_registry.end());
@@ -157,16 +174,16 @@ const std::string& runtime::cpu::mkldnn_utils::get_mkldnn_format_string(memory::
 }

 mkldnn::memory::format runtime::cpu::mkldnn_utils::get_input_mkldnn_format(const Node* node,
-                                                                           int index)
+                                                                           size_t index)
 {
    auto tvl = node->get_inputs()[index].get_output().get_tensor_view()->get_tensor_view_layout();
    return dynamic_cast<runtime::cpu::LayoutDescriptor&>(*tvl).get_mkldnn_format();
 }

 mkldnn::memory::format runtime::cpu::mkldnn_utils::get_output_mkldnn_format(const Node* node,
-                                                                            int index)
+                                                                            size_t index)
 {
-    auto tvl = node->get_output_tensor_view(0)->get_tensor_view_layout();
+    auto tvl = node->get_output_tensor_view(index)->get_tensor_view_layout();
    return dynamic_cast<runtime::cpu::LayoutDescriptor&>(*tvl).get_mkldnn_format();
 }

@@ -181,8 +198,8 @@ bool runtime::cpu::mkldnn_utils::use_mkldnn_kernel(const ngraph::Node* node)
 bool runtime::cpu::mkldnn_utils::compare_mkldnn_formats(mkldnn::memory::format fmt1,
                                                        mkldnn::memory::format fmt2)
 {
-    set<mkldnn::memory::format> similar_4d_formats{mkldnn::memory::format::nchw,
-                                                   mkldnn::memory::format::oihw};
+    std::set<mkldnn::memory::format> similar_4d_formats{mkldnn::memory::format::nchw,
+                                                        mkldnn::memory::format::oihw};
    if ((fmt1 == fmt2) || (similar_4d_formats.find(fmt1) != similar_4d_formats.end() &&
                           similar_4d_formats.find(fmt2) != similar_4d_formats.end()))
    {
@@ -190,3 +207,12 @@ bool runtime::cpu::mkldnn_utils::compare_mkldnn_formats(mkldnn::memory::format f
    }
    return false;
 }
+
+bool runtime::cpu::mkldnn_utils::is_mkldnn_filter_format(mkldnn::memory::format fmt)
+{
+    if (s_filter_formats.find(fmt) != s_filter_formats.end())
+    {
+        return true;
+    }
+    return false;
+}
--- a/src/ngraph/runtime/cpu/mkldnn_utils.hpp
+++ b/src/ngraph/runtime/cpu/mkldnn_utils.hpp
@@ -39,11 +39,12 @@ namespace ngraph
                mkldnn::memory::data_type get_mkldnn_data_type(const ngraph::element::Type& type);
                const std::string& get_mkldnn_format_string(mkldnn::memory::format fmt);

-                mkldnn::memory::format get_input_mkldnn_format(const Node* node, int index);
-                mkldnn::memory::format get_output_mkldnn_format(const Node* node, int index);
+                mkldnn::memory::format get_input_mkldnn_format(const Node* node, size_t index);
+                mkldnn::memory::format get_output_mkldnn_format(const Node* node, size_t index);
                bool use_mkldnn_kernel(const ngraph::Node* node);
                bool compare_mkldnn_formats(mkldnn::memory::format fmt1,
                                            mkldnn::memory::format fmt2);
+                bool is_mkldnn_filter_format(mkldnn::memory::format fmt);
            }
        }
    }

--- a/src/ngraph/runtime/cpu/ops/matmul_bias.cpp
+++ b/src/ngraph/runtime/cpu/ops/matmul_bias.cpp
@@ -32,7 +32,8 @@ std::shared_ptr<ngraph::Node>
                                        m_shape_w,
                                        m_shape_x,
                                        m_transpose_w,
-                                        m_transpose_x);
+                                        m_transpose_x,
+                                        m_broadcast_axes);
 }

 ngraph::op::MatmulBias::MatmulBias(std::shared_ptr<ngraph::Node> W,
@@ -41,7 +42,8 @@ ngraph::op::MatmulBias::MatmulBias(std::shared_ptr<ngraph::Node> W,
                                   Shape shape_w,
                                   Shape shape_x,
                                   bool transpose_w,
-                                   bool transpose_x)
+                                   bool transpose_x,
+                                   AxisSet axes)
    : RequiresTensorViewArgs("MatMulBias",
                             b == nullptr ? std::vector<std::shared_ptr<Node>>{W, x}
                                          : std::vector<std::shared_ptr<Node>>{W, x, b})
@@ -49,8 +51,24 @@ ngraph::op::MatmulBias::MatmulBias(std::shared_ptr<ngraph::Node> W,
    , m_shape_x(shape_x)
    , m_transpose_w(transpose_w)
    , m_transpose_x(transpose_x)
+    , m_broadcast_axes(axes)

 {
+    if (axes.size() == 0 && b != nullptr)
+    {
+        throw ngraph_error("Bias but no broadcast axes");
+    }
+
+    if (b == nullptr && axes.size() != 0)
+    {
+        throw ngraph_error("Broadcast axes but no bias");
+    }
+
+    if (axes.size() > 2)
+    {
+        throw ngraph_error("Broadcasting to > 2D tensor");
+    }
+
    if (shape_w.size() != 2)
    {
        NGRAPH_DEBUG << "W shape = " << vector_to_string(shape_w);

--- a/src/ngraph/runtime/cpu/ops/matmul_bias.hpp
+++ b/src/ngraph/runtime/cpu/ops/matmul_bias.hpp
@@ -16,6 +16,7 @@

 #pragma once

+#include "ngraph/axis_set.hpp"
 #include "ngraph/ops/util/requires_tensor_view_args.hpp"

 namespace ngraph
@@ -31,12 +32,14 @@ namespace ngraph
                       Shape shape_w,
                       Shape shape_x,
                       bool transpose_w,
-                       bool transpose_x);
+                       bool transpose_x,
+                       AxisSet axes = AxisSet{});

            bool get_is_arg0_transposed() const { return m_transpose_w; }
            bool get_is_arg1_transposed() const { return m_transpose_x; }
            Shape get_arg0_shape() const { return m_shape_w; }
            Shape get_arg1_shape() const { return m_shape_x; }
+            const AxisSet& get_broadcast_axes() const { return m_broadcast_axes; }
            virtual std::shared_ptr<Node>
                copy_with_new_args(const NodeVector& new_args) const override;

@@ -45,6 +48,7 @@ namespace ngraph
            Shape m_shape_x;
            bool m_transpose_w;
            bool m_transpose_x;
+            AxisSet m_broadcast_axes;
        };
    }
 }
--- a/src/ngraph/runtime/cpu/ops/sigmoid.cpp
+++ b/src/ngraph/runtime/cpu/ops/sigmoid.cpp
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include "ngraph/runtime/cpu/ops/sigmoid.hpp"
+#include "ngraph/log.hpp"
+#include "ngraph/util.hpp"
+
+std::shared_ptr<ngraph::Node>
+    ngraph::op::Sigmoid::copy_with_new_args(const NodeVector& new_args) const
+{
+    if (new_args.size() != 1)
+    {
+        throw ngraph_error("Incorrect number of new arguments");
+    }
+
+    return std::make_shared<Sigmoid>(new_args.at(0));
+}
+
+ngraph::op::Sigmoid::Sigmoid(std::shared_ptr<ngraph::Node> input)
+    : RequiresTensorViewArgs("Sigmoid", {input})
+    , m_shape_input(input->get_shape())
+{
+    add_output(input->get_element_type(), m_shape_input);
+}
--- a/src/ngraph/runtime/cpu/ops/sigmoid.hpp
+++ b/src/ngraph/runtime/cpu/ops/sigmoid.hpp
+/*******************************************************************************
+* Copyright 2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#pragma once
+
+#include "ngraph/ops/util/requires_tensor_view_args.hpp"
+
+namespace ngraph
+{
+    namespace op
+    {
+        class Sigmoid : public util::RequiresTensorViewArgs
+        {
+        public:
+            Sigmoid(std::shared_ptr<Node> input);
+            Shape get_input_shape() const { return m_shape_input; }
+            virtual std::shared_ptr<Node>
+                copy_with_new_args(const NodeVector& new_args) const override;
+
+        private:
+            Shape m_shape_input;
+        };
+    }
+}
--- a/src/ngraph/runtime/cpu/pass/cpu_assignment.cpp
+++ b/src/ngraph/runtime/cpu/pass/cpu_assignment.cpp
@@ -27,10 +27,12 @@
 #include "ngraph/descriptor/output.hpp"
 #include "ngraph/ops/add.hpp"
 #include "ngraph/ops/avg_pool.hpp"
+#include "ngraph/ops/batch_norm.hpp"
 #include "ngraph/ops/convolution.hpp"
 #include "ngraph/ops/relu.hpp"
 #include "ngraph/runtime/cpu/cpu_op_annotations.hpp"
 #include "ngraph/runtime/cpu/mkldnn_utils.hpp"
+#include "ngraph/runtime/cpu/ops/sigmoid.hpp"

 using namespace std;
 using namespace ngraph;
@@ -208,6 +210,19 @@ namespace ngraph
                    }
                }

+                template <>
+                void CPUAssignment::ASSIGN_DECL(ngraph::op::Sigmoid)
+                {
+                    auto sigmoid = static_cast<op::Sigmoid*>(node);
+                    if (node->get_input_element_type(0) == element::f32)
+                    {
+                        auto op_annotations =
+                            std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
+                        op_annotations->set_mkldnn_op(true);
+                        sigmoid->set_op_annotations(op_annotations);
+                    }
+                }
+
                template <>
                void CPUAssignment::ASSIGN_DECL(ngraph::op::ReluBackprop)
                {
@@ -225,6 +240,16 @@ namespace ngraph
                        avg_pool->set_op_annotations(op_annotations);
                    }
                }
+
+                template <>
+                void CPUAssignment::ASSIGN_DECL(ngraph::op::BatchNorm)
+                {
+                    auto batchnorm = static_cast<op::BatchNorm*>(node);
+                    auto op_annotations =
+                        std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
+                    op_annotations->set_mkldnn_op(true);
+                    batchnorm->set_op_annotations(op_annotations);
+                }
            }
        }
    }
@@ -234,6 +259,7 @@ namespace ngraph

 static const runtime::cpu::pass::AssignOpMap s_dispatcher{
    {TI(ngraph::op::Add), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Add>},
+    {TI(ngraph::op::BatchNorm), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::BatchNorm>},
    {TI(ngraph::op::Convolution),
     &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Convolution>},
    {TI(ngraph::op::ConvolutionBackpropData),
@@ -246,6 +272,7 @@ static const runtime::cpu::pass::AssignOpMap s_dispatcher{
    {TI(ngraph::op::Relu), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Relu>},
    {TI(ngraph::op::ReluBackprop),
     &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::ReluBackprop>},
+    {TI(ngraph::op::Sigmoid), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Sigmoid>},
 };

 bool runtime::cpu::pass::CPUAssignment::run_on_call_graph(

--- a/src/ngraph/runtime/cpu/pass/cpu_fusion.cpp
+++ b/src/ngraph/runtime/cpu/pass/cpu_fusion.cpp
@@ -30,8 +30,10 @@
 #include "ngraph/ops/convolution.hpp"
 #include "ngraph/ops/divide.hpp"
 #include "ngraph/ops/dot.hpp"
+#include "ngraph/ops/exp.hpp"
 #include "ngraph/ops/get_output_element.hpp"
 #include "ngraph/ops/multiply.hpp"
+#include "ngraph/ops/negative.hpp"
 #include "ngraph/ops/pad.hpp"
 #include "ngraph/ops/parameter.hpp"
 #include "ngraph/ops/reshape.hpp"
@@ -42,6 +44,7 @@
 #include "ngraph/pattern/op/any.hpp"
 #include "ngraph/pattern/op/label.hpp"
 #include "ngraph/runtime/cpu/ops/matmul_bias.hpp"
+#include "ngraph/runtime/cpu/ops/sigmoid.hpp"

 static bool init_cblas_arg(std::shared_ptr<ngraph::Node> reshape,
                           std::shared_ptr<ngraph::Node> arg,
@@ -134,12 +137,21 @@ void ngraph::runtime::cpu::pass::CPUFusion::construct_matmulbias_pattern()
                     << m.match_root()->get_name();

        auto mpattern = m.match_root(); //add
-        auto m_matmul = mpattern->get_input_op(0);
-        auto m_broadcast = mpattern->get_input_op(1);
+        auto m_matmul = std::dynamic_pointer_cast<op::MatmulBias>(mpattern->get_input_op(0));
+        auto m_broadcast = std::dynamic_pointer_cast<op::Broadcast>(mpattern->get_input_op(1));
+        auto m_bias = m_broadcast->get_input_op(0);
        auto pattern_map = m.get_pattern_map();

-        return m_matmul->copy_with_new_args(
-            NodeVector{pattern_map[W], pattern_map[x], m_broadcast});
+        auto mmb = std::make_shared<op::MatmulBias>(pattern_map[W],
+                                                    pattern_map[x],
+                                                    m_bias,
+                                                    m_matmul->get_arg0_shape(),
+                                                    m_matmul->get_arg1_shape(),
+                                                    m_matmul->get_is_arg0_transposed(),
+                                                    m_matmul->get_is_arg1_transposed(),
+                                                    m_broadcast->get_broadcast_axes());
+
+        return mmb;
    };

    auto m = std::make_shared<ngraph::pattern::Matcher>(padd, callback);
@@ -512,3 +524,45 @@ void ngraph::runtime::cpu::pass::CPUFusion::construct_zero_padded_conv()

    this->add_matcher(std::make_shared<ngraph::pattern::Matcher>(conv_label, callback));
 }
+
+void ngraph::runtime::cpu::pass::CPUFusion::construct_sigmoid()
+{
+    //construct variance
+    auto input = std::make_shared<pattern::op::Label>(element::f32, Shape{3, 4});
+    auto neg_input = std::make_shared<op::Negative>(input);
+    auto exp_neg_input = std::make_shared<op::Exp>(neg_input);
+
+    // broadcast input
+    auto constant = std::make_shared<pattern::op::Label>(element::f32, Shape{});
+    auto broadcast_constant = std::make_shared<op::Broadcast>(constant, Shape{3, 4}, AxisSet{0, 1});
+
+    auto add_exp = std::make_shared<op::Add>(exp_neg_input, broadcast_constant);
+    auto divide_1_over_exp = std::make_shared<op::Divide>(broadcast_constant, add_exp);
+
+    //Define a call back that needs to called once the DFG matches the pattern
+    ngraph::pattern::gr_callback_fn callback =
+        [input](pattern::Matcher& m) -> std::shared_ptr<Node> {
+        NGRAPH_DEBUG << "In a callback for construct_fprop_sigmoid pattern against "
+                     << m.match_root()->get_name();
+        auto pattern_map = m.get_pattern_map();
+
+        if (m.match_root()->get_element_type() != element::f32)
+        {
+            NGRAPH_DEBUG << "mpattern = " << m.match_root()->get_name() << " type is not float!";
+            return nullptr;
+        }
+
+        if (m.match_root()->get_outputs().size() != pattern_map[input]->get_outputs().size())
+        {
+            NGRAPH_DEBUG << "mpattern = " << m.match_root()->get_name()
+                         << "input= " << pattern_map[input]->get_name() << "size dont match!";
+            return nullptr;
+        }
+
+        auto sigmoid_node = std::make_shared<op::Sigmoid>(pattern_map[input]);
+        return sigmoid_node;
+    };
+
+    auto m = std::make_shared<ngraph::pattern::Matcher>(divide_1_over_exp, callback);
+    this->add_matcher(m);
+}
--- a/src/ngraph/runtime/cpu/pass/cpu_fusion.hpp
+++ b/src/ngraph/runtime/cpu/pass/cpu_fusion.hpp
@@ -43,12 +43,14 @@ public:
        construct_fprop_bn();
        construct_zero_padded_reshaped_conv();
        construct_zero_padded_conv();
+        construct_sigmoid();
    }

 private:
    void construct_matmul_pattern();
    void construct_matmulbias_pattern();
    void construct_fprop_bn();
+    void construct_sigmoid();
    void construct_zero_padded_reshaped_conv();
    void construct_zero_padded_conv();
 };
--- a/src/ngraph/runtime/cpu/pass/cpu_layout.cpp
+++ b/src/ngraph/runtime/cpu/pass/cpu_layout.cpp
@@ -28,7 +28,9 @@
 #include "ngraph/log.hpp"
 #include "ngraph/ops/add.hpp"
 #include "ngraph/ops/avg_pool.hpp"
+#include "ngraph/ops/batch_norm.hpp"
 #include "ngraph/ops/convolution.hpp"
+#include "ngraph/ops/get_output_element.hpp"
 #include "ngraph/ops/op.hpp"
 #include "ngraph/ops/relu.hpp"
 #include "ngraph/ops/result.hpp"
@@ -36,6 +38,7 @@
 #include "ngraph/runtime/cpu/cpu_op_annotations.hpp"
 #include "ngraph/runtime/cpu/mkldnn_utils.hpp"
 #include "ngraph/runtime/cpu/ops/convert_layout.hpp"
+#include "ngraph/runtime/cpu/ops/sigmoid.hpp"

 using namespace std;
 using namespace mkldnn;
@@ -640,6 +643,17 @@ namespace ngraph
                    set_output_layouts(node, prim_output_formats);
                }

+                template <>
+                void CPULayout::LAYOUT_DECL(ngraph::op::GetOutputElement)
+                {
+                    auto goe = static_cast<const ngraph::op::GetOutputElement*>(node.get());
+                    auto input_layout = runtime::cpu::mkldnn_utils::get_input_mkldnn_format(
+                        node.get(), goe->get_n());
+                    vector<memory::format> prim_output_formats;
+                    prim_output_formats.push_back(input_layout);
+                    set_output_layouts(node, prim_output_formats);
+                }
+
                template <>
                void CPULayout::LAYOUT_DECL(ngraph::op::Relu)
                {
@@ -657,6 +671,23 @@ namespace ngraph
                    }
                }

+                template <>
+                void CPULayout::LAYOUT_DECL(ngraph::op::Sigmoid)
+                {
+                    if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node.get()))
+                    {
+                        auto input_layout =
+                            runtime::cpu::mkldnn_utils::get_input_mkldnn_format(node.get(), 0);
+                        vector<memory::format> prim_output_formats;
+                        prim_output_formats.push_back(input_layout);
+                        set_output_layouts(node, prim_output_formats);
+                    }
+                    else
+                    {
+                        set_default_layouts(external_function, node);
+                    }
+                }
+
                template <>
                void CPULayout::LAYOUT_DECL(ngraph::op::ReluBackprop)
                {
@@ -680,6 +711,32 @@ namespace ngraph
                    }
                }

+                template <>
+                void CPULayout::LAYOUT_DECL(ngraph::op::BatchNorm)
+                {
+                    if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node.get()))
+                    {
+                        auto input_layout =
+                            runtime::cpu::mkldnn_utils::get_input_mkldnn_format(node.get(), 2);
+
+                        vector<memory::format> prim_input_formats;
+                        vector<memory::format> prim_output_formats;
+                        prim_input_formats.push_back(memory::format::x);
+                        prim_input_formats.push_back(memory::format::x);
+                        prim_input_formats.push_back(input_layout);
+                        prim_output_formats.push_back(input_layout);
+                        prim_output_formats.push_back(memory::format::x);
+                        prim_output_formats.push_back(memory::format::x);
+                        node =
+                            insert_input_conversions(external_function, node, prim_input_formats);
+                        set_output_layouts(node, prim_output_formats);
+                    }
+                    else
+                    {
+                        throw ngraph_error("Batchnorm only supported in MKLDNN for now");
+                    }
+                }
+
                template <>
                void CPULayout::LAYOUT_DECL(ngraph::op::Add)
                {
@@ -719,10 +776,14 @@ static const runtime::cpu::pass::LayoutOpMap s_dispatcher{
    {TI(ngraph::op::AvgPool), &runtime::cpu::pass::CPULayout::layout<ngraph::op::AvgPool>},
    {TI(ngraph::op::AvgPoolBackprop),
     &runtime::cpu::pass::CPULayout::layout<ngraph::op::AvgPoolBackprop>},
+    {TI(ngraph::op::BatchNorm), &runtime::cpu::pass::CPULayout::layout<ngraph::op::BatchNorm>},
+    {TI(ngraph::op::GetOutputElement),
+     &runtime::cpu::pass::CPULayout::layout<ngraph::op::GetOutputElement>},
    {TI(ngraph::op::Relu), &runtime::cpu::pass::CPULayout::layout<ngraph::op::Relu>},
    {TI(ngraph::op::Result), &runtime::cpu::pass::CPULayout::layout<ngraph::op::Result>},
    {TI(ngraph::op::ReluBackprop),
     &runtime::cpu::pass::CPULayout::layout<ngraph::op::ReluBackprop>},
+    {TI(ngraph::op::Sigmoid), &runtime::cpu::pass::CPULayout::layout<ngraph::op::Sigmoid>},
 };

 bool runtime::cpu::pass::CPULayout::run_on_call_graph(const std::list<std::shared_ptr<Node>>& nodes)

--- a/src/ngraph/runtime/gpu/gpu_emitter.cpp
+++ b/src/ngraph/runtime/gpu/gpu_emitter.cpp
@@ -288,7 +288,6 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
                    writer << "int n = " << arg1_shape[1] << ";\n";
                    writer << "int k = " << arg0_shape[0] << ";\n";
                    writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_HOST);\n";
-
                    writer << "cublasSgemm("
                           << "cublas_handle,"
                           << "CUBLAS_OP_N,"
@@ -303,7 +302,6 @@ cudnnSetOpTensorDescriptor(opTensorDesc,
                           << "&beta," // beta
                           << out[0].get_name() << ","
                           << "n);\n";
-
                    writer << "cublasSetPointerMode(cublas_handle, CUBLAS_POINTER_MODE_DEVICE);\n";
                    writer.indent--;
                    writer << "}\n";

--- a/src/ngraph/runtime/gpu/gpu_external_function.cpp
+++ b/src/ngraph/runtime/gpu/gpu_external_function.cpp
@@ -758,7 +758,6 @@ using namespace std;
                            }
                        }
                    }
-
                    writer.indent--;
                    // End generated function
                    writer += "}\n\n";

--- a/test/cpu_fusion.cpp
+++ b/test/cpu_fusion.cpp
@@ -37,6 +37,7 @@
 #include "ngraph/pass/reshape_elimination.hpp"
 #include "ngraph/pass/visualize_tree.hpp"
 #include "ngraph/runtime/cpu/ops/matmul_bias.hpp"
+#include "ngraph/runtime/cpu/ops/sigmoid.hpp"
 #include "ngraph/runtime/cpu/pass/cpu_fusion.hpp"
 #include "ngraph/serializer.hpp"
 #include "ngraph/util.hpp"
@@ -91,11 +92,89 @@ TEST(cpu_fusion, gemm_pattern)
    ASSERT_EQ(n.get_pattern_map()[x], B);
    ASSERT_EQ(n.get_pattern_map()[b], C);

-    auto cg =
-        make_shared<op::MatmulBias>(W, x, broadcast, W->get_shape(), x->get_shape(), false, false);
+    auto cg = make_shared<op::MatmulBias>(
+        W, x, C, W->get_shape(), x->get_shape(), false, false, AxisSet{0});
+}
+
+TEST(cpu_fusion, gemm_cpu_broadcast_row)
+{
+    Shape shapeA{3, 2};
+    Shape shapeB{2, 3};
+    Shape shapeC{2, 2};
+    auto A = make_shared<op::Parameter>(element::f32, shapeA);
+    auto B = make_shared<op::Parameter>(element::f32, shapeB);
+
+    auto reshape_w = make_shared<op::Reshape>(A, AxisVector{1, 0}, Shape{2, 3});
+    auto reshape_x = make_shared<op::Reshape>(B, AxisVector{1, 0}, Shape{3, 2});
+
+    auto one = op::Constant::create<float>(element::f32, Shape{2}, std::vector<float>{1.0f, 1.0f});
+
+    auto broadcast = make_shared<op::Broadcast>(one, shapeC, AxisSet{0});
+    auto cg = make_shared<op::MatmulBias>(
+        A, B, one, A->get_shape(), B->get_shape(), true, true, AxisSet{0});
+
+    auto f = make_shared<Function>(cg, op::ParameterVector{A, B});
+
+    auto manager = runtime::Manager::get("CPU");
+    auto external = manager->compile(f);
+    auto backend = manager->allocate_backend();
+    auto cf = backend->make_call_frame(external);
+
+    shared_ptr<runtime::TensorView> a = backend->make_primary_tensor_view(element::f32, shapeA);
+    shared_ptr<runtime::TensorView> b = backend->make_primary_tensor_view(element::f32, shapeB);
+    shared_ptr<runtime::TensorView> result =
+        backend->make_primary_tensor_view(element::f32, shapeC);
+
+    vector<float> dataA{1.0f, 4.0f, 1.0f, 4.0f, 1.0f, 4.0f};
+    vector<float> dataB{3.0f, 3.0f, 3.0f, 9.0f, 9.0f, 9.0f};
+    copy_data(a, dataA);
+    copy_data(b, dataB);
+
+    cf->call({a, b}, {result});
+    vector<float> expected{10, 28, 37, 109};
+    ASSERT_TRUE(read_vector<float>(result) == expected);
+}
+
+TEST(cpu_fusion, gemm_cpu_broadcast_column)
+{
+    Shape shapeA{3, 2};
+    Shape shapeB{2, 3};
+    Shape shapeC{2, 2};
+    auto A = make_shared<op::Parameter>(element::f32, shapeA);
+    auto B = make_shared<op::Parameter>(element::f32, shapeB);
+
+    auto reshape_w = make_shared<op::Reshape>(A, AxisVector{1, 0}, Shape{2, 3});
+    auto reshape_x = make_shared<op::Reshape>(B, AxisVector{1, 0}, Shape{3, 2});
+
+    auto one = op::Constant::create<float>(element::f32, Shape{2}, std::vector<float>{1.0f, 1.0f});
+
+    auto broadcast = make_shared<op::Broadcast>(one, shapeC, AxisSet{1});
+    auto cg = make_shared<op::MatmulBias>(
+        A, B, one, A->get_shape(), B->get_shape(), true, true, AxisSet{1});
+
+    auto f = make_shared<Function>(cg, op::ParameterVector{A, B});
+
+    auto manager = runtime::Manager::get("CPU");
+    auto external = manager->compile(f);
+    auto backend = manager->allocate_backend();
+    auto cf = backend->make_call_frame(external);
+
+    shared_ptr<runtime::TensorView> a = backend->make_primary_tensor_view(element::f32, shapeA);
+    shared_ptr<runtime::TensorView> b = backend->make_primary_tensor_view(element::f32, shapeB);
+    shared_ptr<runtime::TensorView> result =
+        backend->make_primary_tensor_view(element::f32, shapeC);
+
+    vector<float> dataA{1.0f, 4.0f, 1.0f, 4.0f, 1.0f, 4.0f};
+    vector<float> dataB{3.0f, 3.0f, 3.0f, 9.0f, 9.0f, 9.0f};
+    copy_data(a, dataA);
+    copy_data(b, dataB);
+
+    cf->call({a, b}, {result});
+    vector<float> expected{10, 28, 37, 109};
+    ASSERT_TRUE(read_vector<float>(result) == expected);
 }

-TEST(cpu_fusion, gemm_cpu)
+TEST(cpu_fusion, gemm_cpu_broadcast_matrix)
 {
    Shape shapeA{3, 2};
    Shape shapeB{2, 3};
@@ -109,8 +188,8 @@ TEST(cpu_fusion, gemm_cpu)
    auto one = op::Constant::create<float>(element::f32, Shape{}, std::vector<float>{1.0f});

    auto broadcast = make_shared<op::Broadcast>(one, shapeC, AxisSet{0, 1});
-    auto cg =
-        make_shared<op::MatmulBias>(A, B, broadcast, A->get_shape(), B->get_shape(), true, true);
+    auto cg = make_shared<op::MatmulBias>(
+        A, B, one, A->get_shape(), B->get_shape(), true, true, AxisSet{0, 1});

    auto f = make_shared<Function>(cg, op::ParameterVector{A, B});

@@ -212,7 +291,7 @@ TEST(cpu_fusion, cpu_fusion_pass_matmul_bias)
    pass_manager.run_passes(func);
    auto gmm = graph->get_input_op(0);
    ASSERT_TRUE(std::dynamic_pointer_cast<op::MatmulBias>(gmm));
-    ASSERT_EQ(gmm->get_input_op(2), broadcast);
+    ASSERT_EQ(gmm->get_input_op(2), b);
 }

 TEST(cpu_fusion, cpu_fusion_pass_matmul_no_bias)
@@ -593,3 +672,64 @@ TEST(cpu_fusion, non_zero_padded_conv)

    ASSERT_EQ(count_ops_of_type<op::Pad>(func), 1);
 }
+
+TEST(cpu_fusion, sigmoid_fprop_fusion)
+{
+    pass::Manager pass_manager;
+    pass_manager.register_pass<runtime::cpu::pass::CPUFusion>();
+    const string json_path = file_util::path_join(SERIALIZED_ZOO, "mxnet/Graph_fprop_sigmoid.json");
+    const string json_string = file_util::read_file_to_string(json_path);
+    stringstream ss(json_string);
+    shared_ptr<Function> func = ngraph::deserialize(ss);
+    pass_manager.run_passes(func);
+    size_t ccg = count_ops_of_type<op::Sigmoid>(func);
+    ASSERT_EQ(ccg, 1);
+}
+
+TEST(cpu_fusion, sigmoid_n1c1h2w2)
+{
+    auto input = make_shared<op::Parameter>(element::f32, Shape{1, 1, 2, 2});
+    auto sigmoid_node = make_shared<op::Sigmoid>(input);
+    auto func = make_shared<Function>(sigmoid_node, op::ParameterVector{input});
+
+    auto manager = runtime::Manager::get("CPU");
+    auto external = manager->compile(func);
+    auto backend = manager->allocate_backend();
+    auto cf = backend->make_call_frame(external);
+
+    shared_ptr<runtime::TensorView> a =
+        backend->make_primary_tensor_view(element::f32, input->get_shape());
+    shared_ptr<runtime::TensorView> result =
+        backend->make_primary_tensor_view(element::f32, input->get_shape());
+
+    vector<float> dataA{1.0f, 4.0f, 1.0f, 4.0f};
+    copy_data(a, dataA);
+
+    cf->call({a}, {result});
+    vector<float> expected{0.73105858f, 0.98201379f, 0.73105858f, 0.98201379f};
+    ASSERT_TRUE(read_vector<float>(result) == expected);
+}
+
+TEST(cpu_fusion, sigmoid_n1c1h4)
+{
+    auto input = make_shared<op::Parameter>(element::f32, Shape{1, 1, 4});
+    auto sigmoid_node = make_shared<op::Sigmoid>(input);
+    auto func = make_shared<Function>(sigmoid_node, op::ParameterVector{input});
+
+    auto manager = runtime::Manager::get("CPU");
+    auto external = manager->compile(func);
+    auto backend = manager->allocate_backend();
+    auto cf = backend->make_call_frame(external);
+
+    shared_ptr<runtime::TensorView> a =
+        backend->make_primary_tensor_view(element::f32, input->get_shape());
+    shared_ptr<runtime::TensorView> result =
+        backend->make_primary_tensor_view(element::f32, input->get_shape());
+
+    vector<float> dataA{1.0f, 4.0f, 1.0f, 4.0f};
+    copy_data(a, dataA);
+
+    cf->call({a}, {result});
+    vector<float> expected{0.73105858f, 0.98201379f, 0.73105858f, 0.98201379f};
+    ASSERT_TRUE(read_vector<float>(result) == expected);
+}
--- a/test/models/mxnet/Graph_fprop_sigmoid.json
+++ b/test/models/mxnet/Graph_fprop_sigmoid.json
+[{"name":"Function_0","ops":[{"element_type":"float","inputs":[],"name":"Parameter_0","op":"Parameter","outputs":["Parameter_0_0"],"shape":[3,4]},{"element_type":"float","inputs":[],"name":"Constant_1","op":"Constant","outputs":["Constant_1_0"],"shape":[],"value":["1"]},{"inputs":["Parameter_0"],"name":"Negative_3","op":"Negative","outputs":["Negative_3_0"]},{"axes":[0,1],"inputs":["Constant_1"],"name":"Broadcast_2","op":"Broadcast","outputs":["Broadcast_2_0"],"shape":[3,4]},{"inputs":["Negative_3"],"name":"Exp_4","op":"Exp","outputs":["Exp_4_0"]},{"inputs":["Broadcast_2","Exp_4"],"name":"Add_5","op":"Add","outputs":["Add_5_0"]},{"inputs":["Broadcast_2","Add_5"],"name":"Divide_6","op":"Divide","outputs":["Divide_6_0"]}],"parameters":["Parameter_0"],"result":["Divide_6"]}]
--- a/test/type_prop.cpp
+++ b/test/type_prop.cpp
@@ -17,6 +17,7 @@
 #include "gtest/gtest.h"

 #include "ngraph/ngraph.hpp"
+#include "ngraph/ops/batch_norm.hpp"

 #include <memory>
 using namespace std;
@@ -57,6 +58,96 @@ TEST(type_prop, broadcast_deduce_incorrect)
    }
 }

+TEST(type_prop, batchnorm_backprop_4d_check)
+{
+    auto dummy = make_shared<op::Parameter>(element::f32, Shape{});
+    auto param = make_shared<op::Parameter>(element::f32, Shape{2, 4});
+
+    try
+    {
+        auto bc =
+            make_shared<op::BatchNormBackprop>(0.001, dummy, dummy, param, dummy, dummy, dummy);
+        FAIL() << "Deduced type should disagree with c-tor arguments";
+    }
+    catch (const ngraph_error& error)
+    {
+        EXPECT_EQ(error.what(), std::string("Input expected to be a 4D tensor"));
+    }
+    catch (...)
+    {
+        FAIL() << "Deduced type check failed for unexpected reason";
+    }
+}
+
+TEST(type_prop, batchnorm_backprop_et_check)
+{
+    auto dummy_f32 = make_shared<op::Parameter>(element::f32, Shape{3});
+    auto dummy_f64 = make_shared<op::Parameter>(element::f64, Shape{3});
+    auto param = make_shared<op::Parameter>(element::f32, Shape{4, 3, 2, 2});
+
+    try
+    {
+        auto bc = make_shared<op::BatchNormBackprop>(
+            0.001, dummy_f32, dummy_f64, param, dummy_f32, dummy_f32, dummy_f32);
+        FAIL() << "Deduced type should disagree with c-tor arguments";
+    }
+    catch (const ngraph_error& error)
+    {
+        EXPECT_EQ(error.what(),
+                  std::string("The element type of beta isn't equal to input data's type"));
+    }
+    catch (...)
+    {
+        FAIL() << "Deduced type check failed for unexpected reason";
+    }
+}
+
+TEST(type_prop, batchnorm_backprop_shape_check)
+{
+    auto dummy = make_shared<op::Parameter>(element::f32, Shape{3});
+    auto dummy2 = make_shared<op::Parameter>(element::f32, Shape{4});
+    auto param = make_shared<op::Parameter>(element::f32, Shape{4, 3, 2, 2});
+
+    try
+    {
+        auto bc =
+            make_shared<op::BatchNormBackprop>(0.001, dummy, dummy2, param, dummy2, dummy2, dummy2);
+        FAIL() << "Deduced type should disagree with c-tor arguments";
+    }
+    catch (const ngraph_error& error)
+    {
+        EXPECT_EQ(error.what(),
+                  std::string("The shape of beta isn't equal to input channel's shape"));
+    }
+    catch (...)
+    {
+        FAIL() << "Deduced type check failed for unexpected reason";
+    }
+}
+
+TEST(type_prop, batchnorm_backprop_delta_check)
+{
+    auto dummy = make_shared<op::Parameter>(element::f32, Shape{3});
+    auto dummy2 = make_shared<op::Parameter>(element::f32, Shape{4});
+    auto param = make_shared<op::Parameter>(element::f32, Shape{4, 3, 2, 2});
+    auto delta = make_shared<op::Parameter>(element::f32, Shape{4, 3, 2, 3});
+
+    try
+    {
+        auto bc =
+            make_shared<op::BatchNormBackprop>(0.001, dummy, dummy, param, dummy, dummy, delta);
+        FAIL() << "Deduced type should disagree with c-tor arguments";
+    }
+    catch (const ngraph_error& error)
+    {
+        EXPECT_EQ(error.what(), std::string("delta shape is expected to be equal to input shape"));
+    }
+    catch (...)
+    {
+        FAIL() << "Deduced type check failed for unexpected reason";
+    }
+}
+
 TEST(type_prop, concat_deduce)
 {
    // Deduce type