Commit c637d629 authored by gaurides, committed by Robert Kimball

Gauri/groupconv batchnorm (#1900)

* Initial implementation of GroupConv+BatchNorm fusion

* Added GroupConv+BatchNorm with Relu fusion

* Added changes to fuse with BoundedRelu

* Changed BoundedRelu to Relu

* Added test; Code cleanup

* Code formatting

* Removed dead code

* Added test cases and other miscellaneous changes

* Bug fix in group conv callback and general cleanup

* Address PR feedback

* Minor edit to comment. MKLDNN divides both input and output channels by groups

* Style fixes and PR feedback
parent 2a49f1c8
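
Summary of the change: a GroupConvolution followed by an inference-mode BatchNorm (global stats) is folded into a single new GroupConvolutionBias op, and a trailing Relu can be absorbed as an MKLDNN eltwise post-op. A sketch of the folding applied by the new CPUFusion matchers (same symbols as in the callback further down), per output channel:

new_weights = weights * gamma / sqrt(variance + eps)
new_bias    = beta - mean * gamma / sqrt(variance + eps)

For MKLDNN the filters are also reshaped from o,i,h,w to the grouped g,o/g,i/g,h,w (goihw) layout, since MKLDNN divides both input and output channels by the number of groups.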
......@@ -82,6 +82,7 @@ set(SRC
op/batch_norm_relu.cpp
op/bounded_relu.cpp
op/group_conv.cpp
op/group_conv_bias.cpp
op/halide_op.cpp
op/conv_bias.cpp
op/conv_relu.cpp
......
......@@ -19,6 +19,7 @@
#include "ngraph/runtime/cpu/mkldnn_invoke.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#include "ngraph/runtime/cpu/op/group_conv.hpp"
#include "ngraph/runtime/cpu/op/group_conv_bias.hpp"
using namespace std;
using namespace ngraph;
......@@ -51,17 +52,27 @@ namespace ngraph
else if (input_desc.data.format == mkldnn_nchw && input_desc.data.ndims == 4 &&
result_desc.data.ndims == 5 && node->get_users().size() == 1)
{
Shape weights_shape_groups;
if (auto gconv = std::dynamic_pointer_cast<ngraph::op::GroupConvolution>(
node->get_users()[0]))
{
weights_shape_groups = gconv->get_weights_dimensions();
}
else if (auto gconvb =
std::dynamic_pointer_cast<ngraph::op::GroupConvolutionBias>(
node->get_users()[0]))
{
weights_shape_groups = gconvb->get_weights_dimensions();
}
else
{
throw ngraph_error("Incompatible input/output shape in ConvertLayout op");
}
input_desc = mkldnn::memory::desc(
mkldnn::memory::dims(weights_shape_groups.begin(),
weights_shape_groups.end()),
mkldnn_utils::get_mkldnn_data_type(args[0].get_element_type()),
mkldnn::memory::format::goihw);
}
size_t reorder_index = mkldnn_emitter->build_reorder(input_desc, result_desc);
......
......@@ -23,6 +23,7 @@
#include "ngraph/runtime/cpu/op/conv_bias.hpp"
#include "ngraph/runtime/cpu/op/conv_relu.hpp"
#include "ngraph/runtime/cpu/op/group_conv.hpp"
#include "ngraph/runtime/cpu/op/group_conv_bias.hpp"
using namespace std;
using namespace ngraph;
......@@ -489,6 +490,81 @@ namespace ngraph
}
}
template <>
void Builder::BUILDER_DECL(ngraph::op::GroupConvolutionBias)
{
auto& functors = external_function->get_functors();
auto& arg0_tensor = external_function->get_tensor_data(args[0].get_name());
auto& arg1_tensor = external_function->get_tensor_data(args[1].get_name());
auto& arg2_tensor = external_function->get_tensor_data(args[2].get_name());
auto& out_tensor = external_function->get_tensor_data(out[0].get_name());
auto convolution = static_cast<const ngraph::op::GroupConvolutionBias*>(node);
auto arg0_shape = args[0].get_shape();
auto arg1_shape = args[1].get_shape();
auto arg2_shape = args[2].get_shape();
auto result_shape = out[0].get_shape();
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
Strides window_dilation_strides_adjusted;
for (size_t s : convolution->get_window_dilation_strides())
{
window_dilation_strides_adjusted.push_back(s - 1);
}
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto input_data_desc = mkldnn_utils::get_input_mkldnn_md(node, 0);
auto weights_desc = mkldnn_utils::get_input_mkldnn_md(node, 1);
auto bias_desc = mkldnn_utils::get_input_mkldnn_md(node, 2);
auto result_desc = mkldnn_utils::get_output_mkldnn_md(node, 0);
auto padding_below = convolution->get_padding_below();
auto padding_above = convolution->get_padding_above();
auto filter_strides = convolution->get_window_movement_strides();
const float ops_scale = 1.f;
const float ops_alpha = -0.f; // relu negative slope
const float ops_beta = 0.f;
mkldnn::post_ops ops;
if (convolution->with_relu())
{
ops.append_eltwise(
ops_scale, mkldnn::algorithm::eltwise_relu, ops_alpha, ops_beta);
}
size_t conv_index =
mkldnn_emitter->build_convolution_forward(input_data_desc,
weights_desc,
bias_desc,
result_desc,
filter_strides,
window_dilation_strides_adjusted,
padding_below,
padding_above,
ops);
auto& deps = mkldnn_emitter->get_primitive_deps(conv_index);
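// deps[0..3] are the MKLDNN memory primitive indices in the order the primitive
// was built above: input data, weights, bias, output.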
auto functor = [&, conv_index](CPURuntimeContext* ctx) {
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], arg1_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[2], arg2_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[3], out_tensor);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, conv_index);
};
functors.emplace_back(functor);
}
else
{
throw ngraph_error("unsupported parameters for GroupConvolutionBias");
}
}
REGISTER_OP_BUILDER(Convolution);
REGISTER_OP_BUILDER(ConvolutionRelu);
REGISTER_OP_BUILDER(ConvolutionBias);
......@@ -498,6 +574,7 @@ namespace ngraph
REGISTER_OP_BUILDER(ConvolutionBiasBackpropFiltersBias);
REGISTER_OP_BUILDER(GroupConvolution);
REGISTER_OP_BUILDER(ConvolutionAdd);
REGISTER_OP_BUILDER(GroupConvolutionBias);
}
}
}
......@@ -109,6 +109,7 @@
#include "ngraph/runtime/cpu/op/conv_relu.hpp"
#include "ngraph/runtime/cpu/op/convert_layout.hpp"
#include "ngraph/runtime/cpu/op/group_conv.hpp"
#include "ngraph/runtime/cpu/op/group_conv_bias.hpp"
#include "ngraph/runtime/cpu/op/loop_kernel.hpp"
#include "ngraph/runtime/cpu/op/lstm.hpp"
#include "ngraph/runtime/cpu/op/matmul_bias.hpp"
......@@ -2828,6 +2829,77 @@ namespace ngraph
}
}
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::GroupConvolutionBias)
{
auto convolution = static_cast<const ngraph::op::GroupConvolutionBias*>(node);
auto arg0_shape = args[0].get_shape();
auto arg1_shape = args[1].get_shape();
auto arg2_shape = args[2].get_shape();
auto result_shape = out[0].get_shape();
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
Strides window_dilation_strides_adjusted;
for (size_t s : convolution->get_window_dilation_strides())
{
window_dilation_strides_adjusted.push_back(s - 1);
}
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto input_data_desc = mkldnn_utils::get_input_mkldnn_md(node, 0);
auto weights_desc = mkldnn_utils::get_input_mkldnn_md(node, 1);
auto bias_desc = mkldnn_utils::get_input_mkldnn_md(node, 2);
auto result_desc = mkldnn_utils::get_output_mkldnn_md(node, 0);
auto padding_below = convolution->get_padding_below();
auto padding_above = convolution->get_padding_above();
auto filter_strides = convolution->get_window_movement_strides();
const float ops_scale = 1.f;
const float ops_alpha = -0.f; // relu negative slope
const float ops_beta = 0.f;
mkldnn::post_ops ops;
if (convolution->with_relu())
{
ops.append_eltwise(
ops_scale, mkldnn::algorithm::eltwise_relu, ops_alpha, ops_beta);
}
size_t conv_index =
mkldnn_emitter->build_convolution_forward(input_data_desc,
weights_desc,
bias_desc,
result_desc,
filter_strides,
window_dilation_strides_adjusted,
padding_below,
padding_above,
ops);
// invoke group convolution bias
auto& deps = mkldnn_emitter->get_primitive_deps(conv_index);
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[0])
<< ", " << args[0].get_name() << ");\n";
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[1])
<< ", " << args[1].get_name() << ");\n";
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[2])
<< ", " << args[2].get_name() << ");\n";
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[3])
<< ", " << out[0].get_name() << ");\n";
writer << "cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, "
<< to_string(conv_index) << ");\n";
}
else
{
throw ngraph_error("Unsupported parameters for GroupConvolutionBias");
}
}
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::Convolution)
{
......@@ -3920,17 +3992,27 @@ namespace ngraph
result_desc.data.ndims == 5 /*Goihw16g/Goihw8g/etc*/ &&
node->get_users().size() == 1)
{
Shape weights_shape_groups;
if (auto gconv = std::dynamic_pointer_cast<ngraph::op::GroupConvolution>(
node->get_users()[0]))
{
weights_shape_groups = gconv->get_weights_dimensions();
}
else if (auto gconvb =
std::dynamic_pointer_cast<ngraph::op::GroupConvolutionBias>(
node->get_users()[0]))
{
weights_shape_groups = gconvb->get_weights_dimensions();
}
else
{
throw ngraph_error("Incompatible input/output shape in ConvertLayout op");
}
input_desc = mkldnn::memory::desc(
mkldnn::memory::dims(weights_shape_groups.begin(),
weights_shape_groups.end()),
mkldnn_utils::get_mkldnn_data_type(args[0].get_element_type()),
mkldnn::memory::format::goihw);
}
size_t reorder_index = mkldnn_emitter->build_reorder(input_desc, result_desc);
......
......@@ -145,6 +145,7 @@
#include "ngraph/runtime/cpu/op/conv_relu.hpp"
#include "ngraph/runtime/cpu/op/convert_layout.hpp"
#include "ngraph/runtime/cpu/op/group_conv.hpp"
#include "ngraph/runtime/cpu/op/group_conv_bias.hpp"
#include "ngraph/runtime/cpu/op/loop_kernel.hpp"
#include "ngraph/runtime/cpu/op/lstm.hpp"
#include "ngraph/runtime/cpu/op/matmul_bias.hpp"
......@@ -360,6 +361,8 @@ static const runtime::cpu::OpMap dispatcher{
{TI(ngraph::op::ConvolutionAdd), &runtime::cpu::CPU_Emitter::emit<op::ConvolutionAdd>},
{TI(ngraph::op::Quantize), &runtime::cpu::CPU_Emitter::emit<op::Quantize>},
{TI(ngraph::op::Dequantize), &runtime::cpu::CPU_Emitter::emit<op::Dequantize>},
{TI(ngraph::op::GroupConvolutionBias),
&runtime::cpu::CPU_Emitter::emit<op::GroupConvolutionBias>},
};
......
......@@ -15,6 +15,8 @@
//*****************************************************************************
#include "cpu_visualize_tree.hpp"
#include "ngraph/op/reshape.hpp"
#include "ngraph/runtime/cpu/cpu_layout_descriptor.hpp"
#include "ngraph/runtime/cpu/cpu_op_annotations.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#include "ngraph/runtime/cpu/op/convert_layout.hpp"
......@@ -37,6 +39,38 @@ static void visualize_convert_layout(const Node& node, ostream& ss)
ss << " ";
}
static void visualize_reshape(const Node& node, ostream& ss)
{
try
{
auto input_desc = node.get_inputs().at(0).get_tensor().get_tensor_layout();
auto result_desc = node.get_output_tensor_ptr()->get_tensor_layout();
auto in_tvl = static_pointer_cast<runtime::cpu::LayoutDescriptor>(input_desc);
auto out_tvl = static_pointer_cast<runtime::cpu::LayoutDescriptor>(result_desc);
if (!in_tvl || !out_tvl)
{
return;
}
if (!in_tvl->is_mkldnn_layout() || !out_tvl->is_mkldnn_layout())
{
return;
}
ss << "\nin="
<< runtime::cpu::mkldnn_utils::get_mkldnn_format_string(
static_cast<mkldnn::memory::format>(in_tvl->get_mkldnn_md().data.format));
ss << " out="
<< runtime::cpu::mkldnn_utils::get_mkldnn_format_string(
static_cast<mkldnn::memory::format>(out_tvl->get_mkldnn_md().data.format));
ss << " ";
}
catch (...)
{
NGRAPH_DEBUG << "Exception in visualize_reshape \n";
}
}
namespace ngraph
{
namespace runtime
......@@ -46,7 +80,8 @@ namespace ngraph
const visualize_tree_ops_map_t& get_visualize_tree_ops_map()
{
const static visualize_tree_ops_map_t vtom{
{TI(runtime::cpu::op::ConvertLayout), visualize_convert_layout}};
{TI(runtime::cpu::op::ConvertLayout), visualize_convert_layout},
{TI(ngraph::op::Reshape), visualize_reshape}};
return vtom;
}
}
......
......@@ -40,32 +40,6 @@ using namespace mkldnn;
using namespace ngraph;
using namespace std;
#define TI(x) std::type_index(typeid(x))
std::unordered_set<std::type_index>& runtime::cpu::mkldnn_utils::get_op_registry()
{
static std::unordered_set<std::type_index> s_op_registry{
TI(ngraph::op::Add),
TI(ngraph::op::AvgPool),
TI(ngraph::op::AvgPoolBackprop),
TI(ngraph::op::BatchNormTraining),
TI(ngraph::op::BatchNormInference),
TI(ngraph::op::BatchNormTrainingBackprop),
TI(ngraph::op::Concat),
TI(ngraph::op::Convolution),
TI(ngraph::op::ConvolutionBackpropData),
TI(ngraph::op::ConvolutionBackpropFilters),
TI(ngraph::op::ConvolutionBias),
TI(ngraph::op::ConvolutionRelu),
TI(ngraph::op::ConvolutionBiasBackpropFiltersBias),
TI(ngraph::op::MaxPool),
TI(ngraph::op::MaxPoolBackprop),
TI(ngraph::op::Relu),
TI(ngraph::op::ReluBackprop),
TI(ngraph::op::Reshape)};
return s_op_registry;
}
std::map<element::Type, const mkldnn::memory::data_type>&
runtime::cpu::mkldnn_utils::get_mkldnn_data_type_map()
{
......@@ -107,7 +81,6 @@ std::map<element::Type, const std::string>&
std::map<memory::format, const std::string>&
runtime::cpu::mkldnn_utils::get_mkldnn_format_string_map()
{
// TODO (jbobba): Add the rest of memory formats to this map as well
static std::map<memory::format, const std::string> s_mkldnn_format_string_map{
{memory::format::format_undef, "memory::format::format_undef"},
{memory::format::any, "memory::format::any"},
......@@ -119,37 +92,78 @@ std::map<memory::format, const std::string>&
{memory::format::chwn, "memory::format::chwn"},
{memory::format::nChw8c, "memory::format::nChw8c"},
{memory::format::nChw16c, "memory::format::nChw16c"},
{memory::format::ncdhw, "memory::format::ndhwc"},
{memory::format::ncdhw, "memory::format::ndhwc"},
{memory::format::ncdhw, "memory::format::ncdhw"},
{memory::format::ndhwc, "memory::format::ndhwc"},
{memory::format::nCdhw8c, "memory::format::nCdhw8c"},
{memory::format::nCdhw16c, "memory::format::nCdhw16c"},
{memory::format::oi, "memory::format::oi"},
{memory::format::io, "memory::format::io"},
{memory::format::oihw, "memory::format::oihw"},
{memory::format::ihwo, "memory::format::ihwo"},
{memory::format::hwio, "memory::format::hwio"},
{memory::format::dhwio, "memory::format::dhwio"},
{memory::format::oidhw, "memory::format::oidhw"},
{memory::format::OIdhw8i8o, "memory::format::OIdhw8i8o"},
{memory::format::OIdhw8o8i, "memory::format::OIdhw8o8i"},
{memory::format::Odhwi8o, "memory::format::Odhwi8o"},
{memory::format::OIdhw16i16o, "memory::format::OIdhw16i16o"},
{memory::format::OIdhw16o16i, "memory::format::OIdhw16o16i"},
{memory::format::Oidhw16o, "memory::format::Oidhw16o"},
{memory::format::Odhwi16o, "memory::format::Odhwi16o"},
{memory::format::oIhw8i, "memory::format::oIhw8i"},
{memory::format::oIhw16i, "memory::format::oIhw16i"},
{memory::format::oIdhw8i, "memory::format::oIdhw8i"},
{memory::format::oIdhw16i, "memory::format::oIdhw16i"},
{memory::format::OIhw8i8o, "memory::format::OIhw8i8o"},
{memory::format::OIhw16i16o, "memory::format::OIhw16i16o"},
{memory::format::IOhw16o16i, "memory::format::IOhw16o16i"},
{memory::format::OIhw8o8i, "memory::format::OIhw8o8i"},
{memory::format::OIhw16o16i, "memory::format::OIhw16o16i"},
{memory::format::OIhw8i16o2i, "memory::format::OIhw8i16o2i"},
{memory::format::OIdhw8i16o2i, "memory::format::OIdhw8i16o2i"},
{memory::format::OIhw8o16i2o, "memory::format::OIhw8o16i2o"},
{memory::format::OIhw4i16o4i, "memory::format::OIhw4i16o4i"},
{memory::format::Oihw8o, "memory::format::Oihw8o"},
{memory::format::Oihw16o, "memory::format::Oihw16o"},
{memory::format::Ohwi8o, "memory::format::Ohwi8o"},
{memory::format::Ohwi16o, "memory::format::Ohwi16o"},
{memory::format::OhIw16o4i, "memory::format::OhIw16o4i"},
{memory::format::goihw, "memory::format::goihw"},
{memory::format::hwigo, "memory::format::hwigo"},
{memory::format::gOIdhw8i8o, "memory::format::gOIdhw8i8o"},
{memory::format::gOIdhw8o8i, "memory::format::gOIdhw8o8i"},
{memory::format::gOdhwi8o, "memory::format::gOdhwi8o"},
{memory::format::gOIhw8i8o, "memory::format::gOIhw8i8o"},
{memory::format::gOIhw16i16o, "memory::format::gOIhw16i16o"},
{memory::format::gOIhw8i16o2i, "memory::format::gOIhw8i16o2i"},
{memory::format::gOIdhw8i16o2i, "memory::format::gOIdhw8i16o2i"},
{memory::format::gOIhw8o16i2o, "memory::format::gOIhw8o16i2o"},
{memory::format::gOIhw4i16o4i, "memory::format::gOIhw4i16o4i"},
{memory::format::gOihw8o, "memory::format::gOihw8o"},
{memory::format::gOihw16o, "memory::format::gOihw16o"},
{memory::format::gOhwi8o, "memory::format::gOhwi8o"},
{memory::format::gOhwi16o, "memory::format::gOhwi16o"},
{memory::format::Goihw8g, "memory::format::Goihw8g"},
{memory::format::Goihw16g, "memory::format::Goihw16g"},
{memory::format::gOIhw8o8i, "memory::format::gOIhw8o8i"},
{memory::format::gOIhw16o16i, "memory::format::gOIhw16o16i"},
{memory::format::gIOhw16o16i, "memory::format::gIOhw16o16i"},
{memory::format::gOhIw16o4i, "memory::format::gOhIw16o4i"},
{memory::format::goidhw, "memory::format::goidhw"},
{memory::format::gOIdhw16i16o, "memory::format::gOIdhw16i16o"},
{memory::format::gOIdhw16o16i, "memory::format::gOIdhw16o16i"},
{memory::format::gOidhw16o, "memory::format::gOidhw16o"},
{memory::format::gOdhwi16o, "memory::format::gOdhwi16o"},
{memory::format::ntc, "memory::format::ntc"},
{memory::format::tnc, "memory::format::tnc"},
{memory::format::ldsnc, "memory::format::ldsnc"},
{memory::format::ldigo, "memory::format::ldigo"},
{memory::format::ldigo_p, "memory::format::ldigo_p"},
{memory::format::ldgoi, "memory::format::ldgoi"},
{memory::format::ldgoi_p, "memory::format::ldgoi_p"},
{memory::format::ldgo, "memory::format::ldgo"},
{memory::format::wino_fmt, "memory::format::wino_fmt"},
{memory::format::format_last, "memory::format::format_last"},
};
return s_mkldnn_format_string_map;
}
......@@ -181,10 +195,6 @@ std::set<memory::format>& runtime::cpu::mkldnn_utils::get_filter_formats()
memory::format::OhIw16o4i};
return s_filter_formats;
}
bool runtime::cpu::mkldnn_utils::IsMKLDNNOp(ngraph::Node& op)
{
return (get_op_registry().find(TI(op)) != get_op_registry().end());
}
mkldnn::memory::format runtime::cpu::mkldnn_utils::CreateNativeDataFormat(
const ngraph::runtime::cpu::LayoutDescriptor& layout)
......
......@@ -33,7 +33,6 @@ namespace ngraph
{
extern mkldnn::engine global_cpu_engine;
bool IsMKLDNNOp(ngraph::Node& op);
mkldnn::memory::format
CreateNativeDataFormat(const ngraph::runtime::cpu::LayoutDescriptor& layout);
mkldnn::memory::format CreateNativeDataFormat(const Shape& shape);
......@@ -73,7 +72,6 @@ namespace ngraph
bool use_mkldnn_kernel(const ngraph::Node* node);
std::unordered_set<std::type_index>& get_op_registry();
std::map<element::Type, const mkldnn::memory::data_type>&
get_mkldnn_data_type_map();
std::map<element::Type, const std::string>& get_mkldnn_data_type_string_map();
......
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <numeric>
#include "group_conv.hpp"
#include "group_conv_bias.hpp"
#include "ngraph/op/get_output_element.hpp"
#include "ngraph/util.hpp"
using namespace std;
using namespace ngraph;
static void validate_groupconvbias_shapes(const Shape& input_shape,
const Shape& filters_shape,
const Shape& bias_shape,
const Shape& output_shape,
size_t groups)
{
// Input - N, C, H, W
// Filter - O, I, H, W
// Output - N, C, H, W
const size_t INPUT_C = 1;
const size_t FILTER_OC = 0;
const size_t FILTER_IC = 1;
const size_t OUTPUT_C = 1;
if (bias_shape.size() != 1)
{
throw ngraph_error("GroupConvolutionBias bias is expected to be 1D, but has shape: " +
vector_to_string(bias_shape));
}
if (bias_shape[0] != filters_shape[FILTER_OC])
{
throw ngraph_error(
"GroupConvolutionBias bias element size does not match number of filters. bias_size "
"= " +
std::to_string(bias_shape[0]) + ", num_filters = " + std::to_string(filters_shape[0]));
}
if (input_shape[INPUT_C] != groups * filters_shape[FILTER_IC])
{
throw ngraph_error(
"Mismatch between GroupConvolutionBias input and filter channels: "
" data channels=" +
std::to_string(input_shape[INPUT_C]) + ", filter channels= " +
std::to_string(filters_shape[FILTER_IC]) + ", groups= " + std::to_string(groups));
}
if (output_shape[OUTPUT_C] != filters_shape[FILTER_OC])
{
throw ngraph_error(
"Mismatch between GroupConvolutionBias output and filter channels: "
" data channels=" +
std::to_string(output_shape[OUTPUT_C]) + ", filter channels= " +
std::to_string(filters_shape[FILTER_OC]));
}
if (output_shape[OUTPUT_C] % groups != 0)
{
throw ngraph_error(
"Output channels for GroupConvolutionBias not divisible by groups: channels=" +
std::to_string(output_shape[OUTPUT_C]) + ", groups= " + std::to_string(groups));
}
}
Shape op::GroupConvolutionBias::get_weights_dimensions()
{
// reshape weights into 5d tensors that includes groups
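// e.g., input {1, 20, 5, 5}, filters {8, 10, 3, 3}, groups = 2 (as in the
// fuse_groupconv_batchnorm1 test) gives {2, 4, 10, 3, 3}:
// {groups, output_channels / groups, input_channels / groups, kernel_h, kernel_w}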
const size_t OC = 0;
const size_t OC_IN_OUTPUT = 1;
const size_t IC = 1;
Shape weights_shape_groups{get_inputs().at(1).get_shape()};
weights_shape_groups.at(OC) = get_shape().at(OC_IN_OUTPUT) / get_groups();
weights_shape_groups.at(IC) = get_inputs().at(0).get_shape().at(IC) / get_groups();
// push_front the number of groups
weights_shape_groups.insert(weights_shape_groups.begin(), get_groups());
return weights_shape_groups;
}
op::GroupConvolutionBias::GroupConvolutionBias(const shared_ptr<op::GroupConvolution>& conv,
const shared_ptr<Node>& bias,
size_t groups,
const Shape& output_shape,
bool with_relu,
float alpha)
: Op("GroupConvolutionBias",
check_single_output_args({conv->get_argument(0), conv->get_argument(1), bias}))
, m_window_movement_strides(conv->get_window_movement_strides())
, m_window_dilation_strides(conv->get_window_dilation_strides())
, m_padding_below(conv->get_padding_below())
, m_padding_above(conv->get_padding_above())
, m_data_dilation_strides(conv->get_data_dilation_strides())
, m_with_relu(with_relu)
, m_groups(groups)
, m_alpha(alpha)
{
constructor_validate_and_infer_types();
if (conv->get_element_type() != bias->get_element_type())
{
throw ngraph_error("GroupConvolution's element type isn't equal to bias!");
}
validate_groupconvbias_shapes(conv->get_argument(0)->get_shape(),
conv->get_argument(1)->get_shape(),
bias->get_shape(),
output_shape,
groups);
set_output_type(0, conv->get_element_type(), output_shape);
}
op::GroupConvolutionBias::GroupConvolutionBias(const shared_ptr<Node>& data_batch,
const shared_ptr<Node>& filters,
const shared_ptr<Node>& bias,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const CoordinateDiff& padding_below,
const CoordinateDiff& padding_above,
const Strides& data_dilation_strides,
size_t groups,
const Shape& output_shape,
bool with_relu,
float alpha)
: Op("GroupConvolutionBias", check_single_output_args({data_batch, filters, bias}))
, m_window_movement_strides(window_movement_strides)
, m_window_dilation_strides(window_dilation_strides)
, m_padding_below(padding_below)
, m_padding_above(padding_above)
, m_data_dilation_strides(data_dilation_strides)
, m_with_relu(with_relu)
, m_groups(groups)
, m_alpha(alpha)
{
constructor_validate_and_infer_types();
auto& data_batch_shape = data_batch->get_shape();
auto& data_batch_et = data_batch->get_element_type();
auto& filters_shape = filters->get_shape();
auto& filters_et = filters->get_element_type();
//
// Make sure data batch and filter element types match.
//
if (data_batch_et != filters_et)
{
throw ngraph_error("GroupConvolutionBias data batch and filter element types do not match");
}
validate_groupconvbias_shapes(
data_batch_shape, filters_shape, bias->get_shape(), output_shape, groups);
set_output_type(0, data_batch_et, output_shape);
}
shared_ptr<Node> op::GroupConvolutionBias::copy_with_new_args(const NodeVector& new_args) const
{
if (new_args.size() != 3)
{
throw ngraph_error("Incorrect number of new arguments");
}
return shared_ptr<Node>(new GroupConvolutionBias(new_args.at(0),
new_args.at(1),
new_args.at(2),
get_window_movement_strides(),
get_window_dilation_strides(),
get_padding_below(),
get_padding_above(),
get_data_dilation_strides(),
get_groups(),
get_output_shape(0),
m_with_relu,
get_alpha()));
}
void op::GroupConvolutionBias::generate_adjoints(autodiff::Adjoints& adjoints,
const NodeVector& deltas)
{
throw ngraph_error("GroupConvolutionBias generate_adjoints not supported implemented");
}
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include "group_conv.hpp"
#include "ngraph/op/op.hpp"
namespace ngraph
{
namespace op
{
/// \brief GroupConvolution + Bias + Relu forward prop for
/// batched GroupConvolution operation.
class GroupConvolutionBias : public Op
{
public:
GroupConvolutionBias(const std::shared_ptr<op::GroupConvolution>& conv,
const std::shared_ptr<Node>& bias,
const size_t groups,
const Shape& output_shape,
bool with_relu,
float alpha = 1.0);
GroupConvolutionBias(const std::shared_ptr<Node>& data_batch,
const std::shared_ptr<Node>& filters,
const std::shared_ptr<Node>& bias,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const CoordinateDiff& padding_below,
const CoordinateDiff& padding_above,
const Strides& data_dilation_strides,
size_t groups,
const Shape& output_shape,
bool with_relu,
float alpha = 1.0);
Shape get_weights_dimensions();
const Strides& get_window_movement_strides() const { return m_window_movement_strides; }
const Strides& get_window_dilation_strides() const { return m_window_dilation_strides; }
const CoordinateDiff& get_padding_below() const { return m_padding_below; }
const CoordinateDiff& get_padding_above() const { return m_padding_above; }
const Strides& get_data_dilation_strides() const { return m_data_dilation_strides; }
std::shared_ptr<Node> get_bias() { return get_argument(2); }
std::shared_ptr<Node> get_filters() { return get_argument(1); }
std::shared_ptr<Node> get_data_batch() { return get_argument(0); }
size_t get_groups() const { return m_groups; }
bool with_relu() const { return m_with_relu; }
float get_alpha() const { return m_alpha; }
virtual std::shared_ptr<Node>
copy_with_new_args(const NodeVector& new_args) const override;
void generate_adjoints(autodiff::Adjoints& adjoints, const NodeVector& deltas) override;
protected:
Strides m_window_movement_strides;
Strides m_window_dilation_strides;
CoordinateDiff m_padding_below;
CoordinateDiff m_padding_above;
Strides m_data_dilation_strides;
bool m_with_relu;
size_t m_groups = 1;
float m_alpha = 1.0;
};
}
}
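
For orientation, a minimal sketch of building the op directly through the second constructor declared above; the shapes are borrowed from the fuse_groupconv_batchnorm1 test and the snippet assumes the usual ngraph headers:

auto data = std::make_shared<ngraph::op::Parameter>(ngraph::element::f32, ngraph::Shape{1, 20, 5, 5});
auto filters = std::make_shared<ngraph::op::Parameter>(ngraph::element::f32, ngraph::Shape{8, 10, 3, 3});
auto bias = std::make_shared<ngraph::op::Parameter>(ngraph::element::f32, ngraph::Shape{8});
auto gcb = std::make_shared<ngraph::op::GroupConvolutionBias>(
    data, filters, bias,
    ngraph::Strides{1, 1},        // window movement strides
    ngraph::Strides{1, 1},        // window dilation strides
    ngraph::CoordinateDiff{0, 0}, // padding below
    ngraph::CoordinateDiff{0, 0}, // padding above
    ngraph::Strides{1, 1},        // data dilation strides
    2,                            // groups
    ngraph::Shape{1, 8, 3, 3},    // output shape
    true);                        // with_relu; alpha defaults to 1.0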
......@@ -52,6 +52,7 @@
#include "ngraph/runtime/cpu/op/conv_bias.hpp"
#include "ngraph/runtime/cpu/op/conv_relu.hpp"
#include "ngraph/runtime/cpu/op/group_conv.hpp"
#include "ngraph/runtime/cpu/op/group_conv_bias.hpp"
#include "ngraph/runtime/cpu/op/lstm.hpp"
#include "ngraph/runtime/cpu/op/max_pool_with_indices.hpp"
#include "ngraph/runtime/cpu/op/rnn.hpp"
......@@ -187,6 +188,20 @@ namespace ngraph
}
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::GroupConvolutionBias)
{
auto convolution = static_cast<op::GroupConvolutionBias*>(node);
if (can_use_mkldnn_conv<ngraph::op::GroupConvolutionBias>(node))
{
auto op_annotations =
std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
op_annotations->set_mkldnn_op(true);
convolution->set_op_annotations(op_annotations);
}
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::ConvolutionRelu)
{
......@@ -929,6 +944,8 @@ static const runtime::cpu::pass::AssignOpMap s_dispatcher{
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::QuantizedConvolutionRelu>},
{TI(ngraph::op::QuantizedConvolutionBias),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::QuantizedConvolutionBias>},
{TI(ngraph::op::GroupConvolutionBias),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::GroupConvolutionBias>},
{TI(ngraph::op::Quantize), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Quantize>},
{TI(ngraph::op::Dequantize),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Dequantize>},
......
......@@ -17,6 +17,7 @@
#include <algorithm>
#include <iostream>
#include <numeric>
#include <string>
#include <unordered_set>
#include "cpu_fusion.hpp"
......@@ -56,6 +57,8 @@
#include "ngraph/runtime/cpu/op/conv_add.hpp"
#include "ngraph/runtime/cpu/op/conv_bias.hpp"
#include "ngraph/runtime/cpu/op/conv_relu.hpp"
#include "ngraph/runtime/cpu/op/group_conv.hpp"
#include "ngraph/runtime/cpu/op/group_conv_bias.hpp"
#include "ngraph/runtime/cpu/op/matmul_bias.hpp"
#include "ngraph/runtime/cpu/op/sigmoid_mul.hpp"
#include "ngraph/util.hpp"
......@@ -1636,3 +1639,160 @@ void ngraph::runtime::cpu::pass::CPUFusion::construct_conv_bias_affine_folding()
auto m = std::make_shared<ngraph::pattern::Matcher>(multiply, callback);
this->add_matcher(m);
}
void ngraph::runtime::cpu::pass::CPUFusion::construct_groupconv_batchnorm_global_stats_folding()
{
Shape shape_a{1, 32, 2, 2};
Shape shape_b{32, 1, 1, 1};
Shape shape_r{1, 32, 2, 2};
auto input = std::make_shared<pattern::op::Label>(element::f32, shape_a);
auto filters = std::make_shared<pattern::op::Label>(element::f32, shape_b);
auto resShape = std::make_shared<pattern::op::Label>(element::f32, shape_r);
auto conv = std::make_shared<op::GroupConvolution>(input,
filters,
Strides{1, 1},
Strides{1, 1},
CoordinateDiff{0, 0},
CoordinateDiff{0, 0},
Strides{1, 1},
32,
shape_r);
auto conv_label = std::make_shared<pattern::op::Label>(conv, nullptr, NodeVector{conv});
auto mean = std::make_shared<pattern::op::Label>(element::f32, Shape{32});
auto var = std::make_shared<pattern::op::Label>(element::f32, Shape{32});
auto gamma = std::make_shared<pattern::op::Label>(element::f32, Shape{32});
auto beta = std::make_shared<pattern::op::Label>(element::f32, Shape{32});
double eps = 0.001;
auto bn = std::make_shared<op::BatchNormInference>(eps, gamma, beta, conv_label, mean, var);
ngraph::pattern::graph_rewrite_callback callback =
[input, filters, conv_label, mean, var, gamma, beta, eps](pattern::Matcher& m) {
NGRAPH_DEBUG << "In callback for groupconv BatchNorm folding against node = "
<< m.get_match_root()->get_name();
auto pattern_map = m.get_pattern_map();
auto m_bn = std::dynamic_pointer_cast<op::BatchNormInference>(m.get_match_root());
auto conv_m = std::static_pointer_cast<op::GroupConvolution>(pattern_map[conv_label]);
if (conv_m->get_users().size() > 1)
{
return false;
}
if (conv_m->get_shape().size() != 4)
{
return false;
}
if (conv_m->get_groups() == 0)
{
return false;
}
// new weights = old weights * gamma / sqrt(variance + epsilon)
// new biases = (-mean) * gamma / sqrt(variance + epsilon) + beta
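// Derivation sketch: inference-mode batch norm computes
//   bn(y) = gamma * (y - mean) / sqrt(variance + epsilon) + beta
// and y = groupconv(x, W) is linear in W, so
//   bn(groupconv(x, W)) = groupconv(x, W * gamma / sqrt(variance + epsilon))
//                         + (beta - mean * gamma / sqrt(variance + epsilon))
// i.e. a GroupConvolutionBias with the scaled weights and the new bias, with
// gamma, mean and variance broadcast along the output-channel axis.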
auto bn_eps = op::Constant::create(element::f32, Shape{}, {m_bn->get_eps_value()});
auto var_eps = std::make_shared<op::Add>(
pattern_map[var],
std::make_shared<op::Broadcast>(bn_eps, pattern_map[var]->get_shape(), AxisSet{0}));
auto sqrt_var_eps = std::make_shared<op::Sqrt>(var_eps);
auto weight_scaling = std::make_shared<op::Divide>(pattern_map[gamma], sqrt_var_eps);
auto weight_scaling_bcast = std::make_shared<op::Broadcast>(
weight_scaling, pattern_map[filters]->get_shape(), AxisSet{1, 2, 3});
auto new_weights =
std::make_shared<op::Multiply>(pattern_map[filters], weight_scaling_bcast);
auto mean_gamma = std::make_shared<op::Multiply>(pattern_map[mean], weight_scaling);
auto new_biases = std::make_shared<op::Subtract>(pattern_map[beta], mean_gamma);
auto g_conv_bias =
std::make_shared<op::GroupConvolutionBias>(pattern_map[input],
new_weights,
new_biases,
conv_m->get_window_movement_strides(),
conv_m->get_window_dilation_strides(),
conv_m->get_padding_below(),
conv_m->get_padding_above(),
conv_m->get_data_dilation_strides(),
conv_m->get_groups(),
conv_m->get_output_shape(0),
false,
1.0);
ngraph::replace_node(m.get_match_root(), g_conv_bias);
return true;
};
auto m = std::make_shared<ngraph::pattern::Matcher>(bn, callback);
this->add_matcher(m);
}
void ngraph::runtime::cpu::pass::CPUFusion::
construct_groupconv_batchnorm_global_stats_folding_relu()
{
Shape shape_a{1, 32, 2, 2};
Shape shape_b{32, 1, 1, 1};
Shape shape_r{1, 32, 2, 2};
Shape shape_bias{32};
Shape shape_num{0};
auto input = std::make_shared<pattern::op::Label>(element::f32, shape_a);
auto filters = std::make_shared<pattern::op::Label>(element::f32, shape_b);
auto bias = std::make_shared<pattern::op::Label>(element::f32, shape_bias);
auto num = std::make_shared<pattern::op::Label>(element::f32, shape_num);
auto conv = std::make_shared<op::GroupConvolutionBias>(input,
filters,
bias,
Strides{1, 1},
Strides{1, 1},
CoordinateDiff{0, 0},
CoordinateDiff{0, 0},
Strides{1, 1},
32,
shape_r,
false,
1.0);
auto conv_label = std::make_shared<pattern::op::Label>(conv, nullptr, NodeVector{conv});
// GroupConvolutionBias (from the GroupConv + BatchNorm folding above) + Relu -> GroupConvolutionBias (with_relu = true)
auto prelu = std::make_shared<op::Relu>(conv_label);
ngraph::pattern::graph_rewrite_callback callback =
[input, filters, bias, num, conv_label, prelu](pattern::Matcher& m) {
NGRAPH_DEBUG << "In callback for GroupConvBias + Relu folding against node = "
<< m.get_match_root()->get_name();
auto pattern_map = m.get_pattern_map();
auto conv_m =
std::static_pointer_cast<op::GroupConvolutionBias>(pattern_map[conv_label]);
auto relu_m = std::dynamic_pointer_cast<op::Relu>(m.get_match_root());
auto g_conv_bias_relu =
std::make_shared<op::GroupConvolutionBias>(conv_m->get_argument(0),
conv_m->get_argument(1),
conv_m->get_argument(2),
conv_m->get_window_movement_strides(),
conv_m->get_window_dilation_strides(),
conv_m->get_padding_below(),
conv_m->get_padding_above(),
conv_m->get_data_dilation_strides(),
conv_m->get_groups(),
conv_m->get_output_shape(0),
true);
ngraph::replace_node(m.get_match_root(), g_conv_bias_relu);
return true;
};
auto m = std::make_shared<ngraph::pattern::Matcher>(prelu, callback);
this->add_matcher(m);
}
......@@ -66,6 +66,8 @@ public:
construct_conv_bias_bprop();
construct_conv_bias_folded_batch_norm();
construct_conv_bias_affine_folding();
construct_groupconv_batchnorm_global_stats_folding();
construct_groupconv_batchnorm_global_stats_folding_relu();
construct_batch_norm_relu();
construct_batch_norm_relu_global_stats();
construct_conv_relu();
......@@ -100,4 +102,6 @@ private:
void construct_bounded_relu();
void construct_conv_bias_folded_batch_norm();
void construct_conv_bias_affine_folding();
void construct_groupconv_batchnorm_global_stats_folding();
void construct_groupconv_batchnorm_global_stats_folding_relu();
};
......@@ -57,6 +57,7 @@
#include "ngraph/runtime/cpu/op/conv_relu.hpp"
#include "ngraph/runtime/cpu/op/convert_layout.hpp"
#include "ngraph/runtime/cpu/op/group_conv.hpp"
#include "ngraph/runtime/cpu/op/group_conv_bias.hpp"
#include "ngraph/runtime/cpu/op/lstm.hpp"
#include "ngraph/runtime/cpu/op/max_pool_with_indices.hpp"
#include "ngraph/runtime/cpu/op/rnn.hpp"
......@@ -269,7 +270,7 @@ namespace ngraph
{
namespace pass
{
template <typename T, bool use_bias>
void ConvolutionLayout(std::shared_ptr<ngraph::Node> node,
vector<memory::desc>& i_mds,
vector<memory::desc>& o_mds)
......@@ -279,10 +280,16 @@ namespace ngraph
auto arg0_shape = node->get_input_shape(0);
auto arg1_shape = node->get_input_shape(1);
// Convert filters to MKLDNN shape
// o,i,h,w -> g,o,i,h,w (e.g., {6, 2, 1, 1}, groups = 2 -> {2, 3, 1, 1, 1})
if (auto gconv = std::dynamic_pointer_cast<ngraph::op::GroupConvolution>(node))
{
arg1_shape = gconv->get_weights_dimensions();
}
if (auto gconv =
std::dynamic_pointer_cast<ngraph::op::GroupConvolutionBias>(node))
{
arg1_shape = gconv->get_weights_dimensions();
}
auto result_shape = node->get_output_shape(0);
auto filter_strides = convolution->get_window_movement_strides();
......@@ -325,8 +332,6 @@ namespace ngraph
memory::data_type et_bias =
mkldnn_utils::get_mkldnn_data_type(node->get_input_element_type(2));
auto arg2_shape = node->get_input_shape(2);
ngraph::op::util::validate_convbias_shapes(
arg0_shape, arg1_shape, arg2_shape);
memory::dims mkldnn_arg2_shape(arg2_shape.begin(), arg2_shape.end());
const memory::desc bias_desc(
mkldnn_arg2_shape, et_bias, memory::format::any);
......@@ -393,7 +398,7 @@ namespace ngraph
{
vector<memory::desc> i_mds;
vector<memory::desc> o_mds;
ConvolutionLayout<ngraph::op::QuantizedConvolution, false>(
node, i_mds, o_mds);
auto scale_input_md = mkldnn_utils::create_default_mkldnn_md(
......@@ -416,8 +421,7 @@ namespace ngraph
{
vector<memory::desc> i_mds;
vector<memory::desc> o_mds;
ConvolutionLayout<ngraph::op::Convolution, false>(node, i_mds, o_mds);
node = insert_input_conversions(external_function, node, i_mds);
set_output_layouts(node, o_mds);
......@@ -435,7 +439,25 @@ namespace ngraph
{
vector<memory::desc> i_mds;
vector<memory::desc> o_mds;
ConvolutionLayout<ngraph::op::GroupConvolution, false>(node, i_mds, o_mds);
node = insert_input_conversions(external_function, node, i_mds);
set_output_layouts(node, o_mds);
}
else
{
set_native_layouts(external_function, node);
}
}
template <>
void CPULayout::LAYOUT_DECL(ngraph::op::GroupConvolutionBias)
{
if (mkldnn_utils::use_mkldnn_kernel(node.get()))
{
vector<memory::desc> i_mds;
vector<memory::desc> o_mds;
ConvolutionLayout<ngraph::op::GroupConvolutionBias, true>(
node, i_mds, o_mds);
node = insert_input_conversions(external_function, node, i_mds);
......@@ -454,8 +476,7 @@ namespace ngraph
{
vector<memory::desc> i_mds;
vector<memory::desc> o_mds;
ConvolutionLayout<ngraph::op::ConvolutionBias, true>(node, i_mds, o_mds);
node = insert_input_conversions(external_function, node, i_mds);
set_output_layouts(node, o_mds);
}
......@@ -472,7 +493,7 @@ namespace ngraph
{
vector<memory::desc> i_mds;
vector<memory::desc> o_mds;
ConvolutionLayout<ngraph::op::QuantizedConvolutionBias, true>(
node, i_mds, o_mds);
auto scale_input_md = mkldnn_utils::create_default_mkldnn_md(
......@@ -496,8 +517,7 @@ namespace ngraph
{
vector<memory::desc> i_mds;
vector<memory::desc> o_mds;
ConvolutionLayout<ngraph::op::ConvolutionRelu, false>(node, i_mds, o_mds);
node = insert_input_conversions(external_function, node, i_mds);
set_output_layouts(node, o_mds);
}
......@@ -514,7 +534,7 @@ namespace ngraph
{
vector<memory::desc> i_mds;
vector<memory::desc> o_mds;
ConvolutionLayout<ngraph::op::QuantizedConvolutionRelu, false>(
node, i_mds, o_mds);
auto scale_input_md = mkldnn_utils::create_default_mkldnn_md(
......@@ -538,8 +558,7 @@ namespace ngraph
{
vector<memory::desc> i_mds;
vector<memory::desc> o_mds;
ConvolutionLayout<ngraph::op::ConvolutionBiasAdd, true>(node, i_mds, o_mds);
// Force second input to sum to use the same layout as convolution output
i_mds.push_back(o_mds[0]);
node = insert_input_conversions(external_function, node, i_mds);
......@@ -558,8 +577,7 @@ namespace ngraph
{
vector<memory::desc> i_mds;
vector<memory::desc> o_mds;
ConvolutionLayout<ngraph::op::ConvolutionAdd, false>(node, i_mds, o_mds);
// Force second input to sum to use the same layout as convolution output
i_mds.push_back(o_mds[0]);
node = insert_input_conversions(external_function, node, i_mds);
......@@ -1949,6 +1967,8 @@ static const runtime::cpu::pass::LayoutOpMap s_dispatcher{
&runtime::cpu::pass::CPULayout::layout<ngraph::op::QuantizedConvolutionRelu>},
{TI(ngraph::op::QuantizedConvolutionBias),
&runtime::cpu::pass::CPULayout::layout<ngraph::op::QuantizedConvolutionBias>},
{TI(ngraph::op::GroupConvolutionBias),
&runtime::cpu::pass::CPULayout::layout<ngraph::op::GroupConvolutionBias>},
};
bool runtime::cpu::pass::CPULayout::run_on_call_graph(const std::list<std::shared_ptr<Node>>& nodes)
......
......@@ -55,6 +55,7 @@
#include "ngraph/runtime/cpu/op/conv_relu.hpp"
#include "ngraph/runtime/cpu/op/convert_layout.hpp"
#include "ngraph/runtime/cpu/op/group_conv.hpp"
#include "ngraph/runtime/cpu/op/group_conv_bias.hpp"
#include "ngraph/runtime/cpu/op/loop_kernel.hpp"
#include "ngraph/runtime/cpu/op/lstm.hpp"
#include "ngraph/runtime/cpu/op/matmul_bias.hpp"
......@@ -1051,6 +1052,145 @@ TEST(cpu_fusion, conv_add)
EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0)));
}
shared_ptr<Function> gen_groupconv_batchnorm(const bool add_goe,
const bool with_relu,
const Shape shape_in,
const Shape shape_weights,
const Shape shape_out,
const size_t groups)
{
auto input = make_shared<op::Parameter>(element::f32, shape_in);
auto weights = make_shared<op::Parameter>(element::f32, shape_weights);
unsigned long OC = shape_out.at(1);
Shape shape_bn{OC};
auto group_conv = make_shared<op::GroupConvolution>(input,
weights,
Strides{1, 1},
Strides{1, 1},
CoordinateDiff{0, 0},
CoordinateDiff{0, 0},
Strides{1, 1},
groups,
shape_out);
double eps = 0.001;
auto gamma = std::make_shared<op::Parameter>(element::f32, shape_bn);
auto beta = std::make_shared<op::Parameter>(element::f32, shape_bn);
auto mean = std::make_shared<op::Parameter>(element::f32, shape_bn);
auto var = std::make_shared<op::Parameter>(element::f32, shape_bn);
auto goe_bn = std::make_shared<op::GetOutputElement>(group_conv, 0);
// Adding a GetOutputElement (goe) will stop fusion since the patterns won't expect to see this op
auto bn =
add_goe ? std::make_shared<op::BatchNormInference>(eps, gamma, beta, goe_bn, mean, var)
: std::make_shared<op::BatchNormInference>(eps, gamma, beta, group_conv, mean, var);
if (with_relu)
{
auto prelu = std::make_shared<op::Relu>(bn);
auto f = make_shared<Function>(NodeVector{prelu},
op::ParameterVector{input, weights, gamma, beta, mean, var});
return f;
}
else
{
auto f = make_shared<Function>(NodeVector{bn},
op::ParameterVector{input, weights, gamma, beta, mean, var});
return f;
}
}
void fuse_groupconv_batchnorm_helper(Shape shape_in,
Shape shape_weights,
Shape shape_r,
size_t groups)
{
auto func_fuse =
gen_groupconv_batchnorm(false, false, shape_in, shape_weights, shape_r, groups);
auto func_fuse2 =
gen_groupconv_batchnorm(false, true, shape_in, shape_weights, shape_r, groups);
{
pass::Manager pass_manager;
pass_manager.register_pass<runtime::cpu::pass::CPUFusion>();
pass_manager.run_passes(func_fuse);
ASSERT_EQ(count_ops_of_type<op::GroupConvolutionBias>(func_fuse), 1);
}
{
// test groupconv + batchnorm + relu fusion
pass::Manager pass_manager;
pass_manager.register_pass<runtime::cpu::pass::CPUFusion>();
pass_manager.run_passes(func_fuse2);
ASSERT_EQ(count_ops_of_type<op::GroupConvolutionBias>(func_fuse2), 1);
ASSERT_EQ(count_ops_of_type<op::Relu>(func_fuse2), 0);
}
}
void groupconv_batchnorm_test_val_helper(
const bool with_relu, Shape shape_in, Shape shape_weights, Shape shape_r, size_t groups)
{
shared_ptr<Function> fuse_func =
gen_groupconv_batchnorm(false, with_relu, shape_in, shape_weights, shape_r, groups);
shared_ptr<Function> nofuse_func =
gen_groupconv_batchnorm(true, with_relu, shape_in, shape_weights, shape_r, groups);
test::Uniform<float> rng(1.0f, 100.0f);
vector<vector<float>> args;
for (shared_ptr<op::Parameter> param : fuse_func->get_parameters())
{
vector<float> tensor_val(shape_size(param->get_shape()));
rng.initialize(tensor_val);
args.push_back(tensor_val);
}
auto fuse_results = execute(fuse_func, args, "CPU");
auto nofuse_results = execute(nofuse_func, args, "CPU");
EXPECT_TRUE(test::all_close(fuse_results.at(0), nofuse_results.at(0)));
}
TEST(cpu_fusion, fuse_groupconv_batchnorm1)
{
Shape shape_in{1, 20, 5, 5};
Shape shape_weights{8, 10, 3, 3};
Shape shape_r{1, 8, 3, 3};
fuse_groupconv_batchnorm_helper(shape_in, shape_weights, shape_r, 2);
groupconv_batchnorm_test_val_helper(false, shape_in, shape_weights, shape_r, 2);
groupconv_batchnorm_test_val_helper(true, shape_in, shape_weights, shape_r, 2);
}
TEST(cpu_fusion, fuse_groupconv_batchnorm2)
{
Shape shape_in{1, 20, 5, 5};
Shape shape_weights{5, 4, 3, 3};
Shape shape_r{1, 5, 3, 3};
fuse_groupconv_batchnorm_helper(shape_in, shape_weights, shape_r, 5);
groupconv_batchnorm_test_val_helper(false, shape_in, shape_weights, shape_r, 5);
groupconv_batchnorm_test_val_helper(true, shape_in, shape_weights, shape_r, 5);
}
TEST(cpu_fusion, fuse_groupconv_batchnorm3)
{
Shape shape_in{1, 20, 5, 5};
Shape shape_weights{20, 1, 3, 3};
Shape shape_r{1, 20, 3, 3};
fuse_groupconv_batchnorm_helper(shape_in, shape_weights, shape_r, 20);
groupconv_batchnorm_test_val_helper(false, shape_in, shape_weights, shape_r, 20);
groupconv_batchnorm_test_val_helper(true, shape_in, shape_weights, shape_r, 20);
}
TEST(cpu_fusion, fuse_groupconv_batchnorm4)
{
Shape shape_in{1, 20, 4, 4};
Shape shape_weights{5, 20, 1, 1};
Shape shape_r{1, 5, 4, 4};
fuse_groupconv_batchnorm_helper(shape_in, shape_weights, shape_r, 1);
groupconv_batchnorm_test_val_helper(false, shape_in, shape_weights, shape_r, 1);
groupconv_batchnorm_test_val_helper(true, shape_in, shape_weights, shape_r, 1);
}
std::vector<shared_ptr<runtime::Tensor>> rnn_matrix_fusion_eval(const size_t time_steps,
const Shape& data_shape,
const Shape& weights_shape,
......