Commit 37174c90 authored by gaurides's avatar gaurides Committed by Scott Cyphers

Add conv add fusion (#1526)

* Add conv add fusion

* Updated file permissions and cpu_fusion order

* Formatted code using maint/apply-code-format.sh

* Fixed minor review comments

* Use NODE_VALIDATION_ASSERT instead of throw ngraph_error; upgrade baseline and fix issues

* Some more fixes
parent 36e1de51
......@@ -85,6 +85,7 @@ set(SRC
op/max_pool_with_indices.cpp
op/rnn.cpp
op/sigmoid_mul.cpp
op/conv_add.cpp
pass/cpu_assignment.cpp
pass/cpu_collapse_dims.cpp
pass/cpu_concat_inputs.cpp
......
......@@ -19,6 +19,7 @@
#include "ngraph/runtime/cpu/kernel/convolution.hpp"
#include "ngraph/runtime/cpu/mkldnn_invoke.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#include "ngraph/runtime/cpu/op/conv_add.hpp"
#include "ngraph/runtime/cpu/op/conv_bias.hpp"
#include "ngraph/runtime/cpu/op/conv_relu.hpp"
#include "ngraph/runtime/cpu/op/group_conv.hpp"
......@@ -205,6 +206,36 @@ namespace ngraph
}
}
template <>
void Builder::BUILDER_DECL(ngraph::op::ConvolutionAdd)
{
    // Builds the runtime functor for the fused ConvolutionAdd op.
    // Only an MKLDNN kernel exists for this op (see the else branch);
    // layout/assignment passes guarantee MKLDNN is selected in practice.
    auto& functors = external_function->get_functors();
    // Captured by reference: the actual buffer addresses are patched in by
    // the memory allocator after builder time, so the lambda must see the
    // final values at invocation time.
    auto& arg0_tensor = external_function->get_tensor_data(args[0].get_name());
    auto& arg1_tensor = external_function->get_tensor_data(args[1].get_name());
    auto& out_tensor = external_function->get_tensor_data(out[0].get_name());
    if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
    {
        auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
        auto conv_index = mkldnn_emitter->build_convolution<ngraph::op::ConvolutionAdd>(
            node, args, out);
        // deps[0]/deps[1] are the conv data/filter primitives, deps[2] the output.
        // NOTE(review): the third input (sum operand) has no explicit set_memory_ptr;
        // it is expected to alias the output buffer via the in-place annotation
        // added in CPUAssignment — confirm against the emitter's dep layout.
        auto& deps = mkldnn_emitter->get_primitive_deps(conv_index);
        auto functor = [&, conv_index](CPURuntimeContext* ctx) {
            cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
            cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], arg1_tensor);
            cpu::mkldnn_utils::set_memory_ptr(ctx, deps[2], out_tensor);
            cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, conv_index);
        };
        functors.emplace_back(functor);
    }
    else
    {
        throw ngraph_error("ConvolutionAdd is only supported with MKLDNN kernel.");
    }
}
template <>
void Builder::BUILDER_DECL(ngraph::op::ConvolutionBackpropData)
{
......@@ -516,6 +547,7 @@ namespace ngraph
REGISTER_OP_BUILDER(ConvolutionBackpropFilters);
REGISTER_OP_BUILDER(ConvolutionBiasBackpropFiltersBias);
REGISTER_OP_BUILDER(GroupConvolution);
REGISTER_OP_BUILDER(ConvolutionAdd);
}
}
}
......@@ -99,6 +99,7 @@
#include "ngraph/runtime/cpu/op/batch_dot.hpp"
#include "ngraph/runtime/cpu/op/batch_norm_relu.hpp"
#include "ngraph/runtime/cpu/op/bounded_relu.hpp"
#include "ngraph/runtime/cpu/op/conv_add.hpp"
#include "ngraph/runtime/cpu/op/conv_bias.hpp"
#include "ngraph/runtime/cpu/op/conv_relu.hpp"
#include "ngraph/runtime/cpu/op/convert_layout.hpp"
......@@ -2960,6 +2961,31 @@ namespace ngraph
}
}
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::ConvolutionAdd)
{
    // Codegen counterpart of the DEX builder: emits C++ source that binds the
    // MKLDNN primitive's memory descriptors to the argument/output buffers and
    // invokes the fused convolution+sum primitive.
    if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
    {
        auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
        auto conv_index = mkldnn_emitter->build_convolution<ngraph::op::ConvolutionAdd>(
            node, args, out);
        auto& deps = mkldnn_emitter->get_primitive_deps(conv_index);
        // deps[0]/deps[1]: conv inputs; deps[2]: output (the sum operand is
        // accumulated in place, so no separate pointer is emitted for it).
        writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[0])
               << ", " << args[0].get_name() << ");\n";
        writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[1])
               << ", " << args[1].get_name() << ");\n";
        writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[2])
               << ", " << out[0].get_name() << ");\n";
        writer << "cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, "
               << to_string(conv_index) << ");\n";
    }
    else
    {
        // No reference implementation for this fused op.
        throw ngraph_error("ConvolutionAdd is only supported with MKLDNN kernel.");
    }
}
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::ConvolutionBiasBackpropFiltersBias)
{
......
......@@ -139,6 +139,7 @@
#include "ngraph/runtime/cpu/op/batch_dot.hpp"
#include "ngraph/runtime/cpu/op/batch_norm_relu.hpp"
#include "ngraph/runtime/cpu/op/bounded_relu.hpp"
#include "ngraph/runtime/cpu/op/conv_add.hpp"
#include "ngraph/runtime/cpu/op/conv_bias.hpp"
#include "ngraph/runtime/cpu/op/conv_relu.hpp"
#include "ngraph/runtime/cpu/op/convert_layout.hpp"
......@@ -343,6 +344,8 @@ static const runtime::cpu::OpMap dispatcher{
{TI(ngraph::runtime::cpu::op::LoopKernel),
&runtime::cpu::CPU_Emitter::emit<runtime::cpu::op::LoopKernel>},
{TI(ngraph::op::LRN), &runtime::cpu::CPU_Emitter::emit<ngraph::op::LRN>},
{TI(ngraph::op::ConvolutionAdd), &runtime::cpu::CPU_Emitter::emit<op::ConvolutionAdd>},
};
static void
......
......@@ -29,6 +29,7 @@
#include "ngraph/runtime/cpu/cpu_tensor_view_wrapper.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#include "ngraph/runtime/cpu/op/bounded_relu.hpp"
#include "ngraph/runtime/cpu/op/conv_add.hpp"
#include "ngraph/runtime/cpu/op/conv_bias.hpp"
#include "ngraph/runtime/cpu/op/conv_relu.hpp"
#include "ngraph/shape.hpp"
......@@ -132,7 +133,8 @@ namespace ngraph
mkldnn::post_ops ops;
if (std::is_same<OP, ngraph::op::ConvolutionBiasAdd>())
if (std::is_same<OP, ngraph::op::ConvolutionBiasAdd>() ||
std::is_same<OP, ngraph::op::ConvolutionAdd>())
{
ops.append_sum(1.f);
}
......@@ -148,6 +150,11 @@ namespace ngraph
return (dynamic_cast<const ngraph::op::ConvolutionBiasAdd*>(node))
->with_relu();
}
if (dynamic_cast<const ngraph::op::ConvolutionAdd*>(node))
{
return (dynamic_cast<const ngraph::op::ConvolutionAdd*>(node))
->with_relu();
}
if (dynamic_cast<const ngraph::op::ConvolutionRelu*>(node))
{
return true;
......
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include <numeric>
#include "conv_add.hpp"
#include "ngraph/op/convolution.hpp"
#include "ngraph/op/get_output_element.hpp"
#include "ngraph/util.hpp"
using namespace std;
using namespace ngraph;
// Checks that the channel dimension (axis 1, NCHW layout) of the data batch
// matches the input-channel dimension of the filters. Raises a node
// validation error naming both counts on mismatch.
void op::util::validate_conv_shapes(const Node* node,
                                    const Shape& data_shape,
                                    const Shape& filters_shape)
{
    NODE_VALIDATION_ASSERT(node, data_shape[1] == filters_shape[1])
        << "Number of channels for data and filters do not match (data num channels: "
        << data_shape[1] << ", filters num channels: " << filters_shape[1] << ").";
}
// Fusion constructor: builds a ConvolutionAdd from an existing Convolution
// node plus the tensor to accumulate into (sum_input). All convolution
// attributes (strides, dilations, padding) are copied from `conv`; the
// output type/shape likewise mirror the original convolution's.
op::ConvolutionAdd::ConvolutionAdd(const std::shared_ptr<op::Convolution>& conv,
                                   const std::shared_ptr<Node>& sum_input,
                                   bool with_relu)
    : Op("ConvolutionAdd",
         check_single_output_args({conv->get_argument(0), conv->get_argument(1), sum_input}))
    , m_window_movement_strides(conv->get_window_movement_strides())
    , m_window_dilation_strides(conv->get_window_dilation_strides())
    , m_padding_below(conv->get_padding_below())
    , m_padding_above(conv->get_padding_above())
    , m_data_dilation_strides(conv->get_data_dilation_strides())
    , m_with_relu(with_relu)
{
    constructor_validate_and_infer_types();
    // Re-check channel agreement between the conv's data batch and filters.
    util::validate_conv_shapes(
        this, conv->get_argument(0)->get_shape(), conv->get_argument(1)->get_shape());
    set_output_type(0, conv->get_element_type(), conv->get_shape());
}
// Explicit constructor: builds a ConvolutionAdd from raw inputs and
// convolution attributes (used by copy_with_new_args and by fusion passes
// that re-create the node, e.g. to flip with_relu). The output shape is
// inferred from the convolution parameters, not taken from sum_input.
op::ConvolutionAdd::ConvolutionAdd(const std::shared_ptr<Node>& data_batch,
                                   const std::shared_ptr<Node>& filters,
                                   const std::shared_ptr<Node>& sum_input,
                                   const Strides& window_movement_strides,
                                   const Strides& window_dilation_strides,
                                   const CoordinateDiff& padding_below,
                                   const CoordinateDiff& padding_above,
                                   const Strides& data_dilation_strides,
                                   bool with_relu)
    : Op("ConvolutionAdd", check_single_output_args({data_batch, filters, sum_input}))
    , m_window_movement_strides(window_movement_strides)
    , m_window_dilation_strides(window_dilation_strides)
    , m_padding_below(padding_below)
    , m_padding_above(padding_above)
    , m_data_dilation_strides(data_dilation_strides)
    , m_with_relu(with_relu)
{
    constructor_validate_and_infer_types();
    auto& data_batch_shape = data_batch->get_shape();
    auto& data_batch_et = data_batch->get_element_type();
    auto& filters_shape = filters->get_shape();
    auto& filters_et = filters->get_element_type();
    //
    // Make sure data batch and filter element types match.
    //
    NODE_VALIDATION_ASSERT(this, data_batch_et == filters_et)
        << "Element types for data_batch and filters do not match (data batch element type: "
        << data_batch_et << ", filters element type: " << filters_et << ").";
    // Channel-count agreement (NCHW axis 1).
    util::validate_conv_shapes(this, data_batch_shape, filters_shape);
    // Standard NCHW/OIHW axis assignment for the shape inference below.
    set_output_type(0,
                    data_batch_et,
                    util::infer_convolution_output_shape(this,
                                                         data_batch_shape,
                                                         filters_shape,
                                                         window_movement_strides,
                                                         window_dilation_strides,
                                                         padding_below,
                                                         padding_above,
                                                         data_dilation_strides,
                                                         0, /* batch_axis_data, */
                                                         1, /* input_channel_axis_data, */
                                                         1, /* input_channel_axis_filters, */
                                                         0, /* output_channel_axis_filters, */
                                                         0, /* batch_axis_result, */
                                                         1  /* output_channel_axis_result, */
                                                         ));
}
std::shared_ptr<Node> op::ConvolutionAdd::copy_with_new_args(const NodeVector& new_args) const
{
NODE_VALIDATION_ASSERT(this, new_args.size() != 3)
<< "New arg size is not 3 (new args size: " << new_args.size() << ").";
return std::shared_ptr<Node>(new ConvolutionAdd(new_args.at(0),
new_args.at(1),
new_args.at(2),
get_window_movement_strides(),
get_window_dilation_strides(),
get_padding_below(),
get_padding_above(),
get_data_dilation_strides(),
m_with_relu));
}
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include "ngraph/op/convolution.hpp"
#include "ngraph/op/op.hpp"
namespace ngraph
{
    namespace op
    {
        /// \brief Fused Convolution + elementwise Add (CPU/MKLDNN-only op).
        ///
        /// Inputs: {0: data_batch, 1: filters, 2: sum_input}. The convolution
        /// result is accumulated into sum_input's buffer in place by the
        /// MKLDNN kernel; an optional ReLU can be appended (with_relu).
        class ConvolutionAdd : public Op
        {
        public:
            /// \brief Construct from an existing Convolution, copying its attributes.
            ConvolutionAdd(const std::shared_ptr<op::Convolution>& conv,
                           const std::shared_ptr<Node>& sum_input,
                           bool with_relu);

            /// \brief Construct from raw inputs and explicit convolution attributes.
            ConvolutionAdd(const std::shared_ptr<Node>& data_batch,
                           const std::shared_ptr<Node>& filters,
                           const std::shared_ptr<Node>& sum_input,
                           const Strides& window_movement_strides,
                           const Strides& window_dilation_strides,
                           const CoordinateDiff& padding_below,
                           const CoordinateDiff& padding_above,
                           const Strides& data_dilation_strides,
                           bool with_relu);

            const Strides& get_window_movement_strides() const { return m_window_movement_strides; }
            const Strides& get_window_dilation_strides() const { return m_window_dilation_strides; }
            const CoordinateDiff& get_padding_below() const { return m_padding_below; }
            const CoordinateDiff& get_padding_above() const { return m_padding_above; }
            const Strides& get_data_dilation_strides() const { return m_data_dilation_strides; }
            std::shared_ptr<Node> get_filters() { return get_argument(1); }
            std::shared_ptr<Node> get_data_batch() { return get_argument(0); }
            /// \return true when a ReLU is fused onto the accumulated output.
            bool with_relu() const { return m_with_relu; }
            virtual std::shared_ptr<Node>
                copy_with_new_args(const NodeVector& new_args) const override;

        protected:
            Strides m_window_movement_strides;
            Strides m_window_dilation_strides;
            CoordinateDiff m_padding_below;
            CoordinateDiff m_padding_above;
            Strides m_data_dilation_strides;
            bool m_with_relu;
        };

        namespace util
        {
            // Validates channel agreement between data and filter shapes
            // (defined in conv_add.cpp).
            void validate_conv_shapes(const Node* node,
                                      const Shape& data_shape,
                                      const Shape& filters_shape);
        }
    }
}
......@@ -38,6 +38,7 @@
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#include "ngraph/runtime/cpu/op/batch_norm_relu.hpp"
#include "ngraph/runtime/cpu/op/bounded_relu.hpp"
#include "ngraph/runtime/cpu/op/conv_add.hpp"
#include "ngraph/runtime/cpu/op/conv_bias.hpp"
#include "ngraph/runtime/cpu/op/conv_relu.hpp"
#include "ngraph/runtime/cpu/op/dequantize.hpp"
......@@ -220,6 +221,33 @@ namespace ngraph
}
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::ConvolutionAdd)
{
    // Assigns ConvolutionAdd to MKLDNN when the kernel constraints hold:
    // no data dilation, 4-D data and filter tensors, float32 input.
    // Nodes that fail these checks get no annotation and will hit the
    // "MKLDNN only" error paths in the builder/emitter/layout passes.
    auto convolution = static_cast<op::ConvolutionAdd*>(node);
    auto arg0_rank = node->get_input_shape(0).size();
    auto arg1_rank = node->get_input_shape(1).size();
    bool data_dilated = false;
    for (size_t s : convolution->get_data_dilation_strides())
    {
        data_dilated = data_dilated || (s != 1);
    }
    if (!data_dilated && arg0_rank == 4 && arg1_rank == 4 &&
        node->get_input_element_type(0) == element::f32)
    {
        auto op_annotations =
            std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
        op_annotations->set_mkldnn_op(true);
        const int ADD_INPUT = 2;
        // Accumulates conv into the second input of the unfused add:
        // output 0 reuses input 2's buffer (destructive in-place pair).
        op_annotations->add_in_place_oi_pair({0, ADD_INPUT, true});
        convolution->set_op_annotations(op_annotations);
    }
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::BatchNormRelu)
{
......@@ -727,6 +755,8 @@ static const runtime::cpu::pass::AssignOpMap s_dispatcher{
{TI(ngraph::op::Lstm), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Lstm>},
{TI(ngraph::op::Rnn), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Rnn>},
{TI(ngraph::op::Softmax), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Softmax>},
{TI(ngraph::op::ConvolutionAdd),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::ConvolutionAdd>},
{TI(ngraph::op::Dequantize),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Dequantize>},
};
......
......@@ -50,6 +50,7 @@
#include "ngraph/pattern/op/skip.hpp"
#include "ngraph/runtime/cpu/op/batch_norm_relu.hpp"
#include "ngraph/runtime/cpu/op/bounded_relu.hpp"
#include "ngraph/runtime/cpu/op/conv_add.hpp"
#include "ngraph/runtime/cpu/op/conv_bias.hpp"
#include "ngraph/runtime/cpu/op/conv_relu.hpp"
#include "ngraph/runtime/cpu/op/matmul_bias.hpp"
......@@ -993,6 +994,143 @@ void ngraph::runtime::cpu::pass::CPUFusion::construct_conv_bias_relu()
this->add_matcher(m);
}
// Registers a matcher that fuses Convolution + Add into ConvolutionAdd.
// Because the MKLDNN kernel accumulates in place into the add's other
// input, the callback performs several safety checks before rewriting.
void ngraph::runtime::cpu::pass::CPUFusion::construct_conv_add()
{
    // Pattern shapes are placeholders; the matcher binds any conforming graph.
    Shape shape{2, 2, 1, 1};
    auto data_batch = std::make_shared<pattern::op::Label>(element::f32, shape);
    auto filters = std::make_shared<pattern::op::Label>(element::f32, shape);
    auto pconv = std::make_shared<op::Convolution>(data_batch,
                                                   filters,
                                                   Strides{1, 1},
                                                   Strides{1, 1},
                                                   CoordinateDiff{0, 0},
                                                   CoordinateDiff{0, 0},
                                                   Strides{1, 1});
    auto add_input = std::make_shared<pattern::op::Label>(element::f32, pconv->get_shape());
    auto padd = std::make_shared<op::Add>(add_input, pconv);
    pattern::graph_rewrite_callback callback = [data_batch, filters](pattern::Matcher& m) {
        NGRAPH_DEBUG << "In a callback for construct_conv_add against "
                     << m.get_match_root()->get_name();
        auto add_m = m.get_match_root();
        auto pattern_map = m.get_pattern_map();
        // Add is commutative: locate which argument is the Convolution and
        // which becomes the in-place accumulation target.
        auto conv_m = std::dynamic_pointer_cast<op::Convolution>(add_m->get_argument(1));
        auto inplace_input = add_m->get_argument(0);
        if (!conv_m)
        {
            conv_m = std::dynamic_pointer_cast<op::Convolution>(add_m->get_argument(0));
            inplace_input = add_m->get_argument(1);
        }
        //These checks are to make sure a MKLDNN Convolution kernel can be used.
        bool data_dilated = false;
        for (size_t s : conv_m->get_data_dilation_strides())
        {
            data_dilated = data_dilated || (s != 1);
        }
        if (data_dilated)
        {
            NGRAPH_DEBUG << "Convolution has dilations greater than 1";
            return false;
        }
        if (conv_m->get_element_type() != element::f32)
        {
            NGRAPH_DEBUG << "Convolution isn't of type float";
            return false;
        }
        auto arg0_rank = conv_m->get_input_shape(0).size();
        auto arg1_rank = conv_m->get_input_shape(1).size();
        if (arg0_rank != 4 || arg1_rank != 4)
        {
            NGRAPH_DEBUG << "Convolution's arguments ranks aren't equal to 4";
            return false;
        }
        // The conv's output is consumed destructively, so no one else may read it.
        if (get_user_count(conv_m.get()) > 1)
        {
            NGRAPH_DEBUG << "Convolution has more than one user";
            return false;
        }
        // The accumulation target's buffer is overwritten; any consumer of it
        // not dominated by the add would observe clobbered data.
        if (!is_post_dominated(inplace_input.get(), add_m.get()))
        {
            NGRAPH_DEBUG << "Unsafe to use in-place kernel since add's in-place input has "
                            "potential live users";
            return false;
        }
        // Parameters are caller-owned buffers and must never be written.
        if (inplace_input->is_parameter())
        {
            NGRAPH_DEBUG
                << "Unsafe to use in-place kernel since add's in-place input is a parameter";
            return false;
        }
        auto conv_add = std::shared_ptr<Node>(new op::ConvolutionAdd(conv_m, inplace_input, false));
        ngraph::replace_node(m.get_match_root(), conv_add);
        return true;
    };
    auto m = std::make_shared<pattern::Matcher>(padd, callback, "conv_add");
    this->add_matcher(m);
}
// Registers a matcher that folds a trailing Relu into an existing
// ConvolutionAdd by rebuilding the node with with_relu = true.
// Runs after construct_conv_add(), which creates the ConvolutionAdd nodes.
void ngraph::runtime::cpu::pass::CPUFusion::construct_conv_add_relu()
{
    Shape shape{2, 2, 1, 1};
    auto data_batch = std::make_shared<pattern::op::Label>(element::f32, shape);
    auto filters = std::make_shared<pattern::op::Label>(element::f32, shape);
    auto add_input = std::make_shared<pattern::op::Label>(element::f32, shape);
    auto pconv = std::make_shared<op::ConvolutionAdd>(data_batch,
                                                      filters,
                                                      add_input,
                                                      Strides{1, 1},
                                                      Strides{1, 1},
                                                      CoordinateDiff{0, 0},
                                                      CoordinateDiff{0, 0},
                                                      Strides{1, 1},
                                                      false);
    auto prelu = std::make_shared<op::Relu>(pconv);
    pattern::graph_rewrite_callback callback = [](pattern::Matcher& m) {
        NGRAPH_DEBUG << "In a callback for construct_conv_add_relu against "
                     << m.get_match_root()->get_name();
        // Match root is the Relu; its argument is the ConvolutionAdd.
        auto conv_m =
            std::dynamic_pointer_cast<op::ConvolutionAdd>(m.get_match_root()->get_argument(0));
        // If anyone besides the Relu reads the conv output, the fusion would
        // hide that value — bail out.
        if (conv_m->get_users().size() > 1)
        {
            NGRAPH_DEBUG << "Convolution has more than one user";
            return false;
        }
        // ConvolutionAdd created only if it can run with MKLDNN.
        // No further checks needed.
        auto conv_n = std::make_shared<op::ConvolutionAdd>(conv_m->get_argument(0),
                                                           conv_m->get_argument(1),
                                                           conv_m->get_argument(2),
                                                           conv_m->get_window_movement_strides(),
                                                           conv_m->get_window_dilation_strides(),
                                                           conv_m->get_padding_below(),
                                                           conv_m->get_padding_above(),
                                                           conv_m->get_data_dilation_strides(),
                                                           true);
        ngraph::replace_node(m.get_match_root(), conv_n);
        return true;
    };
    auto m = std::make_shared<pattern::Matcher>(prelu, callback, "conv_add_relu");
    this->add_matcher(m);
}
void ngraph::runtime::cpu::pass::CPUFusion::construct_conv_bias_add()
{
Shape shape{2, 2, 1, 1};
......@@ -1074,17 +1212,6 @@ void ngraph::runtime::cpu::pass::CPUFusion::construct_conv_bias_add()
return false;
}
for (auto add_user : m.get_match_root()->get_users())
{
if (add_user->is_output())
{
// TODO: Remove restriction once we handle this case in codegen
NGRAPH_DEBUG
<< "Unsafe to use in-place kernel since add's in-place output is a result";
return false;
}
}
auto conv_add =
std::shared_ptr<Node>(new op::ConvolutionBiasAdd(conv_m, inplace_input, false));
ngraph::replace_node(m.get_match_root(), conv_add);
......
......@@ -49,6 +49,12 @@ public:
CPUFusion(int fusions = ALL)
: GraphRewrite()
{
if (fusions & DIFFERENTIABLE_FUSIONS)
{
construct_conv_bias();
construct_sigmoid_multiply();
}
if (fusions & REGULAR_FUSIONS)
{
construct_matmul();
......@@ -65,12 +71,9 @@ public:
construct_conv_bias_add();
construct_conv_bias_add_relu();
construct_bounded_relu();
}
if (fusions & DIFFERENTIABLE_FUSIONS)
{
construct_conv_bias();
construct_sigmoid_multiply();
// construct_conv_add() should always be after construct_conv_bias()
construct_conv_add();
construct_conv_add_relu();
}
}
......@@ -90,5 +93,7 @@ private:
void construct_conv_bias_relu();
void construct_conv_bias_add();
void construct_conv_bias_add_relu();
void construct_conv_add();
void construct_conv_add_relu();
void construct_bounded_relu();
};
......@@ -46,6 +46,7 @@
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#include "ngraph/runtime/cpu/op/batch_norm_relu.hpp"
#include "ngraph/runtime/cpu/op/bounded_relu.hpp"
#include "ngraph/runtime/cpu/op/conv_add.hpp"
#include "ngraph/runtime/cpu/op/conv_bias.hpp"
#include "ngraph/runtime/cpu/op/conv_relu.hpp"
#include "ngraph/runtime/cpu/op/convert_layout.hpp"
......@@ -475,6 +476,26 @@ namespace ngraph
}
}
template <>
void CPULayout::LAYOUT_DECL(ngraph::op::ConvolutionAdd)
{
    // Selects MKLDNN memory descriptors for the fused conv+sum. The sum
    // operand must share the convolution output's layout because the kernel
    // accumulates into that buffer in place.
    if (mkldnn_utils::use_mkldnn_kernel(node.get()))
    {
        vector<memory::desc> i_mds;
        vector<memory::desc> o_mds;
        ConvolutionLayout<ngraph::op::ConvolutionAdd, false, false>(
            node, i_mds, o_mds);
        // Force second input to sum to use the same layout as convolution output
        i_mds.push_back(o_mds[0]);
        node = insert_input_conversions(external_function, node, i_mds);
        set_output_layouts(node, o_mds);
    }
    else
    {
        // CPUAssignment should have guaranteed MKLDNN; anything else is fatal.
        throw ngraph_error("ConvolutionAdd only supported in MKLDNN for now");
    }
}
template <>
void CPULayout::LAYOUT_DECL(ngraph::op::ConvolutionBackpropData)
{
......@@ -1608,6 +1629,8 @@ static const runtime::cpu::pass::LayoutOpMap s_dispatcher{
{TI(ngraph::op::Rnn), &runtime::cpu::pass::CPULayout::layout<ngraph::op::Rnn>},
{TI(ngraph::op::Softmax), &runtime::cpu::pass::CPULayout::layout<ngraph::op::Softmax>},
{TI(ngraph::op::BoundedRelu), &runtime::cpu::pass::CPULayout::layout<ngraph::op::BoundedRelu>},
{TI(ngraph::op::ConvolutionAdd),
&runtime::cpu::pass::CPULayout::layout<ngraph::op::ConvolutionAdd>},
{TI(ngraph::op::Dequantize), &runtime::cpu::pass::CPULayout::layout<ngraph::op::Dequantize>},
};
......
......@@ -50,6 +50,7 @@
#include "ngraph/runtime/cpu/op/batch_dot.hpp"
#include "ngraph/runtime/cpu/op/batch_norm_relu.hpp"
#include "ngraph/runtime/cpu/op/bounded_relu.hpp"
#include "ngraph/runtime/cpu/op/conv_add.hpp"
#include "ngraph/runtime/cpu/op/conv_bias.hpp"
#include "ngraph/runtime/cpu/op/conv_relu.hpp"
#include "ngraph/runtime/cpu/op/convert_layout.hpp"
......@@ -924,7 +925,7 @@ TEST(cpu_fusion, fuse_conv_bias_add)
ASSERT_EQ(count_ops_of_type<op::ConvolutionBiasAdd>(func_nofuse1), 0);
pass_manager.run_passes(func_nofuse2);
ASSERT_EQ(count_ops_of_type<op::ConvolutionBiasAdd>(func_nofuse2), 0);
ASSERT_EQ(count_ops_of_type<op::ConvolutionBiasAdd>(func_nofuse2), 1);
}
TEST(cpu_fusion, conv_bias_add)
......@@ -942,6 +943,61 @@ TEST(cpu_fusion, conv_bias_add)
EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0)));
}
// ConvolutionAdd relies on an in-place fused MKLDNN kernel.
// Need to ensure that it is fused only when in-place buffer allocation is feasible
// Builds the Convolution->Add test graph used by the conv_add fusion tests.
// param_input:   when true, the add's other operand is a bare Parameter
//                (fusion must refuse: parameters can't be written in place).
// result_output: when true, the add itself is the function result
//                (exercises the in-place-output path).
shared_ptr<Function> gen_conv_add(bool param_input, bool result_output)
{
    auto data = make_shared<op::Parameter>(element::f32, Shape{2, 1, 2, 2});
    auto kernel = make_shared<op::Parameter>(element::f32, Shape{1, 1, 1, 1});
    auto conv = make_shared<op::Convolution>(data, kernel, Strides{1, 1}, Strides{1, 1});
    auto addend = make_shared<op::Parameter>(element::f32, Shape{2, 1, 2, 2});
    auto abs_addend = make_shared<op::Abs>(addend);

    // Add either the raw parameter or a computed (non-parameter) value.
    shared_ptr<op::Add> sum;
    if (param_input)
    {
        sum = make_shared<op::Add>(conv, addend);
    }
    else
    {
        sum = make_shared<op::Add>(conv, abs_addend);
    }
    auto post_abs = make_shared<op::Abs>(sum);

    if (result_output)
    {
        return make_shared<Function>(sum, op::ParameterVector{data, kernel, addend});
    }
    return make_shared<Function>(post_abs, op::ParameterVector{data, kernel, addend});
}
// Verifies when the conv+add fusion fires:
//  - plain case (computed add input, non-result add) fuses;
//  - a Parameter as the add input blocks fusion (can't clobber caller memory);
//  - the add feeding a Result still fuses (one ConvolutionAdd expected).
TEST(cpu_fusion, fuse_conv_add)
{
    auto func_fuse = gen_conv_add(false, false);
    auto func_nofuse1 = gen_conv_add(true, false);
    auto func_nofuse2 = gen_conv_add(false, true);
    pass::Manager pass_manager;
    pass_manager.register_pass<runtime::cpu::pass::CPUFusion>();
    pass_manager.run_passes(func_fuse);
    ASSERT_EQ(count_ops_of_type<op::ConvolutionAdd>(func_fuse), 1);
    pass_manager.run_passes(func_nofuse1);
    ASSERT_EQ(count_ops_of_type<op::ConvolutionAdd>(func_nofuse1), 0);
    pass_manager.run_passes(func_nofuse2);
    ASSERT_EQ(count_ops_of_type<op::ConvolutionAdd>(func_nofuse2), 1);
}
// Numerical check: the CPU backend (with the ConvolutionAdd fusion) must
// match the INTERPRETER reference on the same inputs, for both the fusable
// graph and the result-output variant.
TEST(cpu_fusion, conv_add)
{
    auto int_f = gen_conv_add(false, false);
    auto cpu_f = gen_conv_add(false, false);
    // args: {data batch (2x1x2x2), filter (1x1x1x1), add input (2x1x2x2)}.
    vector<vector<float>> args{{1.25f, 2.25f, 5.25f, 6.25f, -1.25f, -1.25f, 3.25f, -4.25f},
                               {-1.25f},
                               {1.25f, 2.25f, -3.25f, 2.25f, 4.25f, 4.25f, 1.25f, 2.25f}};
    auto int_results = execute(int_f, args, "INTERPRETER");
    auto cpu_results = execute(cpu_f, args, "CPU");
    EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0)));

    int_f = gen_conv_add(false, true);
    cpu_f = gen_conv_add(false, true);
    int_results = execute(int_f, args, "INTERPRETER");
    cpu_results = execute(cpu_f, args, "CPU");
    EXPECT_TRUE(test::all_close(cpu_results.at(0), int_results.at(0)));
}
std::vector<shared_ptr<runtime::TensorView>>
rnn_matrix_fusion_eval(const size_t time_steps,
const Shape& data_shape,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment