Commit 4dabd001 authored by Nishant Patel, committed by Robert Kimball

Add Quantized conv+relu (#1664)

parent f8a084ac
@@ -27,7 +27,6 @@ set(SRC
cpu_tensor_view.cpp
cpu_tracing.cpp
cpu_visualize_tree.cpp
quantization_util.cpp
builder/add.cpp
builder/allreduce.cpp
builder/avg_pool.cpp
@@ -95,6 +94,7 @@ set(SRC
op/max_pool_with_indices.cpp
op/quantized_max_pool.cpp
op/quantized_avg_pool.cpp
op/quantized_conv_relu.cpp
op/rnn.cpp
op/sigmoid_mul.cpp
op/conv_add.cpp
......
@@ -19,6 +19,7 @@
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/mkldnn_invoke.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_relu.hpp"
using namespace std;
using namespace ngraph;
@@ -67,7 +68,49 @@ namespace ngraph
throw ngraph_error("unsupported parameters for QuantizedConvolution via DEX");
}
}
template <>
void Builder::BUILDER_DECL(ngraph::op::QuantizedConvolutionRelu)
{
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
auto qconvolution_relu =
static_cast<const ngraph::op::QuantizedConvolutionRelu*>(node);
auto& functors = external_function->get_functors();
auto& arg0_tensor = external_function->get_tensor_data(args[0].get_name());
auto& arg1_tensor = external_function->get_tensor_data(args[1].get_name());
auto& out_tensor = external_function->get_tensor_data(out[0].get_name());
auto& out1_tensor = external_function->get_tensor_data(out[1].get_name());
auto& out2_tensor = external_function->get_tensor_data(out[2].get_name());
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto conv_index =
mkldnn_emitter->build_convolution<ngraph::op::QuantizedConvolutionRelu>(
node, args, out);
auto& deps = mkldnn_emitter->get_primitive_deps(conv_index);
float min_freezed_output = qconvolution_relu->get_freezed_output_min();
float max_freezed_output = qconvolution_relu->get_freezed_output_max();
auto functor = [&, conv_index, min_freezed_output, max_freezed_output](
CPURuntimeContext* ctx) {
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], arg1_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[2], out_tensor);
*(static_cast<float*>(out1_tensor)) = min_freezed_output;
*(static_cast<float*>(out2_tensor)) = max_freezed_output;
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, conv_index);
};
functors.emplace_back(functor);
}
else
{
throw ngraph_error(
"unsupported parameters for QuantizedConvolutionRelu via DEX");
}
}
REGISTER_OP_BUILDER(QuantizedConvolution);
REGISTER_OP_BUILDER(QuantizedConvolutionRelu);
}
}
}
@@ -109,6 +109,7 @@
#include "ngraph/runtime/cpu/op/matmul_bias.hpp"
#include "ngraph/runtime/cpu/op/max_pool_with_indices.hpp"
#include "ngraph/runtime/cpu/op/quantized_avg_pool.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_relu.hpp"
#include "ngraph/runtime/cpu/op/quantized_max_pool.hpp"
#include "ngraph/runtime/cpu/op/rnn.hpp"
#include "ngraph/runtime/cpu/op/sigmoid.hpp"
@@ -2657,6 +2658,38 @@ namespace ngraph
}
}
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::QuantizedConvolutionRelu)
{
auto qconvolution_relu =
static_cast<const ngraph::op::QuantizedConvolutionRelu*>(node);
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto conv_index =
mkldnn_emitter->build_convolution<ngraph::op::QuantizedConvolutionRelu>(
node, args, out);
auto& deps = mkldnn_emitter->get_primitive_deps(conv_index);
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[0])
<< ", " << args[0].get_name() << ");\n";
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[1])
<< ", " << args[1].get_name() << ");\n";
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[2])
<< ", " << out[0].get_name() << ");\n";
writer << "*(" << out[1].get_name()
<< ") = " << qconvolution_relu->get_freezed_output_min() << ";\n";
writer << "*(" << out[2].get_name()
<< ") = " << qconvolution_relu->get_freezed_output_max() << ";\n";
writer << "cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, "
<< to_string(conv_index) << ");\n";
}
else
{
throw ngraph_error("unsupported parameters for QuantizedConvolutionRelu");
}
}
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::QuantizedConvolution)
{
......
@@ -152,6 +152,7 @@
#include "ngraph/runtime/cpu/op/quantize.hpp"
#include "ngraph/runtime/cpu/op/quantized_avg_pool.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_relu.hpp"
#include "ngraph/runtime/cpu/op/quantized_max_pool.hpp"
#include "ngraph/runtime/cpu/op/rnn.hpp"
#include "ngraph/runtime/cpu/op/sigmoid.hpp"
@@ -310,6 +311,8 @@ static const runtime::cpu::OpMap dispatcher{
{TI(ngraph::op::ConvolutionRelu), &runtime::cpu::CPU_Emitter::emit<op::ConvolutionRelu>},
{TI(ngraph::op::QuantizedConvolution),
&runtime::cpu::CPU_Emitter::emit<op::QuantizedConvolution>},
{TI(ngraph::op::QuantizedConvolutionRelu),
&runtime::cpu::CPU_Emitter::emit<op::QuantizedConvolutionRelu>},
{TI(ngraph::op::ConvolutionBiasAdd), &runtime::cpu::CPU_Emitter::emit<op::ConvolutionBiasAdd>},
// conv+bias backprop for data shares the same implementation as ConvolutionBackpropData
{TI(ngraph::op::ConvolutionBiasBackpropFiltersBias),
......
@@ -281,7 +281,8 @@ size_t MKLDNNEmitter::build_quantized_convolution(const mkldnn::memory::desc& in
const ngraph::Strides& dilation_strides,
const ngraph::CoordinateDiff& padding_below,
const ngraph::CoordinateDiff& padding_above,
const float scale)
const float scale,
const mkldnn::post_ops& pops)
{
size_t input_data_index = build_memory_primitive(input_data_desc);
size_t weights_index = build_memory_primitive(weights_desc);
@@ -289,6 +290,7 @@ size_t MKLDNNEmitter::build_quantized_convolution(const mkldnn::memory::desc& in
std::vector<float> output_scale;
output_scale.push_back(scale);
mkldnn::primitive_attr conv_attr;
conv_attr.set_post_ops(pops);
/* Specify the rounding mode */
conv_attr.set_int_output_round_mode(mkldnn::round_mode::round_nearest);
/* Specify the scales array and corresponding mask */
......
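
The new pops parameter is how the fused ReLU reaches the quantized convolution primitive via conv_attr.set_post_ops(pops). The call site that assembles it is elided from this hunk; with the MKL-DNN 0.x post-ops API it would presumably look like the following sketch (function name and values are illustrative, not taken from this commit):

#include <mkldnn.hpp>

// Sketch (not from this commit): assemble the ReLU post-op that
// build_quantized_convolution now accepts through its pops parameter.
static mkldnn::post_ops make_relu_post_ops()
{
    mkldnn::post_ops pops;
    const float ops_scale = 1.f; // scale applied to the post-op output
    const float ops_alpha = 0.f; // ReLU negative slope
    const float ops_beta = 0.f;  // unused by eltwise_relu
    pops.append_eltwise(ops_scale, mkldnn::algorithm::eltwise_relu, ops_alpha, ops_beta);
    return pops;
}
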
@@ -33,6 +33,7 @@
#include "ngraph/runtime/cpu/op/conv_bias.hpp"
#include "ngraph/runtime/cpu/op/conv_relu.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_relu.hpp"
#include "ngraph/runtime/cpu/quantization_util.hpp"
#include "ngraph/shape.hpp"
#include "ngraph/strides.hpp"
@@ -106,14 +107,16 @@ namespace ngraph
const ngraph::CoordinateDiff& padding_above,
const mkldnn::post_ops& pops = mkldnn::post_ops());
size_t build_quantized_convolution(const mkldnn::memory::desc& input_data_desc,
const mkldnn::memory::desc& weights_desc,
const mkldnn::memory::desc& result_desc,
const ngraph::Strides& strides,
const ngraph::Strides& dilation_strides,
const ngraph::CoordinateDiff& padding_below,
const ngraph::CoordinateDiff& padding_above,
const float scale);
size_t
build_quantized_convolution(const mkldnn::memory::desc& input_data_desc,
const mkldnn::memory::desc& weights_desc,
const mkldnn::memory::desc& result_desc,
const ngraph::Strides& strides,
const ngraph::Strides& dilation_strides,
const ngraph::CoordinateDiff& padding_below,
const ngraph::CoordinateDiff& padding_above,
const float scale,
const mkldnn::post_ops& pops = mkldnn::post_ops());
template <typename OP>
size_t build_convolution(const ngraph::Node* node,
@@ -170,6 +173,10 @@ namespace ngraph
{
return true;
}
if (dynamic_cast<const ngraph::op::QuantizedConvolutionRelu*>(node))
{
return true;
}
return false;
};
@@ -198,7 +205,24 @@ namespace ngraph
}
else if (std::is_same<OP, ngraph::op::QuantizedConvolution>())
{
const float scale = quantization_util::get_scale(node);
const float scale =
quantization_util::get_scale<ngraph::op::QuantizedConvolution>(node);
return build_quantized_convolution(
data_desc,
weights_desc,
result_desc,
convolution->get_window_movement_strides(),
window_dilation_strides_adjusted,
convolution->get_padding_below(),
convolution->get_padding_above(),
scale,
ops);
}
else if (std::is_same<OP, ngraph::op::QuantizedConvolutionRelu>())
{
const float scale =
quantization_util::get_scale<ngraph::op::QuantizedConvolutionRelu>(
node);
return build_quantized_convolution(
data_desc,
weights_desc,
@@ -207,7 +231,8 @@ namespace ngraph
window_dilation_strides_adjusted,
convolution->get_padding_below(),
convolution->get_padding_above(),
scale);
scale,
ops);
}
else
{
......
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include <numeric>
#include "ngraph/op/constant.hpp"
#include "ngraph/op/get_output_element.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_relu.hpp"
#include "ngraph/util.hpp"
using namespace std;
using namespace ngraph;
op::QuantizedConvolutionRelu::QuantizedConvolutionRelu(
const std::shared_ptr<op::QuantizedConvolution>& qconv)
: Op("QuantizedConvolutionRelu",
check_single_output_args({qconv->get_argument(0),
qconv->get_argument(1),
qconv->get_argument(2),
qconv->get_argument(3),
qconv->get_argument(4),
qconv->get_argument(5),
qconv->get_argument(6),
qconv->get_argument(7)}))
, m_window_movement_strides(qconv->get_window_movement_strides())
, m_window_dilation_strides(qconv->get_window_dilation_strides())
, m_padding_below(qconv->get_padding_below())
, m_padding_above(qconv->get_padding_above())
, m_data_dilation_strides(qconv->get_data_dilation_strides())
{
constructor_validate_and_infer_types();
this->m_input_min = qconv->get_input_min();
this->m_input_max = qconv->get_input_max();
this->m_filter_min = qconv->get_filter_min();
this->m_filter_max = qconv->get_filter_max();
this->m_freezed_output_min = qconv->get_freezed_output_min();
this->m_freezed_output_max = qconv->get_freezed_output_max();
set_output_size(3);
set_output_type(0, element::u8, qconv->get_shape());
set_output_type(1, element::f32, Shape{1});
set_output_type(2, element::f32, Shape{1});
}
op::QuantizedConvolutionRelu::QuantizedConvolutionRelu(
const std::shared_ptr<Node>& data_batch,
const std::shared_ptr<Node>& filters,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const CoordinateDiff& padding_below,
const CoordinateDiff& padding_above,
const Strides& data_dilation_strides,
const std::shared_ptr<Node> min_input,
const std::shared_ptr<Node> max_input,
const std::shared_ptr<Node> min_filter,
const std::shared_ptr<Node> max_filter,
const std::shared_ptr<Node> min_freezed_output,
const std::shared_ptr<Node> max_freezed_output)
: Op("QuantizedConvolutionRelu",
check_single_output_args({data_batch,
filters,
min_input,
max_input,
min_filter,
max_filter,
min_freezed_output,
max_freezed_output}))
, m_window_movement_strides(window_movement_strides)
, m_window_dilation_strides(window_dilation_strides)
, m_padding_below(padding_below)
, m_padding_above(padding_above)
, m_data_dilation_strides(data_dilation_strides)
{
constructor_validate_and_infer_types();
auto& data_batch_shape = data_batch->get_shape();
auto& filters_shape = filters->get_shape();
auto min_input_const_op = std::static_pointer_cast<ngraph::op::Constant>(min_input);
auto max_input_const_op = std::static_pointer_cast<ngraph::op::Constant>(max_input);
auto min_filter_const_op = std::static_pointer_cast<ngraph::op::Constant>(min_filter);
auto max_filter_const_op = std::static_pointer_cast<ngraph::op::Constant>(max_filter);
auto min_freezed_output_const_op =
std::static_pointer_cast<ngraph::op::Constant>(min_freezed_output);
auto max_freezed_output_const_op =
std::static_pointer_cast<ngraph::op::Constant>(max_freezed_output);
float input_min = *(static_cast<float const*>(min_input_const_op->get_data_ptr()));
float input_max = *(static_cast<float const*>(max_input_const_op->get_data_ptr()));
float filter_min = *(static_cast<float const*>(min_filter_const_op->get_data_ptr()));
float filter_max = *(static_cast<float const*>(max_filter_const_op->get_data_ptr()));
float output_min = *(static_cast<float const*>(min_freezed_output_const_op->get_data_ptr()));
float output_max = *(static_cast<float const*>(max_freezed_output_const_op->get_data_ptr()));
this->m_input_min = input_min;
this->m_input_max = input_max;
this->m_filter_min = filter_min;
this->m_filter_max = filter_max;
this->m_freezed_output_min = output_min;
this->m_freezed_output_max = output_max;
set_output_size(3);
set_output_type(0,
element::u8,
util::infer_convolution_output_shape(this,
data_batch_shape,
filters_shape,
window_movement_strides,
window_dilation_strides,
padding_below,
padding_above,
data_dilation_strides,
0, /* batch_axis_data, */
1, /* input_channel_axis_data, */
1, /* input_channel_axis_filters, */
0, /* output_channel_axis_filters, */
0, /* batch_axis_result, */
1 /* output_channel_axis_result, */
));
set_output_type(1, element::f32, Shape{1});
set_output_type(2, element::f32, Shape{1});
}
std::shared_ptr<Node>
op::QuantizedConvolutionRelu::copy_with_new_args(const NodeVector& new_args) const
{
if (new_args.size() != 8)
{
throw ngraph_error("Incorrect number of new arguments");
}
return std::shared_ptr<Node>(new QuantizedConvolutionRelu(new_args.at(0),
new_args.at(1),
get_window_movement_strides(),
get_window_dilation_strides(),
get_padding_below(),
get_padding_above(),
get_data_dilation_strides(),
new_args.at(2),
new_args.at(3),
new_args.at(4),
new_args.at(5),
new_args.at(6),
new_args.at(7)));
}
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include "ngraph/op/op.hpp"
#include "ngraph/runtime/cpu/op/conv_bias.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv.hpp"
namespace ngraph
{
namespace op
{
/// \brief Fused Relu(Convolution) forward prop for a batched quantized convolution operation.
class QuantizedConvolutionRelu : public Op
{
public:
QuantizedConvolutionRelu(const std::shared_ptr<op::QuantizedConvolution>& qconv);
QuantizedConvolutionRelu(const std::shared_ptr<Node>& data_batch,
const std::shared_ptr<Node>& filters,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const CoordinateDiff& padding_below,
const CoordinateDiff& padding_above,
const Strides& data_dilation_strides,
const std::shared_ptr<Node> min_input,
const std::shared_ptr<Node> max_input,
const std::shared_ptr<Node> min_filter,
const std::shared_ptr<Node> max_filter,
const std::shared_ptr<Node> min_freezed_output,
const std::shared_ptr<Node> max_freezed_output);
const Strides& get_window_movement_strides() const { return m_window_movement_strides; }
const Strides& get_window_dilation_strides() const { return m_window_dilation_strides; }
const CoordinateDiff& get_padding_below() const { return m_padding_below; }
const CoordinateDiff& get_padding_above() const { return m_padding_above; }
const Strides& get_data_dilation_strides() const { return m_data_dilation_strides; }
float get_input_min() const { return m_input_min; }
float get_input_max() const { return m_input_max; }
float get_filter_min() const { return m_filter_min; }
float get_filter_max() const { return m_filter_max; }
float get_freezed_output_min() const { return m_freezed_output_min; }
float get_freezed_output_max() const { return m_freezed_output_max; }
std::shared_ptr<Node> get_filters() { return get_argument(1); }
std::shared_ptr<Node> get_data_batch() { return get_argument(0); }
virtual std::shared_ptr<Node>
copy_with_new_args(const NodeVector& new_args) const override;
protected:
Strides m_window_movement_strides;
Strides m_window_dilation_strides;
CoordinateDiff m_padding_below;
CoordinateDiff m_padding_above;
Strides m_data_dilation_strides;
float m_input_min;
float m_input_max;
float m_filter_min;
float m_filter_max;
float m_freezed_output_min;
float m_freezed_output_max;
};
}
}
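
The constructor that takes an existing QuantizedConvolution hints at the intended fusion flow: match a Relu applied to output 0 of a QuantizedConvolution and rebuild it as the fused op. This commit does not include such a pass; a hypothetical sketch of the rewrite step, with qconv and relu_match assumed to come from a pattern matcher (includes for the ops and ngraph::replace_node assumed):

// Hypothetical fusion rewrite; names are illustrative, not from this commit.
static void fuse_qconv_relu(const std::shared_ptr<ngraph::op::QuantizedConvolution>& qconv,
                            const std::shared_ptr<ngraph::Node>& relu_match)
{
    auto qconv_relu = std::make_shared<ngraph::op::QuantizedConvolutionRelu>(qconv);
    // Output 0 of the fused op stands in for the matched Relu's single output;
    // outputs 1 and 2 (the frozen min/max) are re-exposed via GetOutputElement.
    auto data = std::make_shared<ngraph::op::GetOutputElement>(qconv_relu, 0);
    ngraph::replace_node(relu_match, data);
}
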
@@ -50,6 +50,7 @@
#include "ngraph/runtime/cpu/op/quantize.hpp"
#include "ngraph/runtime/cpu/op/quantized_avg_pool.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_relu.hpp"
#include "ngraph/runtime/cpu/op/quantized_max_pool.hpp"
#include "ngraph/runtime/cpu/op/rnn.hpp"
#include "ngraph/runtime/cpu/op/sigmoid.hpp"
@@ -759,6 +760,20 @@ namespace ngraph
}
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::QuantizedConvolutionRelu)
{
if (node->get_input_element_type(0) == element::u8 &&
node->get_input_element_type(1) == element::i8)
{
auto quantized_conv_relu = static_cast<op::QuantizedConvolutionRelu*>(node);
auto op_annotations =
std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
op_annotations->set_mkldnn_op(true);
quantized_conv_relu->set_op_annotations(op_annotations);
}
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::Quantize)
{
@@ -838,6 +853,8 @@ static const runtime::cpu::pass::AssignOpMap s_dispatcher{
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::ConvolutionAdd>},
{TI(ngraph::op::Dequantize),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Dequantize>},
{TI(ngraph::op::QuantizedConvolutionRelu),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::QuantizedConvolutionRelu>},
};
bool runtime::cpu::pass::CPUAssignment::run_on_call_graph(
......
@@ -58,6 +58,7 @@
#include "ngraph/runtime/cpu/op/quantize.hpp"
#include "ngraph/runtime/cpu/op/quantized_avg_pool.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_relu.hpp"
#include "ngraph/runtime/cpu/op/quantized_max_pool.hpp"
#include "ngraph/runtime/cpu/op/rnn.hpp"
@@ -499,6 +500,51 @@ namespace ngraph
}
}
template <>
void CPULayout::LAYOUT_DECL(ngraph::op::QuantizedConvolutionRelu)
{
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node.get()))
{
vector<memory::desc> i_mds;
vector<memory::desc> o_mds;
ConvolutionLayout<ngraph::op::QuantizedConvolutionRelu, false, false>(
node, i_mds, o_mds);
auto min_input_md = mkldnn_utils::create_default_mkldnn_md(
node.get(), 2, false, memory::format::x);
auto max_input_md = mkldnn_utils::create_default_mkldnn_md(
node.get(), 3, false, memory::format::x);
auto min_filter_md = mkldnn_utils::create_default_mkldnn_md(
node.get(), 4, false, memory::format::x);
auto max_filter_md = mkldnn_utils::create_default_mkldnn_md(
node.get(), 5, false, memory::format::x);
auto min_freezed_output_md = mkldnn_utils::create_default_mkldnn_md(
node.get(), 6, false, memory::format::x);
auto max_freezed_output_md = mkldnn_utils::create_default_mkldnn_md(
node.get(), 7, false, memory::format::x);
auto min_output_md = mkldnn_utils::create_default_mkldnn_md(
node.get(), 1, true, memory::format::x);
auto max_output_md = mkldnn_utils::create_default_mkldnn_md(
node.get(), 2, true, memory::format::x);
i_mds.push_back(min_input_md);
i_mds.push_back(max_input_md);
i_mds.push_back(min_filter_md);
i_mds.push_back(max_filter_md);
i_mds.push_back(min_freezed_output_md);
i_mds.push_back(max_freezed_output_md);
o_mds.push_back(min_output_md);
o_mds.push_back(max_output_md);
node = insert_input_conversions(external_function, node, i_mds);
set_output_layouts(node, o_mds);
}
else
{
set_native_layouts(external_function, node);
}
}
template <>
void CPULayout::LAYOUT_DECL(ngraph::op::ConvolutionBiasAdd)
{
@@ -1842,6 +1888,8 @@ static const runtime::cpu::pass::LayoutOpMap s_dispatcher{
{TI(ngraph::op::Dequantize), &runtime::cpu::pass::CPULayout::layout<ngraph::op::Dequantize>},
{TI(ngraph::op::Slice), &runtime::cpu::pass::CPULayout::layout<ngraph::op::Slice>},
{TI(ngraph::op::Quantize), &runtime::cpu::pass::CPULayout::layout<ngraph::op::Quantize>},
{TI(ngraph::op::QuantizedConvolutionRelu),
&runtime::cpu::pass::CPULayout::layout<ngraph::op::QuantizedConvolutionRelu>},
};
bool runtime::cpu::pass::CPULayout::run_on_call_graph(const std::list<std::shared_ptr<Node>>& nodes)
......
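
For orientation, the index arguments passed to create_default_mkldnn_md in the layout handler above follow the op's port order, which can be read off the constructor and set_output_type calls earlier in this commit:

// QuantizedConvolutionRelu port map (derived from the op definition above):
//   inputs : 0 data (u8), 1 filters (i8), 2 min_input, 3 max_input,
//            4 min_filter, 5 max_filter, 6 min_freezed_output,
//            7 max_freezed_output   -- the six scalars use memory::format::x
//   outputs: 0 data (u8), 1 min (f32, Shape{1}), 2 max (f32, Shape{1})
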
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "quantization_util.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace quantization_util
{
float get_scale(const ngraph::Node* node)
{
auto qconvolution = static_cast<const ngraph::op::QuantizedConvolution*>(node);
float min_out_value;
float max_out_value;
quantization_range_for_multiplication<uint8_t, int8_t, int32_t>(
qconvolution->get_input_min(),
qconvolution->get_input_max(),
qconvolution->get_filter_min(),
qconvolution->get_filter_max(),
&min_out_value,
&max_out_value);
const float max_abs32 =
std::max(std::abs(min_out_value), std::abs(max_out_value));
const float max_abs8 =
std::max(std::abs(qconvolution->get_freezed_output_min()),
std::abs(qconvolution->get_freezed_output_max()));
// Output is signed int.
// s32 = f32 * std::pow(2, 31)/ max_abs32;
// s8 = f32 * std::pow(2, 7)/ max_abs8;
// s8 = s32 * std::pow(2, -24) * max_abs32 / max_abs8;
const float scale = static_cast<float>(
(std::pow(2, -24) * static_cast<double>(max_abs32 / max_abs8)));
return scale;
}
}
}
}
}
@@ -89,7 +89,32 @@ namespace ngraph
quant_util.push_back(scale);
}
float get_scale(const ngraph::Node* node);
template <typename OP>
float get_scale(const ngraph::Node* node)
{
auto qconvolution = static_cast<const OP*>(node);
float min_out_value;
float max_out_value;
quantization_range_for_multiplication<uint8_t, int8_t, int32_t>(
qconvolution->get_input_min(),
qconvolution->get_input_max(),
qconvolution->get_filter_min(),
qconvolution->get_filter_max(),
&min_out_value,
&max_out_value);
const float max_abs32 =
std::max(std::abs(min_out_value), std::abs(max_out_value));
const float max_abs8 =
std::max(std::abs(qconvolution->get_freezed_output_min()),
std::abs(qconvolution->get_freezed_output_max()));
// Output is signed int.
// s32 = f32 * std::pow(2, 31)/ max_abs32;
// s8 = f32 * std::pow(2, 7)/ max_abs8;
// s8 = s32 * std::pow(2, -24) * max_abs32 / max_abs8;
const float scale = static_cast<float>(
(std::pow(2, -24) * static_cast<double>(max_abs32 / max_abs8)));
return scale;
}
}
}
}
......
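
For readers tracing the 2^-24 constant in get_scale: it falls out of relating the two symmetric fixed-point encodings named in the comment. Spelling out the algebra (a restatement of the in-code comment, nothing beyond it):

// Let r be the real value carried by the int32 accumulator:
//   s32 = r * 2^31 / max_abs32   =>   r = s32 * max_abs32 / 2^31
//   s8  = r * 2^7  / max_abs8
// Substituting r into the second line:
//   s8 = s32 * (max_abs32 / 2^31) * (2^7 / max_abs8)
//      = s32 * 2^(7-31) * max_abs32 / max_abs8
//      = s32 * 2^-24    * max_abs32 / max_abs8
// which is exactly the requantization scale handed to MKL-DNN:
//   scale = 2^-24 * max_abs32 / max_abs8
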
@@ -27,6 +27,7 @@
#include "ngraph/runtime/cpu/op/quantize.hpp"
#include "ngraph/runtime/cpu/op/quantized_avg_pool.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_relu.hpp"
#include "ngraph/runtime/cpu/op/quantized_max_pool.hpp"
#include "util/all_close.hpp"
#include "util/all_close_f.hpp"
@@ -331,3 +332,100 @@ TEST(quantize_cpu, quantize_to_int8)
EXPECT_EQ((vector<float>{-127}), read_vector<float>(result_min));
EXPECT_EQ((vector<float>{127}), read_vector<float>(result_max));
}
TEST(quantize_cpu, quantizedConv2D_with_relu)
{
Shape shape_a{1, 1, 3, 4}; // input shape
Shape shape_b{1, 1, 3, 3}; // filter shape
Shape shape_r{1, 1, 3, 4}; // output shape
vector<uint8_t> a_data = {1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4};
vector<int8_t> b_data = {1, 2, 3, 4, 5, 0, 0, 1, 2};
auto A = make_shared<op::Parameter>(element::u8, shape_a);
auto B = make_shared<op::Parameter>(element::i8, shape_b);
auto C = op::Constant::create(element::f32, Shape{1}, {0.0f});
auto D = op::Constant::create(element::f32, Shape{1}, {255.0f});
auto E = op::Constant::create(element::f32, Shape{1}, {-127.0f});
auto F = op::Constant::create(element::f32, Shape{1}, {127.0f});
auto G = op::Constant::create(element::f32, Shape{1}, {22.0f});
auto H = op::Constant::create(element::f32, Shape{1}, {90.0f});
auto CV = make_shared<op::QuantizedConvolutionRelu>(A,
B,
Strides{1, 1}, // move_strides
Strides{1, 1}, // filter_dilation
CoordinateDiff{1, 1}, // below_pads
CoordinateDiff{1, 1}, // above_pads
Strides{1, 1}, // data_dilation
C,
D,
E,
F,
G,
H);
auto output_data = std::make_shared<op::GetOutputElement>(CV, 0);
auto output_min = std::make_shared<op::GetOutputElement>(CV, 1);
auto output_max = std::make_shared<op::GetOutputElement>(CV, 2);
auto f = make_shared<Function>(NodeVector{output_data, output_min, output_max},
op::ParameterVector{A, B});
auto backend = runtime::Backend::create("CPU");
// Create some tensors for input/output
auto a = backend->create_tensor(element::u8, shape_a);
copy_data(a, a_data);
auto b = backend->create_tensor(element::i8, shape_b);
copy_data(b, b_data);
auto result = backend->create_tensor(element::u8, shape_r);
auto result_min = backend->create_tensor(element::f32, Shape{1});
auto result_max = backend->create_tensor(element::f32, Shape{1});
backend->call_with_validate(f, {result, result_min, result_max}, {a, b});
EXPECT_EQ((vector<uint8_t>{31, 48, 42, 45, 54, 102, 127, 61, 47, 74, 61, 55}),
read_vector<uint8_t>(result));
EXPECT_EQ((vector<float>{22.0}), read_vector<float>(result_min));
EXPECT_EQ((vector<float>{90.0}), read_vector<float>(result_max));
}
TEST(quantize_cpu, quantizedConv2D_fused_relu)
{
Shape shape_a{1, 1, 3, 3}; // input shape
Shape shape_b{1, 1, 3, 3}; // filter shape
Shape shape_r{1, 1, 3, 3}; // output shape
vector<uint8_t> a_data = {1, 2, 3, 4, 5, 6, 7, 8, 9};
vector<int8_t> b_data = {1, 2, 1, 0, 0, 0, -1, -2, -1};
auto A = make_shared<op::Parameter>(element::u8, shape_a);
auto B = make_shared<op::Parameter>(element::i8, shape_b);
auto C = op::Constant::create(element::f32, Shape{1}, {0.0f});
auto D = op::Constant::create(element::f32, Shape{1}, {255.0f});
auto E = op::Constant::create(element::f32, Shape{1}, {-127.0f});
auto F = op::Constant::create(element::f32, Shape{1}, {127.0f});
auto G = op::Constant::create(element::f32, Shape{1}, {20.0f});
auto H = op::Constant::create(element::f32, Shape{1}, {-24.0f});
auto CV = make_shared<op::QuantizedConvolutionRelu>(A,
B,
Strides{1, 1}, // move_strides
Strides{1, 1}, // filter_dilation
CoordinateDiff{1, 1}, // below_pads
CoordinateDiff{1, 1}, // above_pads
Strides{1, 1}, // data_dilation
C,
D,
E,
F,
G,
H);
auto output_data = std::make_shared<op::GetOutputElement>(CV, 0);
auto output_min = std::make_shared<op::GetOutputElement>(CV, 1);
auto output_max = std::make_shared<op::GetOutputElement>(CV, 2);
auto f = make_shared<Function>(NodeVector{output_data, output_min, output_max},
op::ParameterVector{A, B});
auto backend = runtime::Backend::create("CPU");
// Create some tensors for input/output
auto a = backend->create_tensor(element::u8, shape_a);
copy_data(a, a_data);
auto b = backend->create_tensor(element::i8, shape_b);
copy_data(b, b_data);
auto result = backend->create_tensor(element::u8, shape_r);
auto result_min = backend->create_tensor(element::f32, Shape{1});
auto result_max = backend->create_tensor(element::f32, Shape{1});
backend->call_with_validate(f, {result, result_min, result_max}, {a, b});
EXPECT_EQ((vector<uint8_t>{0, 0, 0, 0, 0, 0, 69, 106, 90}), read_vector<uint8_t>(result));
EXPECT_EQ((vector<float>{20.0}), read_vector<float>(result_min));
EXPECT_EQ((vector<float>{-24.0}), read_vector<float>(result_max));
}