Commit f30910c6 authored by Nishant Patel, committed by Robert Kimball

Switch to scale and offset design from min and max for Quantization (#1789)

* Switch to scale and offset design from min and max for Quantization

* Remove offset and make the quantize ops single-output ops

* move cpu QuantOps to core and create builders

* rebase to HEAD

* remove convbias and convbiasrelu ctors which take conv

* remove mistakenly added quantize.rst

* remove offset

* Compute scale, move quantization ops to experimental dir and some PR feedback

* Normalize license headers
parent bcfbf099
......@@ -20,6 +20,7 @@ set (SRC
autodiff/adjoints.cpp
builder/autobroadcast.cpp
builder/numpy_transpose.cpp
builder/quantization.cpp
builder/reduce_ops.cpp
coordinate.cpp
coordinate_diff.cpp
......@@ -84,6 +85,11 @@ set (SRC
op/power.cpp
op/product.cpp
op/quantize.cpp
op/experimental/quantized_avg_pool.cpp
op/experimental/quantized_conv_bias.cpp
op/experimental/quantized_conv_relu.cpp
op/experimental/quantized_conv.cpp
op/experimental/quantized_max_pool.cpp
op/reduce.cpp
op/reduce_window.cpp
op/relu.cpp
......
This diff is collapsed.
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include "ngraph/coordinate_diff.hpp"
#include "ngraph/node.hpp"
#include "ngraph/op/quantize.hpp"
namespace ngraph
{
namespace builder
{
// Builder helpers that construct quantized ops from real-value range
// information. Each builder takes min/max nodes describing float ranges and
// derives the op's quantization scale from them; the derivation lives in
// builder/quantization.cpp, not in this header.
// NOTE(review): the min/max parameters appear to be expected to be
// op::Constant nodes each holding a single float — confirm against the
// builder definitions before relying on this.

// Build a quantized average-pool over `arg`; `min`/`max` describe the float
// range of the quantized input tensor.
std::shared_ptr<Node> ScaledQuantizedAvgPool(const std::shared_ptr<Node>& arg,
const Shape& window_shape,
const Strides& window_movement_strides,
const Shape& padding_below,
const Shape& padding_above,
bool include_padding_in_avg_computation,
const std::shared_ptr<Node> min,
const std::shared_ptr<Node> max);
// Build a quantized convolution with fused bias, optionally with fused ReLU
// (`with_relu`). The three min/max pairs give the float ranges of the input,
// the filter, and the frozen (pre-computed) output respectively.
std::shared_ptr<Node>
ScaledQuantizedConvolutionBias(const std::shared_ptr<Node>& data_batch,
const std::shared_ptr<Node>& filters,
const std::shared_ptr<Node>& bias,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const CoordinateDiff& padding_below,
const CoordinateDiff& padding_above,
const Strides& data_dilation_strides,
const std::shared_ptr<Node> min_input,
const std::shared_ptr<Node> max_input,
const std::shared_ptr<Node> min_filter,
const std::shared_ptr<Node> max_filter,
const std::shared_ptr<Node> min_freezed_output,
const std::shared_ptr<Node> max_freezed_output,
const bool with_relu = false);
// Build a quantized convolution with fused ReLU. Range parameters are as in
// ScaledQuantizedConvolutionBias.
std::shared_ptr<Node>
ScaledQuantizedConvolutionRelu(const std::shared_ptr<Node>& data_batch,
const std::shared_ptr<Node>& filters,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const CoordinateDiff& padding_below,
const CoordinateDiff& padding_above,
const Strides& data_dilation_strides,
const std::shared_ptr<Node> min_input,
const std::shared_ptr<Node> max_input,
const std::shared_ptr<Node> min_filter,
const std::shared_ptr<Node> max_filter,
const std::shared_ptr<Node> min_freezed_output,
const std::shared_ptr<Node> max_freezed_output);
// Build a plain quantized convolution. Range parameters are as in
// ScaledQuantizedConvolutionBias.
std::shared_ptr<Node>
ScaledQuantizedConvolution(const std::shared_ptr<Node>& data_batch,
const std::shared_ptr<Node>& filters,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const CoordinateDiff& padding_below,
const CoordinateDiff& padding_above,
const Strides& data_dilation_strides,
const std::shared_ptr<Node> min_input,
const std::shared_ptr<Node> max_input,
const std::shared_ptr<Node> min_filter,
const std::shared_ptr<Node> max_filter,
const std::shared_ptr<Node> min_freezed_output,
const std::shared_ptr<Node> max_freezed_output);
// Build a quantized max-pool over `arg`; `min`/`max` describe the float range
// of the quantized input tensor.
std::shared_ptr<Node> ScaledQuantizedMaxPool(const std::shared_ptr<Node>& arg,
const Shape& window_shape,
const Strides& window_movement_strides,
const Shape& padding_below,
const Shape& padding_above,
const std::shared_ptr<Node> min,
const std::shared_ptr<Node> max);
} // namespace builder
} // namespace ngraph
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
//*******************************************************************************
// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//==============================================================================
#pragma once

#include <algorithm> // std::max
#include <cmath>     // std::abs(float), std::pow
#include <limits>
#include <vector>

#include "ngraph/node.hpp"
#include "ngraph/op/constant.hpp"
#include "ngraph/util.hpp"

namespace ngraph
{
    namespace builder
    {
        namespace quantization_util
        {
            /// \brief Compute the real-value range of a quantized multiplication's result.
            ///
            /// Given float ranges [min_a, max_a] and [min_b, max_b] of two tensors
            /// quantized as T1 and T2 respectively, writes into *min_c / *max_c the
            /// float range representable when the product is accumulated into the
            /// (wider) quantized type T3.
            ///
            /// \param min_a, max_a  float range of the first (T1-quantized) operand
            /// \param min_b, max_b  float range of the second (T2-quantized) operand
            /// \param min_c, max_c  out-params; must be non-null
            template <class T1, class T2, class T3>
            void quantization_range_for_multiplication(
                float min_a, float max_a, float min_b, float max_b, float* min_c, float* max_c)
            {
                // begin code copied and pasted (and modified) from
                // github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/quantization_utils.h
                // The float value of one quantization step of each input type.
                float a_one_quant_level = (max_a - min_a) / (std::numeric_limits<T1>::max() -
                                                             std::numeric_limits<T1>::min());
                float b_one_quant_level = (max_b - min_b) / (std::numeric_limits<T2>::max() -
                                                             std::numeric_limits<T2>::min());
                float c_one_quant_level = a_one_quant_level * b_one_quant_level;
                *min_c = c_one_quant_level * std::numeric_limits<T3>::min();
                *max_c = c_one_quant_level * std::numeric_limits<T3>::max();
                // end code copied and pasted (and modified) from
                // github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/quantization_utils.h
            }

            /// \brief Compute the requantization scale for a u8 (input) x i8 (filter)
            ///        -> i32 (accumulator) quantized convolution whose output is
            ///        requantized to a signed 8-bit range.
            ///
            /// All six arguments must be op::Constant nodes holding a single float;
            /// they are static_cast to op::Constant without checking, so passing any
            /// other node type is undefined behavior.
            ///
            /// Marked `inline` because this function is *defined* in a header: without
            /// `inline`, every translation unit that includes this header would emit
            /// its own external definition and linking would fail with
            /// multiple-definition (ODR) errors.
            inline float get_scale(const std::shared_ptr<Node> min_input,
                                   const std::shared_ptr<Node> max_input,
                                   const std::shared_ptr<Node> min_filter,
                                   const std::shared_ptr<Node> max_filter,
                                   const std::shared_ptr<Node> min_freezed_output,
                                   const std::shared_ptr<Node> max_freezed_output)
            {
                auto min_input_const_op = std::static_pointer_cast<ngraph::op::Constant>(min_input);
                auto max_input_const_op = std::static_pointer_cast<ngraph::op::Constant>(max_input);
                auto min_filter_const_op =
                    std::static_pointer_cast<ngraph::op::Constant>(min_filter);
                auto max_filter_const_op =
                    std::static_pointer_cast<ngraph::op::Constant>(max_filter);
                auto min_freezed_output_const_op =
                    std::static_pointer_cast<ngraph::op::Constant>(min_freezed_output);
                auto max_freezed_output_const_op =
                    std::static_pointer_cast<ngraph::op::Constant>(max_freezed_output);
                float input_min = *(static_cast<float const*>(min_input_const_op->get_data_ptr()));
                float input_max = *(static_cast<float const*>(max_input_const_op->get_data_ptr()));
                float filter_min =
                    *(static_cast<float const*>(min_filter_const_op->get_data_ptr()));
                float filter_max =
                    *(static_cast<float const*>(max_filter_const_op->get_data_ptr()));
                float output_min =
                    *(static_cast<float const*>(min_freezed_output_const_op->get_data_ptr()));
                float output_max =
                    *(static_cast<float const*>(max_freezed_output_const_op->get_data_ptr()));
                float min_out_value;
                float max_out_value;
                quantization_range_for_multiplication<uint8_t, int8_t, int32_t>(
                    input_min, input_max, filter_min, filter_max, &min_out_value, &max_out_value);

                const float max_abs32 = std::max(std::abs(min_out_value), std::abs(max_out_value));
                const float max_abs8 = std::max(std::abs(output_min), std::abs(output_max));
                // Output is signed int.
                // s32 = f32 * std::pow(2, 31)/ max_abs32;
                // s8 = f32 * std::pow(2, 7)/ max_abs8;
                // s8 = s32 * std::pow(2, -24) * max_abs32 / max_abs8;
                const float scale = static_cast<float>(
                    (std::pow(2, -24) * static_cast<double>(max_abs32 / max_abs8)));
                return scale;
            }
        } // namespace quantization_util
    }     // namespace builder
} // namespace ngraph
......@@ -26,10 +26,8 @@ op::QuantizedAvgPool::QuantizedAvgPool(const shared_ptr<Node>& arg,
const Strides& window_movement_strides,
const Shape& padding_below,
const Shape& padding_above,
bool include_padding_in_avg_computation,
const shared_ptr<Node> min,
const shared_ptr<Node> max)
: Op("QuantizedAvgPool", check_single_output_args({arg, min, max}))
bool include_padding_in_avg_computation)
: Op("QuantizedAvgPool", check_single_output_args({arg}))
, m_window_shape(window_shape)
, m_window_movement_strides(window_movement_strides)
, m_padding_below(padding_below)
......@@ -40,18 +38,7 @@ op::QuantizedAvgPool::QuantizedAvgPool(const shared_ptr<Node>& arg,
if (arg->get_element_type() != element::u8 && arg->get_element_type() != element::i8)
{
throw ngraph_error("Dequantization supported only for i8/u8!");
}
if (min->get_element_type() != max->get_element_type())
{
throw ngraph_error("Min's element type isn't equal to max's!");
}
if (!(std::dynamic_pointer_cast<op::Constant>(min) &&
std::dynamic_pointer_cast<op::Constant>(max)))
{
throw ngraph_error("Min and max have to be constants!");
throw ngraph_error("QuantizedAvgPool supported only for i8/u8!");
}
}
......@@ -211,11 +198,7 @@ void op::QuantizedAvgPool::validate_and_infer_types()
result_shape[1] = channel_count;
copy(output_item_shape.begin(), output_item_shape.end(), result_shape.begin() + 2);
set_output_size(3);
set_output_type(0, get_input_element_type(0), result_shape);
//TODO(nbpatel): Change to Shape{} once the mkldnn version is updated
set_output_type(1, element::f32, Shape{1});
set_output_type(2, element::f32, Shape{1});
}
shared_ptr<Node> op::QuantizedAvgPool::copy_with_new_args(const NodeVector& new_args) const
......@@ -226,7 +209,5 @@ shared_ptr<Node> op::QuantizedAvgPool::copy_with_new_args(const NodeVector& new_
m_window_movement_strides,
m_padding_below,
m_padding_above,
m_include_padding_in_avg_computation,
new_args.at(1),
new_args.at(2));
m_include_padding_in_avg_computation);
}
......@@ -48,9 +48,7 @@ namespace ngraph
const Strides& window_movement_strides,
const Shape& padding_below,
const Shape& padding_above,
bool include_padding_in_avg_computation,
const std::shared_ptr<Node> min,
const std::shared_ptr<Node> max);
bool include_padding_in_avg_computation);
void validate_and_infer_types() override;
virtual std::shared_ptr<Node>
copy_with_new_args(const NodeVector& new_args) const override;
......
......@@ -30,21 +30,8 @@ op::QuantizedConvolution::QuantizedConvolution(const shared_ptr<Node>& data_batc
const CoordinateDiff& padding_below,
const CoordinateDiff& padding_above,
const Strides& data_dilation_strides,
const std::shared_ptr<Node> min_input,
const std::shared_ptr<Node> max_input,
const std::shared_ptr<Node> min_filter,
const std::shared_ptr<Node> max_filter,
const std::shared_ptr<Node> min_freezed_output,
const std::shared_ptr<Node> max_freezed_output)
: Op("QuantizedConvolution",
check_single_output_args({data_batch,
filters,
min_input,
max_input,
min_filter,
max_filter,
min_freezed_output,
max_freezed_output}))
const std::shared_ptr<Node> scale)
: Op("QuantizedConvolution", check_single_output_args({data_batch, filters, scale}))
, m_window_movement_strides(window_movement_strides)
, m_window_dilation_strides(window_dilation_strides)
, m_padding_below(padding_below)
......@@ -58,29 +45,11 @@ op::QuantizedConvolution::QuantizedConvolution(const shared_ptr<Node>& data_batc
auto& data_batch_shape = data_batch->get_shape();
auto& filters_shape = filters->get_shape();
auto min_input_const_op = std::static_pointer_cast<ngraph::op::Constant>(min_input);
auto max_input_const_op = std::static_pointer_cast<ngraph::op::Constant>(max_input);
auto min_filter_const_op = std::static_pointer_cast<ngraph::op::Constant>(min_filter);
auto max_filter_const_op = std::static_pointer_cast<ngraph::op::Constant>(max_filter);
auto min_freezed_output_const_op =
std::static_pointer_cast<ngraph::op::Constant>(min_freezed_output);
auto max_freezed_output_const_op =
std::static_pointer_cast<ngraph::op::Constant>(max_freezed_output);
float input_min = *(static_cast<float const*>(min_input_const_op->get_data_ptr()));
float input_max = *(static_cast<float const*>(max_input_const_op->get_data_ptr()));
float filter_min = *(static_cast<float const*>(min_filter_const_op->get_data_ptr()));
float filter_max = *(static_cast<float const*>(max_filter_const_op->get_data_ptr()));
float output_min = *(static_cast<float const*>(min_freezed_output_const_op->get_data_ptr()));
float output_max = *(static_cast<float const*>(max_freezed_output_const_op->get_data_ptr()));
auto scale_const_op = std::static_pointer_cast<ngraph::op::Constant>(scale);
float scale_val = *(static_cast<float const*>(scale_const_op->get_data_ptr()));
this->m_input_min = input_min;
this->m_input_max = input_max;
this->m_filter_min = filter_min;
this->m_filter_max = filter_max;
this->m_freezed_output_min = output_min;
this->m_freezed_output_max = output_max;
this->m_scale = scale_val;
set_output_size(3);
set_output_type(0,
element::i8,
util::infer_convolution_output_shape(this,
......@@ -98,12 +67,10 @@ op::QuantizedConvolution::QuantizedConvolution(const shared_ptr<Node>& data_batc
0, /* batch_axis_result, */
1 /* output_channel_axis_result, */
));
set_output_type(1, element::f32, Shape{1});
set_output_type(2, element::f32, Shape{1});
}
shared_ptr<Node> op::QuantizedConvolution::copy_with_new_args(const NodeVector& new_args) const
{
if (new_args.size() != 8)
if (new_args.size() != 3)
{
throw ngraph_error("Incorrect number of new arguments");
}
......@@ -114,10 +81,5 @@ shared_ptr<Node> op::QuantizedConvolution::copy_with_new_args(const NodeVector&
get_padding_below(),
get_padding_above(),
get_data_dilation_strides(),
new_args.at(2),
new_args.at(3),
new_args.at(4),
new_args.at(5),
new_args.at(6),
new_args.at(7)));
new_args.at(2)));
}
......@@ -33,12 +33,7 @@ namespace ngraph
const CoordinateDiff& padding_below,
const CoordinateDiff& padding_above,
const Strides& data_dilation_strides,
const std::shared_ptr<Node> min_input,
const std::shared_ptr<Node> max_input,
const std::shared_ptr<Node> min_filter,
const std::shared_ptr<Node> max_filter,
const std::shared_ptr<Node> min_freezed_output,
const std::shared_ptr<Node> max_freezed_output);
const std::shared_ptr<Node> scale);
const Strides& get_window_movement_strides() const { return m_window_movement_strides; }
const Strides& get_window_dilation_strides() const { return m_window_dilation_strides; }
const CoordinateDiff& get_padding_below() const { return m_padding_below; }
......@@ -46,12 +41,7 @@ namespace ngraph
const Strides& get_data_dilation_strides() const { return m_data_dilation_strides; }
std::shared_ptr<Node> get_filters() { return get_argument(1); }
std::shared_ptr<Node> get_data_batch() { return get_argument(0); }
float get_input_min() const { return m_input_min; }
float get_input_max() const { return m_input_max; }
float get_filter_min() const { return m_filter_min; }
float get_filter_max() const { return m_filter_max; }
float get_freezed_output_min() const { return m_freezed_output_min; }
float get_freezed_output_max() const { return m_freezed_output_max; }
float get_scale() const { return m_scale; }
virtual std::shared_ptr<Node>
copy_with_new_args(const NodeVector& new_args) const override;
......@@ -61,12 +51,7 @@ namespace ngraph
CoordinateDiff m_padding_below;
CoordinateDiff m_padding_above;
Strides m_data_dilation_strides;
float m_input_min;
float m_input_max;
float m_filter_min;
float m_filter_max;
float m_freezed_output_min;
float m_freezed_output_max;
float m_scale;
};
}
}
......@@ -16,84 +16,27 @@
#include <numeric>
#include "conv_bias.hpp"
#include "quantized_conv_bias.hpp"
#include "ngraph/op/constant.hpp"
#include "ngraph/op/experimental/quantized_conv.hpp"
#include "ngraph/op/get_output_element.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv.hpp"
#include "ngraph/util.hpp"
using namespace std;
using namespace ngraph;
op::QuantizedConvolutionBias::QuantizedConvolutionBias(
const shared_ptr<op::QuantizedConvolution>& qconv,
const shared_ptr<Node>& bias,
const bool with_relu)
: Op("QuantizedConvolutionBias",
check_single_output_args({qconv->get_argument(0),
qconv->get_argument(1),
bias,
qconv->get_argument(2),
qconv->get_argument(3),
qconv->get_argument(4),
qconv->get_argument(5),
qconv->get_argument(6),
qconv->get_argument(7)}))
, m_window_movement_strides(qconv->get_window_movement_strides())
, m_window_dilation_strides(qconv->get_window_dilation_strides())
, m_padding_below(qconv->get_padding_below())
, m_padding_above(qconv->get_padding_above())
, m_data_dilation_strides(qconv->get_data_dilation_strides())
, m_with_relu(with_relu)
{
constructor_validate_and_infer_types();
this->m_input_min = qconv->get_input_min();
this->m_input_max = qconv->get_input_max();
this->m_filter_min = qconv->get_filter_min();
this->m_filter_max = qconv->get_filter_max();
this->m_freezed_output_min = qconv->get_freezed_output_min();
this->m_freezed_output_max = qconv->get_freezed_output_max();
util::validate_convbias_shapes(qconv->get_argument(0)->get_shape(),
qconv->get_argument(1)->get_shape(),
bias->get_shape());
auto output_et = with_relu ? element::u8 : element::i8;
set_output_size(3);
set_output_type(0, output_et, qconv->get_shape());
set_output_type(1, element::f32, Shape{1});
set_output_type(2, element::f32, Shape{1});
}
op::QuantizedConvolutionBias::QuantizedConvolutionBias(
const shared_ptr<Node>& data_batch,
const shared_ptr<Node>& filters,
const shared_ptr<Node>& bias,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const CoordinateDiff& padding_below,
const CoordinateDiff& padding_above,
const Strides& data_dilation_strides,
const std::shared_ptr<Node> min_input,
const std::shared_ptr<Node> max_input,
const std::shared_ptr<Node> min_filter,
const std::shared_ptr<Node> max_filter,
const std::shared_ptr<Node> min_freezed_output,
const std::shared_ptr<Node> max_freezed_output,
const bool with_relu)
: Op("QuantizedConvolutionBias",
check_single_output_args({data_batch,
filters,
bias,
min_input,
max_input,
min_filter,
max_filter,
min_freezed_output,
max_freezed_output}))
op::QuantizedConvolutionBias::QuantizedConvolutionBias(const shared_ptr<Node>& data_batch,
const shared_ptr<Node>& filters,
const shared_ptr<Node>& bias,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const CoordinateDiff& padding_below,
const CoordinateDiff& padding_above,
const Strides& data_dilation_strides,
const std::shared_ptr<Node> scale,
const bool with_relu)
: Op("QuantizedConvolutionBias", check_single_output_args({data_batch, filters, bias, scale}))
, m_window_movement_strides(window_movement_strides)
, m_window_dilation_strides(window_dilation_strides)
, m_padding_below(padding_below)
......@@ -106,31 +49,14 @@ op::QuantizedConvolutionBias::QuantizedConvolutionBias(
auto& data_batch_shape = data_batch->get_shape();
auto& filters_shape = filters->get_shape();
auto min_input_const_op = std::static_pointer_cast<ngraph::op::Constant>(min_input);
auto max_input_const_op = std::static_pointer_cast<ngraph::op::Constant>(max_input);
auto min_filter_const_op = std::static_pointer_cast<ngraph::op::Constant>(min_filter);
auto max_filter_const_op = std::static_pointer_cast<ngraph::op::Constant>(max_filter);
auto min_freezed_output_const_op =
std::static_pointer_cast<ngraph::op::Constant>(min_freezed_output);
auto max_freezed_output_const_op =
std::static_pointer_cast<ngraph::op::Constant>(max_freezed_output);
float input_min = *(static_cast<float const*>(min_input_const_op->get_data_ptr()));
float input_max = *(static_cast<float const*>(max_input_const_op->get_data_ptr()));
float filter_min = *(static_cast<float const*>(min_filter_const_op->get_data_ptr()));
float filter_max = *(static_cast<float const*>(max_filter_const_op->get_data_ptr()));
float output_min = *(static_cast<float const*>(min_freezed_output_const_op->get_data_ptr()));
float output_max = *(static_cast<float const*>(max_freezed_output_const_op->get_data_ptr()));
this->m_input_min = input_min;
this->m_input_max = input_max;
this->m_filter_min = filter_min;
this->m_filter_max = filter_max;
this->m_freezed_output_min = output_min;
this->m_freezed_output_max = output_max;
auto scale_const_op = std::static_pointer_cast<ngraph::op::Constant>(scale);
float scale_val = *(static_cast<float const*>(scale_const_op->get_data_ptr()));
this->m_scale = scale_val;
util::validate_convbias_shapes(data_batch_shape, filters_shape, bias->get_shape());
// TODO: call ngraph util
// util::validate_convbias_shapes(data_batch_shape, filters_shape, bias->get_shape());
auto output_et = with_relu ? element::u8 : element::i8;
set_output_size(3);
set_output_type(0,
output_et,
util::infer_convolution_output_shape(this,
......@@ -148,13 +74,11 @@ op::QuantizedConvolutionBias::QuantizedConvolutionBias(
0, /* batch_axis_result, */
1 /* output_channel_axis_result, */
));
set_output_type(1, element::f32, Shape{1});
set_output_type(2, element::f32, Shape{1});
}
shared_ptr<Node> op::QuantizedConvolutionBias::copy_with_new_args(const NodeVector& new_args) const
{
if (new_args.size() != 9)
if (new_args.size() != 4)
{
throw ngraph_error("Incorrect number of new arguments");
}
......@@ -168,10 +92,5 @@ shared_ptr<Node> op::QuantizedConvolutionBias::copy_with_new_args(const NodeVect
get_padding_above(),
get_data_dilation_strides(),
new_args.at(3),
new_args.at(4),
new_args.at(5),
new_args.at(6),
new_args.at(7),
new_args.at(8),
m_with_relu));
}
......@@ -16,8 +16,8 @@
#pragma once
#include "ngraph/op/experimental/quantized_conv.hpp"
#include "ngraph/op/op.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv.hpp"
namespace ngraph
{
......@@ -39,12 +39,7 @@ namespace ngraph
const CoordinateDiff& padding_below,
const CoordinateDiff& padding_above,
const Strides& data_dilation_strides,
const std::shared_ptr<Node> min_input,
const std::shared_ptr<Node> max_input,
const std::shared_ptr<Node> min_filter,
const std::shared_ptr<Node> max_filter,
const std::shared_ptr<Node> min_freezed_output,
const std::shared_ptr<Node> max_freezed_output,
const std::shared_ptr<Node> scale,
const bool with_relu = false);
const Strides& get_window_movement_strides() const { return m_window_movement_strides; }
......@@ -52,12 +47,7 @@ namespace ngraph
const CoordinateDiff& get_padding_below() const { return m_padding_below; }
const CoordinateDiff& get_padding_above() const { return m_padding_above; }
const Strides& get_data_dilation_strides() const { return m_data_dilation_strides; }
float get_input_min() const { return m_input_min; }
float get_input_max() const { return m_input_max; }
float get_filter_min() const { return m_filter_min; }
float get_filter_max() const { return m_filter_max; }
float get_freezed_output_min() const { return m_freezed_output_min; }
float get_freezed_output_max() const { return m_freezed_output_max; }
float get_scale() const { return m_scale; }
std::shared_ptr<Node> get_bias() { return get_argument(2); }
std::shared_ptr<Node> get_filters() { return get_argument(1); }
std::shared_ptr<Node> get_data_batch() { return get_argument(0); }
......@@ -72,12 +62,7 @@ namespace ngraph
CoordinateDiff m_padding_above;
Strides m_data_dilation_strides;
bool m_with_relu;
float m_input_min;
float m_input_max;
float m_filter_min;
float m_filter_max;
float m_freezed_output_min;
float m_freezed_output_max;
float m_scale;
};
}
}
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include <numeric>
#include "ngraph/op/constant.hpp"
#include "ngraph/op/experimental/quantized_conv_relu.hpp"
#include "ngraph/op/get_output_element.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_relu.hpp"
#include "ngraph/util.hpp"
using namespace std;
using namespace ngraph;
op::QuantizedConvolutionRelu::QuantizedConvolutionRelu(
const std::shared_ptr<op::QuantizedConvolution>& qconv)
: Op("QuantizedConvolutionRelu",
check_single_output_args({qconv->get_argument(0),
qconv->get_argument(1),
qconv->get_argument(2),
qconv->get_argument(3),
qconv->get_argument(4),
qconv->get_argument(5),
qconv->get_argument(6),
qconv->get_argument(7)}))
, m_window_movement_strides(qconv->get_window_movement_strides())
, m_window_dilation_strides(qconv->get_window_dilation_strides())
, m_padding_below(qconv->get_padding_below())
, m_padding_above(qconv->get_padding_above())
, m_data_dilation_strides(qconv->get_data_dilation_strides())
{
constructor_validate_and_infer_types();
this->m_input_min = qconv->get_input_min();
this->m_input_max = qconv->get_input_max();
this->m_filter_min = qconv->get_filter_min();
this->m_filter_max = qconv->get_filter_max();
this->m_freezed_output_min = qconv->get_freezed_output_min();
this->m_freezed_output_max = qconv->get_freezed_output_max();
set_output_size(3);
set_output_type(0, element::u8, qconv->get_shape());
set_output_type(1, element::f32, Shape{1});
set_output_type(2, element::f32, Shape{1});
}
op::QuantizedConvolutionRelu::QuantizedConvolutionRelu(
const std::shared_ptr<Node>& data_batch,
const std::shared_ptr<Node>& filters,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const CoordinateDiff& padding_below,
const CoordinateDiff& padding_above,
const Strides& data_dilation_strides,
const std::shared_ptr<Node> min_input,
const std::shared_ptr<Node> max_input,
const std::shared_ptr<Node> min_filter,
const std::shared_ptr<Node> max_filter,
const std::shared_ptr<Node> min_freezed_output,
const std::shared_ptr<Node> max_freezed_output)
: Op("QuantizedConvolutionRelu",
check_single_output_args({data_batch,
filters,
min_input,
max_input,
min_filter,
max_filter,
min_freezed_output,
max_freezed_output}))
op::QuantizedConvolutionRelu::QuantizedConvolutionRelu(const std::shared_ptr<Node>& data_batch,
const std::shared_ptr<Node>& filters,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const CoordinateDiff& padding_below,
const CoordinateDiff& padding_above,
const Strides& data_dilation_strides,
const std::shared_ptr<Node> scale)
: Op("QuantizedConvolutionRelu", check_single_output_args({data_batch, filters, scale}))
, m_window_movement_strides(window_movement_strides)
, m_window_dilation_strides(window_dilation_strides)
, m_padding_below(padding_below)
......@@ -90,28 +44,10 @@ op::QuantizedConvolutionRelu::QuantizedConvolutionRelu(
auto& data_batch_shape = data_batch->get_shape();
auto& filters_shape = filters->get_shape();
auto min_input_const_op = std::static_pointer_cast<ngraph::op::Constant>(min_input);
auto max_input_const_op = std::static_pointer_cast<ngraph::op::Constant>(max_input);
auto min_filter_const_op = std::static_pointer_cast<ngraph::op::Constant>(min_filter);
auto max_filter_const_op = std::static_pointer_cast<ngraph::op::Constant>(max_filter);
auto min_freezed_output_const_op =
std::static_pointer_cast<ngraph::op::Constant>(min_freezed_output);
auto max_freezed_output_const_op =
std::static_pointer_cast<ngraph::op::Constant>(max_freezed_output);
float input_min = *(static_cast<float const*>(min_input_const_op->get_data_ptr()));
float input_max = *(static_cast<float const*>(max_input_const_op->get_data_ptr()));
float filter_min = *(static_cast<float const*>(min_filter_const_op->get_data_ptr()));
float filter_max = *(static_cast<float const*>(max_filter_const_op->get_data_ptr()));
float output_min = *(static_cast<float const*>(min_freezed_output_const_op->get_data_ptr()));
float output_max = *(static_cast<float const*>(max_freezed_output_const_op->get_data_ptr()));
this->m_input_min = input_min;
this->m_input_max = input_max;
this->m_filter_min = filter_min;
this->m_filter_max = filter_max;
this->m_freezed_output_min = output_min;
this->m_freezed_output_max = output_max;
auto scale_const_op = std::static_pointer_cast<ngraph::op::Constant>(scale);
float scale_val = *(static_cast<float const*>(scale_const_op->get_data_ptr()));
this->m_scale = scale_val;
set_output_size(3);
set_output_type(0,
element::u8,
util::infer_convolution_output_shape(this,
......@@ -129,15 +65,12 @@ op::QuantizedConvolutionRelu::QuantizedConvolutionRelu(
0, /* batch_axis_result, */
1 /* output_channel_axis_result, */
));
set_output_type(1, element::f32, Shape{1});
set_output_type(2, element::f32, Shape{1});
}
std::shared_ptr<Node>
op::QuantizedConvolutionRelu::copy_with_new_args(const NodeVector& new_args) const
{
if (new_args.size() != 8)
if (new_args.size() != 3)
{
throw ngraph_error("Incorrect number of new arguments");
}
......@@ -149,10 +82,5 @@ std::shared_ptr<Node>
get_padding_below(),
get_padding_above(),
get_data_dilation_strides(),
new_args.at(2),
new_args.at(3),
new_args.at(4),
new_args.at(5),
new_args.at(6),
new_args.at(7)));
new_args.at(2)));
}
......@@ -16,9 +16,8 @@
#pragma once
#include "ngraph/op/experimental/quantized_conv.hpp"
#include "ngraph/op/op.hpp"
#include "ngraph/runtime/cpu/op/conv_bias.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv.hpp"
namespace ngraph
{
......@@ -37,24 +36,14 @@ namespace ngraph
const CoordinateDiff& padding_below,
const CoordinateDiff& padding_above,
const Strides& data_dilation_strides,
const std::shared_ptr<Node> min_input,
const std::shared_ptr<Node> max_input,
const std::shared_ptr<Node> min_filter,
const std::shared_ptr<Node> max_filter,
const std::shared_ptr<Node> min_freezed_output,
const std::shared_ptr<Node> max_freezed_output);
const std::shared_ptr<Node> scale);
const Strides& get_window_movement_strides() const { return m_window_movement_strides; }
const Strides& get_window_dilation_strides() const { return m_window_dilation_strides; }
const CoordinateDiff& get_padding_below() const { return m_padding_below; }
const CoordinateDiff& get_padding_above() const { return m_padding_above; }
const Strides& get_data_dilation_strides() const { return m_data_dilation_strides; }
float get_input_min() const { return m_input_min; }
float get_input_max() const { return m_input_max; }
float get_filter_min() const { return m_filter_min; }
float get_filter_max() const { return m_filter_max; }
float get_freezed_output_min() const { return m_freezed_output_min; }
float get_freezed_output_max() const { return m_freezed_output_max; }
float get_scale() const { return m_scale; }
std::shared_ptr<Node> get_filters() { return get_argument(1); }
std::shared_ptr<Node> get_data_batch() { return get_argument(0); }
virtual std::shared_ptr<Node>
......@@ -66,12 +55,7 @@ namespace ngraph
CoordinateDiff m_padding_below;
CoordinateDiff m_padding_above;
Strides m_data_dilation_strides;
float m_input_min;
float m_input_max;
float m_filter_min;
float m_filter_max;
float m_freezed_output_min;
float m_freezed_output_max;
float m_scale;
};
}
}
......@@ -26,10 +26,8 @@ op::QuantizedMaxPool::QuantizedMaxPool(const shared_ptr<Node>& arg,
const Shape& window_shape,
const Strides& window_movement_strides,
const Shape& padding_below,
const Shape& padding_above,
const shared_ptr<Node> min,
const shared_ptr<Node> max)
: Op("QuantizedMaxPool", check_single_output_args({arg, min, max}))
const Shape& padding_above)
: Op("QuantizedMaxPool", check_single_output_args({arg}))
, m_window_shape(window_shape)
, m_window_movement_strides(window_movement_strides)
, m_padding_below(padding_below)
......@@ -39,18 +37,7 @@ op::QuantizedMaxPool::QuantizedMaxPool(const shared_ptr<Node>& arg,
if (arg->get_element_type() != element::u8 && arg->get_element_type() != element::i8)
{
throw ngraph_error("Dequantization supported only for i8/u8!");
}
if (min->get_element_type() != max->get_element_type())
{
throw ngraph_error("Min's element type isn't equal to max's!");
}
if (!(std::dynamic_pointer_cast<op::Constant>(min) &&
std::dynamic_pointer_cast<op::Constant>(max)))
{
throw ngraph_error("Min and max have to be constants!");
throw ngraph_error("QuantizedMaxPool supported only for i8/u8!");
}
}
......@@ -171,11 +158,7 @@ void op::QuantizedMaxPool::validate_and_infer_types()
result_shape[1] = channel_count;
copy(output_item_shape.begin(), output_item_shape.end(), result_shape.begin() + 2);
set_output_size(3);
set_output_type(0, get_input_element_type(0), result_shape);
//TODO(nbpatel): Change to Shape{} once the mkldnn version is updated.
set_output_type(1, element::f32, Shape{1});
set_output_type(2, element::f32, Shape{1});
}
shared_ptr<Node> op::QuantizedMaxPool::copy_with_new_args(const NodeVector& new_args) const
......@@ -185,7 +168,5 @@ shared_ptr<Node> op::QuantizedMaxPool::copy_with_new_args(const NodeVector& new_
m_window_shape,
m_window_movement_strides,
m_padding_below,
m_padding_above,
new_args.at(1),
new_args.at(2));
m_padding_above);
}
......@@ -37,9 +37,7 @@ namespace ngraph
const Shape& window_shape,
const Strides& window_movement_strides,
const Shape& padding_below,
const Shape& padding_above,
const std::shared_ptr<Node> min,
const std::shared_ptr<Node> max);
const Shape& padding_above);
void validate_and_infer_types() override;
virtual std::shared_ptr<Node>
copy_with_new_args(const NodeVector& new_args) const override;
......
......@@ -40,8 +40,6 @@ set(SRC
builder/convert_layout.cpp
builder/quantized_conv.cpp
builder/convolution.cpp
builder/dequantize.cpp
builder/quantize.cpp
builder/dot.cpp
builder/function_call.cpp
builder/lstm.cpp
......@@ -85,18 +83,11 @@ set(SRC
op/group_conv.cpp
op/conv_bias.cpp
op/conv_relu.cpp
op/quantized_conv.cpp
op/convert_layout.cpp
op/dequantize.cpp
op/quantize.cpp
op/loop_kernel.cpp
op/lstm.cpp
op/matmul_bias.cpp
op/max_pool_with_indices.cpp
op/quantized_max_pool.cpp
op/quantized_avg_pool.cpp
op/quantized_conv_relu.cpp
op/quantized_conv_bias.cpp
op/rnn.cpp
op/sigmoid_mul.cpp
op/conv_add.cpp
......
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "ngraph/runtime/cpu/op/dequantize.hpp"
#include <vector>
#include "ngraph/op/constant.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/mkldnn_invoke.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
using namespace std;
using namespace ngraph;
namespace ngraph
{
    namespace runtime
    {
        namespace cpu
        {
            // DEX builder for DequantizeCPU: emits a functor that runs an
            // MKLDNN scaled reorder converting the quantized (i8/u8) input
            // tensor to the f32 output tensor. Throws if MKLDNN cannot be
            // used for this node.
            template <>
            void Builder::BUILDER_DECL(ngraph::op::DequantizeCPU)
            {
                if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
                {
                    auto& functors = external_function->get_functors();
                    // Tensor-data slots are taken by reference so the functor
                    // below always sees the pointers current at execution time.
                    auto& arg_tensor = external_function->get_tensor_data(args[0].get_name());
                    auto& out_tensor = external_function->get_tensor_data(out[0].get_name());
                    auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
                    auto input_desc = mkldnn_utils::get_input_mkldnn_md(node, 0);
                    auto result_desc = mkldnn_utils::get_output_mkldnn_md(node, 0);
                    // build_dequantization derives the scale from the node's
                    // constant min/max inputs and creates the reorder primitive.
                    size_t dequantize_index =
                        mkldnn_emitter->build_dequantization(node, input_desc, result_desc);
                    // deps[0]/deps[1] are the input/output memory primitives.
                    auto& deps = mkldnn_emitter->get_primitive_deps(dequantize_index);
                    auto functor = [&, dequantize_index](CPURuntimeContext* ctx) {
                        cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg_tensor);
                        cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], out_tensor);
                        cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, dequantize_index);
                    };
                    functors.emplace_back(functor);
                }
                else
                {
                    throw ngraph_error("unsupported parameters for DequantizeCPUOp via DEX");
                }
            }
            REGISTER_OP_BUILDER(DequantizeCPU);
        }
    }
}
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <vector>
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/mkldnn_invoke.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#include "ngraph/runtime/cpu/quantization_util.hpp"
using namespace std;
using namespace ngraph;
namespace ngraph
{
    namespace runtime
    {
        namespace cpu
        {
            // DEX builder for QuantizeCPU: emits a functor that quantizes the
            // f32 input tensor to i8/u8 via an MKLDNN scaled reorder, and also
            // writes the computed min/max of the represented range into the
            // op's two auxiliary f32 scalar outputs.
            template <>
            void Builder::BUILDER_DECL(ngraph::op::QuantizeCPU)
            {
                if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
                {
                    auto quantize = static_cast<const ngraph::op::QuantizeCPU*>(node);
                    auto& functors = external_function->get_functors();
                    auto& arg_tensor = external_function->get_tensor_data(args[0].get_name());
                    // Output 0 is the quantized tensor; outputs 1 and 2 receive
                    // the min/max range scalars.
                    auto& out_tensor = external_function->get_tensor_data(out[0].get_name());
                    auto& out1_tensor = external_function->get_tensor_data(out[1].get_name());
                    auto& out2_tensor = external_function->get_tensor_data(out[2].get_name());
                    auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
                    auto input_desc = mkldnn_utils::get_input_mkldnn_md(node, 0);
                    auto result_desc = mkldnn_utils::get_output_mkldnn_md(node, 0);
                    vector<float> quant_util; // min_range, max_range & scale.
                    quantization_util::get_min_max_range(quantize->get_input_min(),
                                                         quantize->get_input_max(),
                                                         (quantize->get_quantize_et()).is_signed(),
                                                         quant_util);
                    // Only the scale (quant_util[2]) feeds the reorder; the
                    // min/max are emitted at run time below.
                    std::vector<float> scales;
                    scales.push_back(quant_util[2]);
                    size_t quantize_index =
                        mkldnn_emitter->build_quantize_reorder(input_desc, result_desc, scales);
                    auto& deps = mkldnn_emitter->get_primitive_deps(quantize_index);
                    // quant_util is captured by value: it is a local that dies
                    // when this builder returns.
                    auto functor = [&, quantize_index, quant_util](CPURuntimeContext* ctx) {
                        cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg_tensor);
                        cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], out_tensor);
                        *(static_cast<float*>(out1_tensor)) = quant_util[0];
                        *(static_cast<float*>(out2_tensor)) = quant_util[1];
                        cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, quantize_index);
                    };
                    functors.emplace_back(functor);
                }
                else
                {
                    throw ngraph_error("Unsupported parameters for QuantizeCPUOp via DEX");
                }
            }
            REGISTER_OP_BUILDER(QuantizeCPU);
        }
    }
}
......@@ -14,7 +14,7 @@
// limitations under the License.
//*****************************************************************************
#include "ngraph/runtime/cpu/op/quantized_avg_pool.hpp"
#include "ngraph/op/experimental/quantized_avg_pool.hpp"
#include "ngraph/op/constant.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/mkldnn_invoke.hpp"
......@@ -37,20 +37,15 @@ namespace ngraph
auto& functors = external_function->get_functors();
auto& arg_tensor = external_function->get_tensor_data(args[0].get_name());
auto& out_tensor = external_function->get_tensor_data(out[0].get_name());
auto& out1_tensor = external_function->get_tensor_data(out[1].get_name());
auto& out2_tensor = external_function->get_tensor_data(out[2].get_name());
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
vector<float> quant_util;
mkldnn_emitter->build_quantized_avg_pool(node, quant_util);
auto& deps = mkldnn_emitter->get_primitive_deps(quant_util[2]);
size_t qavg_pool_index = mkldnn_emitter->build_quantized_avg_pool(node);
auto& deps = mkldnn_emitter->get_primitive_deps(qavg_pool_index);
auto functor = [&, quant_util](CPURuntimeContext* ctx) {
auto functor = [&, qavg_pool_index](CPURuntimeContext* ctx) {
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], out_tensor);
*(static_cast<float*>(out1_tensor)) = quant_util[0];
*(static_cast<float*>(out2_tensor)) = quant_util[1];
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, quant_util[2]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, qavg_pool_index);
};
functors.emplace_back(functor);
}
......
......@@ -14,13 +14,13 @@
// limitations under the License.
//*****************************************************************************
#include "ngraph/runtime/cpu/op/quantized_conv.hpp"
#include "ngraph/op/experimental/quantized_conv.hpp"
#include "ngraph/op/constant.hpp"
#include "ngraph/op/experimental/quantized_conv_bias.hpp"
#include "ngraph/op/experimental/quantized_conv_relu.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/mkldnn_invoke.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_bias.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_relu.hpp"
using namespace std;
using namespace ngraph;
......@@ -36,13 +36,10 @@ namespace ngraph
{
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
auto qconvolution = static_cast<const ngraph::op::QuantizedConvolution*>(node);
auto& functors = external_function->get_functors();
auto& arg0_tensor = external_function->get_tensor_data(args[0].get_name());
auto& arg1_tensor = external_function->get_tensor_data(args[1].get_name());
auto& out0_tensor = external_function->get_tensor_data(out[0].get_name());
auto& out1_tensor = external_function->get_tensor_data(out[1].get_name());
auto& out2_tensor = external_function->get_tensor_data(out[2].get_name());
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
......@@ -50,16 +47,11 @@ namespace ngraph
mkldnn_emitter->build_convolution<ngraph::op::QuantizedConvolution>(
node, args, out);
auto& deps = mkldnn_emitter->get_primitive_deps(conv_index);
float min_freezed_output = qconvolution->get_freezed_output_min();
float max_freezed_output = qconvolution->get_freezed_output_max();
auto functor = [&, conv_index, min_freezed_output, max_freezed_output](
CPURuntimeContext* ctx) {
auto functor = [&, conv_index](CPURuntimeContext* ctx) {
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], arg1_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[2], out0_tensor);
*(static_cast<float*>(out1_tensor)) = min_freezed_output;
*(static_cast<float*>(out2_tensor)) = max_freezed_output;
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, conv_index);
};
functors.emplace_back(functor);
......@@ -75,14 +67,10 @@ namespace ngraph
{
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
auto qconvolution_relu =
static_cast<const ngraph::op::QuantizedConvolutionRelu*>(node);
auto& functors = external_function->get_functors();
auto& arg0_tensor = external_function->get_tensor_data(args[0].get_name());
auto& arg1_tensor = external_function->get_tensor_data(args[1].get_name());
auto& out0_tensor = external_function->get_tensor_data(out[0].get_name());
auto& out1_tensor = external_function->get_tensor_data(out[1].get_name());
auto& out2_tensor = external_function->get_tensor_data(out[2].get_name());
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
......@@ -90,16 +78,11 @@ namespace ngraph
mkldnn_emitter->build_convolution<ngraph::op::QuantizedConvolutionRelu>(
node, args, out);
auto& deps = mkldnn_emitter->get_primitive_deps(conv_index);
float min_freezed_output = qconvolution_relu->get_freezed_output_min();
float max_freezed_output = qconvolution_relu->get_freezed_output_max();
auto functor = [&, conv_index, min_freezed_output, max_freezed_output](
CPURuntimeContext* ctx) {
auto functor = [&, conv_index](CPURuntimeContext* ctx) {
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], arg1_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[2], out0_tensor);
*(static_cast<float*>(out1_tensor)) = min_freezed_output;
*(static_cast<float*>(out2_tensor)) = max_freezed_output;
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, conv_index);
};
functors.emplace_back(functor);
......@@ -116,15 +99,11 @@ namespace ngraph
{
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
auto qconvolution_bias =
static_cast<const ngraph::op::QuantizedConvolutionBias*>(node);
auto& functors = external_function->get_functors();
auto& arg0_tensor = external_function->get_tensor_data(args[0].get_name());
auto& arg1_tensor = external_function->get_tensor_data(args[1].get_name());
auto& arg2_tensor = external_function->get_tensor_data(args[2].get_name());
auto& out0_tensor = external_function->get_tensor_data(out[0].get_name());
auto& out1_tensor = external_function->get_tensor_data(out[1].get_name());
auto& out2_tensor = external_function->get_tensor_data(out[2].get_name());
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
......@@ -132,17 +111,12 @@ namespace ngraph
mkldnn_emitter->build_convolution<ngraph::op::QuantizedConvolutionBias>(
node, args, out);
auto& deps = mkldnn_emitter->get_primitive_deps(conv_index);
float min_freezed_output = qconvolution_bias->get_freezed_output_min();
float max_freezed_output = qconvolution_bias->get_freezed_output_max();
auto functor = [&, conv_index, min_freezed_output, max_freezed_output](
CPURuntimeContext* ctx) {
auto functor = [&, conv_index](CPURuntimeContext* ctx) {
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], arg1_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[2], arg2_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[3], out0_tensor);
*(static_cast<float*>(out1_tensor)) = min_freezed_output;
*(static_cast<float*>(out2_tensor)) = max_freezed_output;
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, conv_index);
};
functors.emplace_back(functor);
......
......@@ -14,7 +14,7 @@
// limitations under the License.
//*****************************************************************************
#include "ngraph/runtime/cpu/op/quantized_max_pool.hpp"
#include "ngraph/op/experimental/quantized_max_pool.hpp"
#include "ngraph/op/constant.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/mkldnn_invoke.hpp"
......@@ -37,21 +37,16 @@ namespace ngraph
auto& functors = external_function->get_functors();
auto& arg_tensor = external_function->get_tensor_data(args[0].get_name());
auto& out_tensor = external_function->get_tensor_data(out[0].get_name());
auto& out1_tensor = external_function->get_tensor_data(out[1].get_name());
auto& out2_tensor = external_function->get_tensor_data(out[2].get_name());
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
vector<float> quant_util;
mkldnn_emitter->build_quantized_max_pool(node, quant_util);
auto& deps = mkldnn_emitter->get_primitive_deps(quant_util[2]);
size_t qmax_pool_index = mkldnn_emitter->build_quantized_max_pool(node);
auto& deps = mkldnn_emitter->get_primitive_deps(qmax_pool_index);
auto functor = [&, quant_util](CPURuntimeContext* ctx) {
auto functor = [&, qmax_pool_index](CPURuntimeContext* ctx) {
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], out_tensor);
*(static_cast<float*>(out1_tensor)) = quant_util[0];
*(static_cast<float*>(out2_tensor)) = quant_util[1];
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, quant_util[2]);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, qmax_pool_index);
};
functors.emplace_back(functor);
}
......
This diff is collapsed.
......@@ -70,6 +70,11 @@
#include "ngraph/op/dot.hpp"
#include "ngraph/op/equal.hpp"
#include "ngraph/op/exp.hpp"
#include "ngraph/op/experimental/quantized_avg_pool.hpp"
#include "ngraph/op/experimental/quantized_conv.hpp"
#include "ngraph/op/experimental/quantized_conv_bias.hpp"
#include "ngraph/op/experimental/quantized_conv_relu.hpp"
#include "ngraph/op/experimental/quantized_max_pool.hpp"
#include "ngraph/op/floor.hpp"
#include "ngraph/op/function_call.hpp"
#include "ngraph/op/get_output_element.hpp"
......@@ -146,18 +151,11 @@
#include "ngraph/runtime/cpu/op/conv_bias.hpp"
#include "ngraph/runtime/cpu/op/conv_relu.hpp"
#include "ngraph/runtime/cpu/op/convert_layout.hpp"
#include "ngraph/runtime/cpu/op/dequantize.hpp"
#include "ngraph/runtime/cpu/op/group_conv.hpp"
#include "ngraph/runtime/cpu/op/loop_kernel.hpp"
#include "ngraph/runtime/cpu/op/lstm.hpp"
#include "ngraph/runtime/cpu/op/matmul_bias.hpp"
#include "ngraph/runtime/cpu/op/max_pool_with_indices.hpp"
#include "ngraph/runtime/cpu/op/quantize.hpp"
#include "ngraph/runtime/cpu/op/quantized_avg_pool.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_bias.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_relu.hpp"
#include "ngraph/runtime/cpu/op/quantized_max_pool.hpp"
#include "ngraph/runtime/cpu/op/rnn.hpp"
#include "ngraph/runtime/cpu/op/sigmoid.hpp"
#include "ngraph/runtime/cpu/op/sigmoid_mul.hpp"
......@@ -304,8 +302,6 @@ static const runtime::cpu::OpMap dispatcher{
{TI(ngraph::op::Ceiling), &runtime::cpu::CPU_Emitter::emit<op::Ceiling>},
{TI(ngraph::op::Sqrt), &runtime::cpu::CPU_Emitter::emit<op::Sqrt>},
{TI(ngraph::op::Convolution), &runtime::cpu::CPU_Emitter::emit<op::Convolution>},
{TI(ngraph::op::QuantizeCPU), &runtime::cpu::CPU_Emitter::emit<op::QuantizeCPU>},
{TI(ngraph::op::DequantizeCPU), &runtime::cpu::CPU_Emitter::emit<op::DequantizeCPU>},
{TI(ngraph::op::ConvolutionBackpropFilters),
&runtime::cpu::CPU_Emitter::emit<op::ConvolutionBackpropFilters>},
{TI(ngraph::op::ConvolutionBackpropData),
......
......@@ -20,13 +20,12 @@
#include "mkldnn_emitter.hpp"
#include "ngraph/op/constant.hpp"
#include "ngraph/op/experimental/quantized_avg_pool.hpp"
#include "ngraph/op/experimental/quantized_max_pool.hpp"
#include "ngraph/runtime/cpu/cpu_layout_descriptor.hpp"
#include "ngraph/runtime/cpu/cpu_tensor_view_wrapper.hpp"
#include "ngraph/runtime/cpu/mkldnn_invoke.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#include "ngraph/runtime/cpu/op/dequantize.hpp"
#include "ngraph/runtime/cpu/op/quantized_avg_pool.hpp"
#include "ngraph/runtime/cpu/op/quantized_max_pool.hpp"
#include "ngraph/type/element_type.hpp"
using namespace ngraph::runtime::cpu;
......@@ -123,31 +122,7 @@ size_t MKLDNNEmitter::build_memory_primitive(const mkldnn::memory::desc& desc)
return index;
}
// Builds an MKLDNN scaled-reorder primitive that dequantizes an i8/u8 tensor
// to f32. The scale is derived from the DequantizeCPU node's constant min/max
// inputs: scale = max(|min|, |max|) / target_range, where target_range is 127
// for signed and 255 for unsigned quantized types.
// Returns the primitive index of the underlying quantize-reorder.
size_t MKLDNNEmitter::build_dequantization(const ngraph::Node* node,
                                           const mkldnn::memory::desc& input_desc,
                                           const mkldnn::memory::desc& result_desc)
{
    auto dequantize = static_cast<const ngraph::op::DequantizeCPU*>(node);
    // Arguments 1 and 2 are validated to be Constants by the op's constructor.
    auto min_const_op = std::static_pointer_cast<ngraph::op::Constant>(dequantize->get_argument(1));
    auto max_const_op = std::static_pointer_cast<ngraph::op::Constant>(dequantize->get_argument(2));
    float min_range = *(static_cast<float const*>(min_const_op->get_data_ptr()));
    float max_range = *(static_cast<float const*>(max_const_op->get_data_ptr()));
    const float max_abs = std::max(std::abs(min_range), std::abs(max_range));
    const bool is_signed = (dequantize->get_dequantize_et()).is_signed();
    // 2^7 - 1 = 127 for i8, 2^8 - 1 = 255 for u8; spelled as constants rather
    // than the original's runtime std::pow calls.
    const float target_range = is_signed ? 127.0f : 255.0f;
    const float scale_factor = max_abs / target_range;
    std::vector<float> scales;
    scales.push_back(scale_factor);
    // Dequantization is just a scaled reorder from the quantized type to f32.
    return this->build_quantize_reorder(input_desc, result_desc, scales);
}
void MKLDNNEmitter::build_quantized_max_pool(const ngraph::Node* node,
std::vector<float>& quant_util)
size_t MKLDNNEmitter::build_quantized_max_pool(const ngraph::Node* node)
{
auto qmax_pool = static_cast<const ngraph::op::QuantizedMaxPool*>(node);
auto input_desc = mkldnn_utils::get_input_mkldnn_md(node, 0);
......@@ -159,17 +134,10 @@ void MKLDNNEmitter::build_quantized_max_pool(const ngraph::Node* node,
qmax_pool->get_window_shape(),
qmax_pool->get_padding_below(),
qmax_pool->get_padding_above());
auto min_const_op = std::static_pointer_cast<ngraph::op::Constant>(qmax_pool->get_argument(1));
auto max_const_op = std::static_pointer_cast<ngraph::op::Constant>(qmax_pool->get_argument(2));
float min = *(static_cast<float const*>(min_const_op->get_data_ptr()));
float max = *(static_cast<float const*>(max_const_op->get_data_ptr()));
quant_util.push_back(min);
quant_util.push_back(max);
quant_util.push_back(qmax_pool_index);
return qmax_pool_index;
}
void MKLDNNEmitter::build_quantized_avg_pool(const ngraph::Node* node,
std::vector<float>& quant_util)
size_t MKLDNNEmitter::build_quantized_avg_pool(const ngraph::Node* node)
{
auto qavg_pool = static_cast<const ngraph::op::QuantizedAvgPool*>(node);
auto input_desc = mkldnn_utils::get_input_mkldnn_md(node, 0);
......@@ -184,13 +152,7 @@ void MKLDNNEmitter::build_quantized_avg_pool(const ngraph::Node* node,
qavg_pool->get_window_shape(),
qavg_pool->get_padding_below(),
qavg_pool->get_padding_above());
auto min_const_op = std::static_pointer_cast<ngraph::op::Constant>(qavg_pool->get_argument(1));
auto max_const_op = std::static_pointer_cast<ngraph::op::Constant>(qavg_pool->get_argument(2));
float min = *(static_cast<float const*>(min_const_op->get_data_ptr()));
float max = *(static_cast<float const*>(max_const_op->get_data_ptr()));
quant_util.push_back(min);
quant_util.push_back(max);
quant_util.push_back(qavg_pool_index);
return qavg_pool_index;
}
mkldnn::memory::format MKLDNNEmitter::query_convolution_forward_weight_format(
......@@ -787,28 +749,6 @@ size_t MKLDNNEmitter::build_reorder(const mkldnn::memory::desc& input_desc,
return primitive_index;
}
// Creates an MKLDNN reorder primitive that applies the given output scales
// while converting between the input and result memory descriptors; this is
// the common engine for both quantization and dequantization.
// Returns the index of the new primitive; its deps are {input, result}.
size_t MKLDNNEmitter::build_quantize_reorder(const mkldnn::memory::desc& input_desc,
                                             const mkldnn::memory::desc& result_desc,
                                             const std::vector<float>& scales)
{
    size_t input_index = build_memory_primitive(input_desc);
    size_t result_index = build_memory_primitive(result_desc);
    mkldnn::primitive_attr attr;
    // Mask 0 => one common scale applied to the whole tensor.
    attr.set_output_scales(0, scales);
    attr.set_int_output_round_mode(mkldnn::round_mode::round_nearest);
    auto reorder_desc =
        mkldnn::reorder::primitive_desc({input_desc, mkldnn_utils::global_cpu_engine},
                                        {result_desc, mkldnn_utils::global_cpu_engine},
                                        attr);
    size_t primitive_index = insert_primitive(new mkldnn::reorder(
        reorder_desc, *m_mkldnn_primitives[input_index], *m_mkldnn_primitives[result_index]));
    // Record the memory primitives so the executor can bind data pointers.
    m_primitive_deps[primitive_index] = {input_index, result_index};
    return primitive_index;
}
size_t MKLDNNEmitter::build_lrn_forward(const mkldnn::memory::desc& input_desc,
const mkldnn::memory::desc& result_desc,
float alpha,
......
......@@ -26,16 +26,15 @@
#include "ngraph/coordinate_diff.hpp"
#include "ngraph/node.hpp"
#include "ngraph/op/convolution.hpp"
#include "ngraph/op/experimental/quantized_conv.hpp"
#include "ngraph/op/experimental/quantized_conv_bias.hpp"
#include "ngraph/op/experimental/quantized_conv_relu.hpp"
#include "ngraph/runtime/cpu/cpu_tensor_view_wrapper.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#include "ngraph/runtime/cpu/op/bounded_relu.hpp"
#include "ngraph/runtime/cpu/op/conv_add.hpp"
#include "ngraph/runtime/cpu/op/conv_bias.hpp"
#include "ngraph/runtime/cpu/op/conv_relu.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_bias.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_relu.hpp"
#include "ngraph/runtime/cpu/quantization_util.hpp"
#include "ngraph/shape.hpp"
#include "ngraph/strides.hpp"
#include "ngraph/type/element_type.hpp"
......@@ -227,8 +226,6 @@ namespace ngraph
}
else if (std::is_same<OP, ngraph::op::QuantizedConvolution>())
{
const float scale =
quantization_util::get_scale<ngraph::op::QuantizedConvolution>(node);
return build_quantized_convolution(
data_desc,
weights_desc,
......@@ -237,14 +234,12 @@ namespace ngraph
window_dilation_strides_adjusted,
convolution->get_padding_below(),
convolution->get_padding_above(),
scale,
(dynamic_cast<const ngraph::op::QuantizedConvolution*>(node))
->get_scale(),
ops);
}
else if (std::is_same<OP, ngraph::op::QuantizedConvolutionRelu>())
{
const float scale =
quantization_util::get_scale<ngraph::op::QuantizedConvolutionRelu>(
node);
return build_quantized_convolution(
data_desc,
weights_desc,
......@@ -253,15 +248,13 @@ namespace ngraph
window_dilation_strides_adjusted,
convolution->get_padding_below(),
convolution->get_padding_above(),
scale,
(dynamic_cast<const ngraph::op::QuantizedConvolutionRelu*>(node))
->get_scale(),
ops);
}
else if (std::is_same<OP, ngraph::op::QuantizedConvolutionBias>())
{
// conv+bias = cvt_to_int8(scale*(dst + bias))
const float scale =
quantization_util::get_scale<ngraph::op::QuantizedConvolutionBias>(
node);
auto bias_desc = mkldnn_utils::get_input_mkldnn_md(node, 2);
return build_quantized_convolution(
data_desc,
......@@ -272,7 +265,8 @@ namespace ngraph
window_dilation_strides_adjusted,
convolution->get_padding_below(),
convolution->get_padding_above(),
scale,
(dynamic_cast<const ngraph::op::QuantizedConvolutionBias*>(node))
->get_scale(),
ops);
}
else
......@@ -595,19 +589,9 @@ namespace ngraph
const mkldnn::memory::desc& result_desc,
float alpha);
size_t build_quantize_reorder(const mkldnn::memory::desc& input_desc,
const mkldnn::memory::desc& result_desc,
const std::vector<float>& scales);
size_t build_quantized_max_pool(const ngraph::Node* node);
size_t build_dequantization(const ngraph::Node* node,
const mkldnn::memory::desc& input_desc,
const mkldnn::memory::desc& result_desc);
void build_quantized_max_pool(const ngraph::Node* node,
std::vector<float>& quant_util);
void build_quantized_avg_pool(const ngraph::Node* node,
std::vector<float>& quant_util);
size_t build_quantized_avg_pool(const ngraph::Node* node);
private:
std::vector<mkldnn::primitive*> m_mkldnn_primitives;
......
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "ngraph/runtime/cpu/op/dequantize.hpp"
#include "ngraph/op/constant.hpp"
// CPU-specific dequantize op: converts an i8/u8 `input` tensor to f32 using
// the real-valued range given by the constant scalar `min`/`max` inputs.
// `type` records the quantized element type being decoded.
// Throws ngraph_error when any input violates the contract below.
ngraph::op::DequantizeCPU::DequantizeCPU(std::shared_ptr<Node> input,
                                         std::shared_ptr<Node> min,
                                         std::shared_ptr<Node> max,
                                         const element::Type& type)
    : Op("DequantizeCPU", check_single_output_args({input, min, max}))
    , m_element_type(type)
{
    constructor_validate_and_infer_types();
    if (input->get_element_type() != element::u8 && input->get_element_type() != element::i8)
    {
        throw ngraph_error("Dequantization supported only for i8/u8!");
    }
    // BUG FIX: the original compared min's element type to itself, so this
    // check could never fire. Compare min against max instead.
    if (min->get_element_type() != max->get_element_type())
    {
        throw ngraph_error("Min's element type isn't equal to max's!");
    }
    if (min->get_shape().size() != 0)
    {
        throw ngraph_error("Min is not a scalar!");
    }
    if (max->get_shape().size() != 0)
    {
        throw ngraph_error("Max is not a scalar!");
    }
    if (!(std::dynamic_pointer_cast<op::Constant>(min) &&
          std::dynamic_pointer_cast<op::Constant>(max)))
    {
        throw ngraph_error("Min and max have to be constants!");
    }
    // Single output: the dequantized f32 tensor, same shape as the input.
    set_output_type(0, element::f32, input->get_shape());
}
// Clones this op, rewired onto the three replacement inputs
// (data, min, max); the recorded element type is carried over.
std::shared_ptr<ngraph::Node>
    ngraph::op::DequantizeCPU::copy_with_new_args(const NodeVector& new_args) const
{
    constexpr size_t expected_arg_count = 3;
    if (new_args.size() != expected_arg_count)
    {
        throw ngraph_error("Incorrect number of new arguments");
    }
    const auto& data = new_args.at(0);
    const auto& min = new_args.at(1);
    const auto& max = new_args.at(2);
    return std::make_shared<DequantizeCPU>(data, min, max, m_element_type);
}
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include "ngraph/node.hpp"
#include "ngraph/node_vector.hpp"
#include "ngraph/op/op.hpp"
namespace ngraph
{
    namespace op
    {
        // CPU-specific dequantize op: turns a quantized tensor back into a
        // real-valued one. Inputs are (data, min, max); see the constructor
        // definition for the exact validity checks it enforces.
        class DequantizeCPU : public Op
        {
        public:
            // \param input quantized data tensor
            // \param min   scalar constant: lower bound of the real range
            // \param max   scalar constant: upper bound of the real range
            // \param type  element type the data was quantized to
            DequantizeCPU(std::shared_ptr<Node> input,
                          std::shared_ptr<Node> min,
                          std::shared_ptr<Node> max,
                          const element::Type& type);

            // Element type the data was quantized to, as passed at construction.
            const element::Type& get_dequantize_et() const { return m_element_type; }
            virtual std::shared_ptr<Node>
                copy_with_new_args(const NodeVector& new_args) const override;

        private:
            const element::Type m_element_type;
        };
    }
}
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "ngraph/runtime/cpu/op/quantize.hpp"
#include "ngraph/op/constant.hpp"
// Constructs a QuantizeCPU op: quantizes a float32 tensor to `type` (i8/u8)
// using the scalar constant range [min, max].
// Outputs: 0 = quantized tensor, 1 = min (f32 scalar), 2 = max (f32 scalar).
// Throws ngraph_error when the inputs violate the contract below.
ngraph::op::QuantizeCPU::QuantizeCPU(std::shared_ptr<Node> input,
std::shared_ptr<Node> min,
std::shared_ptr<Node> max,
const element::Type& type)
: Op("QuantizeCPU", check_single_output_args({input, min, max}))
, m_element_type(type)
{
constructor_validate_and_infer_types();
// Only f32 -> 8-bit quantization is supported.
if (input->get_element_type() != element::f32)
{
throw ngraph_error("Quantization supported only from float32 --> i8/u8!");
}
// BUGFIX: original compared min's element type with itself
// (min != min, always false), so mismatched min/max types were
// never rejected. Compare min against max as the message intends.
if (min->get_element_type() != max->get_element_type())
{
throw ngraph_error("Min's element type isn't equal to max's!");
}
// Range bounds must be scalars (rank 0).
if (min->get_shape().size() != 0)
{
throw ngraph_error("Min is not a scalar!");
}
if (max->get_shape().size() != 0)
{
throw ngraph_error("Max is not a scalar!");
}
// The range must be known at graph-construction time.
if (!(std::dynamic_pointer_cast<op::Constant>(min) &&
std::dynamic_pointer_cast<op::Constant>(max)))
{
throw ngraph_error("Min and max have to be constants!");
}
// Cache the constant range values for use by the backend kernels.
auto min_const_op = std::static_pointer_cast<ngraph::op::Constant>(min);
auto max_const_op = std::static_pointer_cast<ngraph::op::Constant>(max);
float input_min_range = *(static_cast<float const*>(min_const_op->get_data_ptr()));
float input_max_range = *(static_cast<float const*>(max_const_op->get_data_ptr()));
this->m_input_min = input_min_range;
this->m_input_max = input_max_range;
// Quantized data plus the (possibly adjusted) f32 range scalars.
set_output_size(3);
set_output_type(0, type, input->get_shape());
set_output_type(1, element::f32, Shape{});
set_output_type(2, element::f32, Shape{});
}
// Clones this op over a fresh set of arguments, preserving the target
// quantization element type.
std::shared_ptr<ngraph::Node>
ngraph::op::QuantizeCPU::copy_with_new_args(const NodeVector& new_args) const
{
// QuantizeCPU always takes exactly three inputs: data, min, max.
if (new_args.size() != 3)
{
throw ngraph_error("Incorrect number of new arguments");
}
auto& data = new_args.at(0);
auto& range_min = new_args.at(1);
auto& range_max = new_args.at(2);
return std::make_shared<QuantizeCPU>(data, range_min, range_max, m_element_type);
}
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include "ngraph/node.hpp"
#include "ngraph/node_vector.hpp"
#include "ngraph/op/op.hpp"
namespace ngraph
{
namespace op
{
// CPU-backend op quantizing a float32 tensor to an 8-bit type using a
// constant scalar [min, max] range. Produces three outputs: the quantized
// tensor plus the f32 min and max scalars (see the constructor definition).
class QuantizeCPU : public Op
{
public:
// \param input f32 tensor to quantize
// \param min   scalar constant: minimum of the real-valued range
// \param max   scalar constant: maximum of the real-valued range
// \param type  target quantized element type (i8/u8)
QuantizeCPU(std::shared_ptr<Node> input,
std::shared_ptr<Node> min,
std::shared_ptr<Node> max,
const element::Type& type);
// Target element type recorded at construction.
const element::Type& get_quantize_et() const { return m_element_type; }
// Constant range values cached from the min/max inputs.
float get_input_min() const { return m_input_min; }
float get_input_max() const { return m_input_max; }
virtual std::shared_ptr<Node>
copy_with_new_args(const NodeVector& new_args) const override;
private:
const element::Type m_element_type;
float m_input_min;
float m_input_max;
};
}
}
......@@ -30,6 +30,11 @@
#include "ngraph/op/batch_norm.hpp"
#include "ngraph/op/concat.hpp"
#include "ngraph/op/convolution.hpp"
#include "ngraph/op/experimental/quantized_avg_pool.hpp"
#include "ngraph/op/experimental/quantized_conv.hpp"
#include "ngraph/op/experimental/quantized_conv_bias.hpp"
#include "ngraph/op/experimental/quantized_conv_relu.hpp"
#include "ngraph/op/experimental/quantized_max_pool.hpp"
#include "ngraph/op/lrn.hpp"
#include "ngraph/op/max_pool.hpp"
#include "ngraph/op/relu.hpp"
......@@ -43,16 +48,9 @@
#include "ngraph/runtime/cpu/op/conv_add.hpp"
#include "ngraph/runtime/cpu/op/conv_bias.hpp"
#include "ngraph/runtime/cpu/op/conv_relu.hpp"
#include "ngraph/runtime/cpu/op/dequantize.hpp"
#include "ngraph/runtime/cpu/op/group_conv.hpp"
#include "ngraph/runtime/cpu/op/lstm.hpp"
#include "ngraph/runtime/cpu/op/max_pool_with_indices.hpp"
#include "ngraph/runtime/cpu/op/quantize.hpp"
#include "ngraph/runtime/cpu/op/quantized_avg_pool.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_bias.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_relu.hpp"
#include "ngraph/runtime/cpu/op/quantized_max_pool.hpp"
#include "ngraph/runtime/cpu/op/rnn.hpp"
#include "ngraph/runtime/cpu/op/sigmoid.hpp"
......@@ -733,27 +731,14 @@ namespace ngraph
}
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::DequantizeCPU)
{
// Route dequantization through MKLDNN only for 8-bit (u8/i8) inputs;
// other element types fall back to the default implementation.
if (node->get_input_element_type(0) == element::u8 ||
node->get_input_element_type(0) == element::i8)
{
auto dequantize = static_cast<op::DequantizeCPU*>(node);
auto op_annotations =
std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
op_annotations->set_mkldnn_op(true);
dequantize->set_op_annotations(op_annotations);
}
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::QuantizedConvolution)
{
auto quantized_conv = static_cast<op::QuantizedConvolution*>(node);
if (node->get_input_element_type(0) == element::u8 &&
node->get_input_element_type(1) == element::i8)
{
auto quantized_conv = static_cast<op::QuantizedConvolution*>(node);
auto op_annotations =
std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
op_annotations->set_mkldnn_op(true);
......@@ -764,10 +749,11 @@ namespace ngraph
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::QuantizedConvolutionRelu)
{
auto quantized_conv_relu = static_cast<op::QuantizedConvolutionRelu*>(node);
if (node->get_input_element_type(0) == element::u8 &&
node->get_input_element_type(1) == element::i8)
{
auto quantized_conv_relu = static_cast<op::QuantizedConvolutionRelu*>(node);
auto op_annotations =
std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
op_annotations->set_mkldnn_op(true);
......@@ -775,26 +761,14 @@ namespace ngraph
}
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::QuantizeCPU)
{
// Route quantization through MKLDNN only when the input is f32
// (the only source type QuantizeCPU supports).
if (node->get_input_element_type(0) == element::f32)
{
auto quantize = static_cast<op::QuantizeCPU*>(node);
auto op_annotations =
std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
op_annotations->set_mkldnn_op(true);
quantize->set_op_annotations(op_annotations);
}
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::QuantizedConvolutionBias)
{
auto quantized_conv_bias = static_cast<op::QuantizedConvolutionBias*>(node);
if (node->get_input_element_type(0) == element::u8 &&
node->get_input_element_type(1) == element::i8)
{
auto quantized_conv_bias = static_cast<op::QuantizedConvolutionBias*>(node);
auto op_annotations =
std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
op_annotations->set_mkldnn_op(true);
......@@ -861,14 +835,10 @@ static const runtime::cpu::pass::AssignOpMap s_dispatcher{
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::QuantizedAvgPool>},
{TI(ngraph::op::Softmax), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Softmax>},
{TI(ngraph::op::Slice), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Slice>},
{TI(ngraph::op::QuantizeCPU),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::QuantizeCPU>},
{TI(ngraph::op::ReplaceSlice),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::ReplaceSlice>},
{TI(ngraph::op::ConvolutionAdd),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::ConvolutionAdd>},
{TI(ngraph::op::DequantizeCPU),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::DequantizeCPU>},
{TI(ngraph::op::QuantizedConvolutionRelu),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::QuantizedConvolutionRelu>},
{TI(ngraph::op::QuantizedConvolutionBias),
......
This diff is collapsed.
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#pragma once
#include <limits>
#include <vector>
#include "ngraph/node.hpp"
#include "ngraph/op/constant.hpp"
#include "ngraph/runtime/cpu/op/quantize.hpp"
#include "ngraph/util.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace quantization_util
{
// Computes the real-valued output range [*min_c, *max_c] of multiplying a
// T1-quantized value in [min_a, max_a] by a T2-quantized value in
// [min_b, max_b], accumulated into T3.
// Adapted from TensorFlow's quantization_utils.h.
template <class T1, class T2, class T3>
void quantization_range_for_multiplication(
    float min_a, float max_a, float min_b, float max_b, float* min_c, float* max_c)
{
    // Real value represented by one quantization step of each input.
    const float a_step = (max_a - min_a) /
                         (std::numeric_limits<T1>::max() - std::numeric_limits<T1>::min());
    const float b_step = (max_b - min_b) /
                         (std::numeric_limits<T2>::max() - std::numeric_limits<T2>::min());
    // One output step spans the product of one step from each input.
    const float c_step = a_step * b_step;
    *min_c = c_step * std::numeric_limits<T3>::min();
    *max_c = c_step * std::numeric_limits<T3>::max();
}
// Derives a usable quantization range and scale from a requested
// [input_min_range, input_max_range]. Appends three values to quant_util:
// adjusted min, adjusted max, and the quantization scale.
// Range-sanitizing logic adapted from TensorFlow's quantize_op.cc.
static inline void get_min_max_range(float input_min_range,
                                     float input_max_range,
                                     bool is_signed,
                                     std::vector<float>& quant_util)
{
    // The range must contain zero; widen degenerate ranges slightly so the
    // scale below is never a division by zero.
    const float lo = std::min(0.0f, input_min_range);
    const float epsilon =
        std::max(1.0f, std::max(fabsf(input_min_range), fabsf(input_max_range))) / 100.0f;
    const float hi = std::max(0.0f, std::max(input_max_range, lo + epsilon));
    // Symmetrize around zero: signed types use [-max_abs, max_abs],
    // unsigned types use [0, max_abs].
    const float max_abs = std::max(std::abs(lo), std::abs(hi));
    // Number of representable positive levels: 2^7-1 (i8) or 2^8-1 (u8).
    const float target_range =
        static_cast<float>((is_signed ? std::pow(2, 7) : std::pow(2, 8)) - 1);
    quant_util.push_back(is_signed ? -max_abs : 0); // adjusted min
    quant_util.push_back(max_abs);                  // adjusted max
    quant_util.push_back(target_range / max_abs);   // scale
}
// Computes the requantization scale converting the op's s32 accumulator
// output to its frozen s8 output range.
// OP must expose get_input_min/max, get_filter_min/max and
// get_freezed_output_min/max (the quantized convolution ops).
template <typename OP>
float get_scale(const ngraph::Node* node)
{
    auto qconv = static_cast<const OP*>(node);
    // Real-valued range implied for the 32-bit accumulator by the
    // input and filter quantization ranges.
    float accum_min;
    float accum_max;
    quantization_range_for_multiplication<uint8_t, int8_t, int32_t>(
        qconv->get_input_min(),
        qconv->get_input_max(),
        qconv->get_filter_min(),
        qconv->get_filter_max(),
        &accum_min,
        &accum_max);
    const float max_abs32 = std::max(std::abs(accum_min), std::abs(accum_max));
    const float max_abs8 = std::max(std::abs(qconv->get_freezed_output_min()),
                                    std::abs(qconv->get_freezed_output_max()));
    // Output is signed int:
    //   s32 = f32 * 2^31 / max_abs32
    //   s8  = f32 * 2^7  / max_abs8
    //   =>  s8 = s32 * 2^-24 * max_abs32 / max_abs8
    return static_cast<float>(
        std::pow(2, -24) * static_cast<double>(max_abs32 / max_abs8));
}
}
}
}
}
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment