Commit 8bd3846f authored by Adam Straw, committed by Robert Kimball

graph builders for quantize scale (#1976)

* quantize scale passing unit tests

* epsilon bump

* finished with quantization scale

* unit tests passing with convolution scale as builder

* broadcasted constants and cleanup

* api consistency for quant builders

* code style

* cleanup

* newline at EOF

* use requantization_scale

* drop TF license as we are no longer using TF code directly
parent 2a26558a
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include "ngraph/node.hpp"
#include "ngraph/op/broadcast.hpp"
#include "ngraph/op/constant.hpp"
namespace ngraph
{
namespace builder
{
// Helper for make_constant: builds a rank-0 (scalar) Constant node of
// element type `type`, converting `num` to the element's C++ type `U`.
template <class U, class T>
std::shared_ptr<Node> make_scalar_constant(const element::Type& type, const T& num)
{
    return std::make_shared<ngraph::op::Constant>(
        type, ngraph::Shape{}, std::vector<U>{static_cast<U>(num)});
}

/// \brief Create a Constant node of element type `type` holding the value
///        `num`, broadcast to `shape`.
///
/// \param type  Element type of the resulting constant.
/// \param shape Target shape; an empty shape yields a scalar constant.
/// \param num   Value to store (cast to the C++ type matching `type`).
/// \return A Constant node, wrapped in a Broadcast when `shape` is non-empty.
/// \throws ngraph_error when `type` is not a supported numeric element type.
template <class T>
std::shared_ptr<Node>
    make_constant(const element::Type& type, const Shape& shape, const T& num)
{
    std::shared_ptr<Node> val = nullptr;
    if (type == element::f32)
    {
        val = make_scalar_constant<float>(type, num);
    }
    else if (type == element::f64)
    {
        val = make_scalar_constant<double>(type, num);
    }
    else if (type == element::i64)
    {
        val = make_scalar_constant<int64_t>(type, num);
    }
    else if (type == element::i32)
    {
        val = make_scalar_constant<int32_t>(type, num);
    }
    else if (type == element::i16)
    {
        val = make_scalar_constant<int16_t>(type, num);
    }
    else if (type == element::i8)
    {
        val = make_scalar_constant<int8_t>(type, num);
    }
    else if (type == element::u64)
    {
        val = make_scalar_constant<uint64_t>(type, num);
    }
    else if (type == element::u32)
    {
        val = make_scalar_constant<uint32_t>(type, num);
    }
    else if (type == element::u16)
    {
        val = make_scalar_constant<uint16_t>(type, num);
    }
    else if (type == element::u8)
    {
        val = make_scalar_constant<uint8_t>(type, num);
    }
    else
    {
        throw ngraph_error("make_constant: Unsupported element type");
    }

    // A non-empty target shape means the caller wants a broadcasted
    // constant: broadcast the scalar along every axis of `shape`.
    if (shape.size() > 0)
    {
        ngraph::AxisSet axes;
        for (size_t i = 0; i < shape.size(); i++)
        {
            axes.insert(i);
        }
        val = std::make_shared<ngraph::op::Broadcast>(val, shape, axes);
    }
    return val;
}
}
}
......@@ -16,6 +16,7 @@
#include <memory>
#include "ngraph/builder/make_constant.hpp"
#include "ngraph/builder/quantization.hpp"
#include "ngraph/op/constant.hpp"
#include "quantization_util.hpp"
......@@ -30,73 +31,72 @@ namespace ngraph
std::shared_ptr<Node> ScaledQuantize(std::shared_ptr<Node> input,
std::shared_ptr<Node> min,
std::shared_ptr<Node> max,
const ngraph::element::Type& type,
const ngraph::element::Type& quant_type,
const ngraph::AxisSet& axes,
op::Quantize::RoundMode round_mode)
{
auto offset = op::Constant::create(type, Shape{}, {0});
if (input->get_element_type() == element::f32)
auto real_type = input->get_element_type();
if (min->get_element_type() != real_type)
{
float scale =
builder::quantization_util::get_quantization_scale<float>(min, max, type, true);
auto quantize_scale =
op::Constant::create(input->get_element_type(), Shape{}, {scale});
return make_shared<op::Quantize>(
input, quantize_scale, offset, type, axes, round_mode);
throw ngraph_error("ScaledQuantize: min must match input type");
}
else if (input->get_element_type() == element::f64)
if (max->get_element_type() != real_type)
{
double scale = builder::quantization_util::get_quantization_scale<double>(
min, max, type, true);
auto quantize_scale =
op::Constant::create(input->get_element_type(), Shape{}, {scale});
return make_shared<op::Quantize>(
input, quantize_scale, offset, type, axes, round_mode);
throw ngraph_error("ScaledQuantize: max must match input type");
}
else
auto shape = min->get_shape();
if (shape != max->get_shape())
{
throw ngraph_error("Unsupported quantization element type");
throw ngraph_error("ScaledQuantize: min and max must have same shape");
}
auto zero = make_constant(quant_type, shape, 0);
auto scale = quantization_util::get_scale(min, max, quant_type, true);
return make_shared<op::Quantize>(input, scale, zero, quant_type, axes, round_mode);
}
std::shared_ptr<Node> ScaledDequantize(std::shared_ptr<Node> input,
std::shared_ptr<Node> min,
std::shared_ptr<Node> max,
const ngraph::element::Type& type,
const ngraph::element::Type& real_type,
const ngraph::AxisSet& axes)
{
auto input_et = input->get_element_type();
auto offset = op::Constant::create(input_et, Shape{}, {0});
if (type == element::f32)
auto quant_type = input->get_element_type();
if (min->get_element_type() != real_type)
{
float scale =
builder::quantization_util::get_quantization_scale<float>(min, max, input_et);
auto dequantize_scale = op::Constant::create(type, Shape{}, {scale});
return make_shared<op::Dequantize>(input, dequantize_scale, offset, type, axes);
throw ngraph_error("ScaledDequantize: min must match output type");
}
else if (type == element::f64)
if (max->get_element_type() != real_type)
{
double scale =
builder::quantization_util::get_quantization_scale<double>(min, max, input_et);
auto dequantize_scale = op::Constant::create(type, Shape{}, {scale});
return make_shared<op::Dequantize>(input, dequantize_scale, offset, type, axes);
throw ngraph_error("ScaledDequantize: max must match output type");
}
else
auto shape = min->get_shape();
if (shape != max->get_shape())
{
throw ngraph_error("Unsupported dequantization element type");
throw ngraph_error("ScaledDequantize: min and max must have same shape");
}
auto zero = make_constant(quant_type, shape, 0);
auto scale = quantization_util::get_scale(min, max, quant_type);
return make_shared<op::Dequantize>(input, scale, zero, real_type, axes);
}
std::shared_ptr<Node> ScaledQuantizedAvgPool(const std::shared_ptr<Node>& arg,
std::shared_ptr<Node> ScaledQuantizedAvgPool(std::shared_ptr<Node> input,
const Shape& window_shape,
const Strides& window_movement_strides,
const Shape& padding_below,
const Shape& padding_above,
bool include_padding_in_avg_computation,
const std::shared_ptr<Node> min,
const std::shared_ptr<Node> max)
std::shared_ptr<Node> min,
std::shared_ptr<Node> max)
{
return make_shared<op::QuantizedAvgPool>(arg,
return make_shared<op::QuantizedAvgPool>(input,
window_shape,
window_movement_strides,
padding_below,
......@@ -105,31 +105,30 @@ namespace ngraph
}
std::shared_ptr<Node>
ScaledQuantizedConvolutionBias(const std::shared_ptr<Node>& data_batch,
const std::shared_ptr<Node>& filters,
const std::shared_ptr<Node>& bias,
ScaledQuantizedConvolutionBias(std::shared_ptr<Node> input,
std::shared_ptr<Node> filters,
std::shared_ptr<Node> bias,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const CoordinateDiff& padding_below,
const CoordinateDiff& padding_above,
const Strides& data_dilation_strides,
const std::shared_ptr<Node> min_input,
const std::shared_ptr<Node> max_input,
const std::shared_ptr<Node> min_filter,
const std::shared_ptr<Node> max_filter,
const std::shared_ptr<Node> min_freezed_output,
const std::shared_ptr<Node> max_freezed_output,
std::shared_ptr<Node> min_input,
std::shared_ptr<Node> max_input,
std::shared_ptr<Node> min_filter,
std::shared_ptr<Node> max_filter,
std::shared_ptr<Node> min_freezed_output,
std::shared_ptr<Node> max_freezed_output,
const bool with_relu)
{
float scale = builder::quantization_util::get_scale(min_input,
max_input,
min_filter,
max_filter,
min_freezed_output,
max_freezed_output);
auto requantization_scale = op::Constant::create(element::f32, Shape{1}, {scale});
return make_shared<op::QuantizedConvolutionBias>(data_batch,
auto requantization_scale = quantization_util::get_scale(min_input,
max_input,
min_filter,
max_filter,
min_freezed_output,
max_freezed_output);
return make_shared<op::QuantizedConvolutionBias>(input,
filters,
bias,
window_movement_strides,
......@@ -142,28 +141,28 @@ namespace ngraph
}
std::shared_ptr<Node>
ScaledQuantizedConvolutionRelu(const std::shared_ptr<Node>& data_batch,
const std::shared_ptr<Node>& filters,
ScaledQuantizedConvolutionRelu(std::shared_ptr<Node> input,
std::shared_ptr<Node> filters,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const CoordinateDiff& padding_below,
const CoordinateDiff& padding_above,
const Strides& data_dilation_strides,
const std::shared_ptr<Node> min_input,
const std::shared_ptr<Node> max_input,
const std::shared_ptr<Node> min_filter,
const std::shared_ptr<Node> max_filter,
const std::shared_ptr<Node> min_freezed_output,
const std::shared_ptr<Node> max_freezed_output)
std::shared_ptr<Node> min_input,
std::shared_ptr<Node> max_input,
std::shared_ptr<Node> min_filter,
std::shared_ptr<Node> max_filter,
std::shared_ptr<Node> min_freezed_output,
std::shared_ptr<Node> max_freezed_output)
{
float scale = builder::quantization_util::get_scale(min_input,
max_input,
min_filter,
max_filter,
min_freezed_output,
max_freezed_output);
auto requantization_scale = op::Constant::create(element::f32, Shape{1}, {scale});
return make_shared<op::QuantizedConvolutionRelu>(data_batch,
auto requantization_scale = quantization_util::get_scale(min_input,
max_input,
min_filter,
max_filter,
min_freezed_output,
max_freezed_output);
return make_shared<op::QuantizedConvolutionRelu>(input,
filters,
window_movement_strides,
window_dilation_strides,
......@@ -173,29 +172,28 @@ namespace ngraph
requantization_scale);
}
std::shared_ptr<Node>
ScaledQuantizedConvolution(const std::shared_ptr<Node>& data_batch,
const std::shared_ptr<Node>& filters,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const CoordinateDiff& padding_below,
const CoordinateDiff& padding_above,
const Strides& data_dilation_strides,
const std::shared_ptr<Node> min_input,
const std::shared_ptr<Node> max_input,
const std::shared_ptr<Node> min_filter,
const std::shared_ptr<Node> max_filter,
const std::shared_ptr<Node> min_freezed_output,
const std::shared_ptr<Node> max_freezed_output)
std::shared_ptr<Node> ScaledQuantizedConvolution(std::shared_ptr<Node> input,
std::shared_ptr<Node> filters,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const CoordinateDiff& padding_below,
const CoordinateDiff& padding_above,
const Strides& data_dilation_strides,
std::shared_ptr<Node> min_input,
std::shared_ptr<Node> max_input,
std::shared_ptr<Node> min_filter,
std::shared_ptr<Node> max_filter,
std::shared_ptr<Node> min_freezed_output,
std::shared_ptr<Node> max_freezed_output)
{
float scale = builder::quantization_util::get_scale(min_input,
max_input,
min_filter,
max_filter,
min_freezed_output,
max_freezed_output);
auto requantization_scale = op::Constant::create(element::f32, Shape{1}, {scale});
return make_shared<op::QuantizedConvolution>(data_batch,
auto requantization_scale = quantization_util::get_scale(min_input,
max_input,
min_filter,
max_filter,
min_freezed_output,
max_freezed_output);
return make_shared<op::QuantizedConvolution>(input,
filters,
window_movement_strides,
window_dilation_strides,
......@@ -205,16 +203,16 @@ namespace ngraph
requantization_scale);
}
std::shared_ptr<Node> ScaledQuantizedMaxPool(const std::shared_ptr<Node>& arg,
std::shared_ptr<Node> ScaledQuantizedMaxPool(std::shared_ptr<Node> input,
const Shape& window_shape,
const Strides& window_movement_strides,
const Shape& padding_below,
const Shape& padding_above,
const std::shared_ptr<Node> min,
const std::shared_ptr<Node> max)
std::shared_ptr<Node> min,
std::shared_ptr<Node> max)
{
return make_shared<op::QuantizedMaxPool>(
arg, window_shape, window_movement_strides, padding_below, padding_above);
input, window_shape, window_movement_strides, padding_below, padding_above);
}
}
}
......@@ -43,68 +43,67 @@ namespace ngraph
const ngraph::element::Type& type,
const ngraph::AxisSet& axes);
std::shared_ptr<Node> ScaledQuantizedAvgPool(const std::shared_ptr<Node>& arg,
std::shared_ptr<Node> ScaledQuantizedAvgPool(std::shared_ptr<Node> input,
const Shape& window_shape,
const Strides& window_movement_strides,
const Shape& padding_below,
const Shape& padding_above,
bool include_padding_in_avg_computation,
const std::shared_ptr<Node> min,
const std::shared_ptr<Node> max);
std::shared_ptr<Node> min,
std::shared_ptr<Node> max);
std::shared_ptr<Node>
ScaledQuantizedConvolutionBias(const std::shared_ptr<Node>& data_batch,
const std::shared_ptr<Node>& filters,
const std::shared_ptr<Node>& bias,
ScaledQuantizedConvolutionBias(std::shared_ptr<Node> input,
std::shared_ptr<Node> filters,
std::shared_ptr<Node> bias,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const CoordinateDiff& padding_below,
const CoordinateDiff& padding_above,
const Strides& data_dilation_strides,
const std::shared_ptr<Node> min_input,
const std::shared_ptr<Node> max_input,
const std::shared_ptr<Node> min_filter,
const std::shared_ptr<Node> max_filter,
const std::shared_ptr<Node> min_freezed_output,
const std::shared_ptr<Node> max_freezed_output,
std::shared_ptr<Node> min_input,
std::shared_ptr<Node> max_input,
std::shared_ptr<Node> min_filter,
std::shared_ptr<Node> max_filter,
std::shared_ptr<Node> min_freezed_output,
std::shared_ptr<Node> max_freezed_output,
const bool with_relu = false);
std::shared_ptr<Node>
ScaledQuantizedConvolutionRelu(const std::shared_ptr<Node>& data_batch,
const std::shared_ptr<Node>& filters,
ScaledQuantizedConvolutionRelu(std::shared_ptr<Node> input,
std::shared_ptr<Node> filters,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const CoordinateDiff& padding_below,
const CoordinateDiff& padding_above,
const Strides& data_dilation_strides,
const std::shared_ptr<Node> min_input,
const std::shared_ptr<Node> max_input,
const std::shared_ptr<Node> min_filter,
const std::shared_ptr<Node> max_filter,
const std::shared_ptr<Node> min_freezed_output,
const std::shared_ptr<Node> max_freezed_output);
std::shared_ptr<Node> min_input,
std::shared_ptr<Node> max_input,
std::shared_ptr<Node> min_filter,
std::shared_ptr<Node> max_filter,
std::shared_ptr<Node> min_freezed_output,
std::shared_ptr<Node> max_freezed_output);
std::shared_ptr<Node>
ScaledQuantizedConvolution(const std::shared_ptr<Node>& data_batch,
const std::shared_ptr<Node>& filters,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const CoordinateDiff& padding_below,
const CoordinateDiff& padding_above,
const Strides& data_dilation_strides,
const std::shared_ptr<Node> min_input,
const std::shared_ptr<Node> max_input,
const std::shared_ptr<Node> min_filter,
const std::shared_ptr<Node> max_filter,
const std::shared_ptr<Node> min_freezed_output,
const std::shared_ptr<Node> max_freezed_output);
std::shared_ptr<Node> ScaledQuantizedConvolution(std::shared_ptr<Node> input,
std::shared_ptr<Node> filters,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const CoordinateDiff& padding_below,
const CoordinateDiff& padding_above,
const Strides& data_dilation_strides,
std::shared_ptr<Node> min_input,
std::shared_ptr<Node> max_input,
std::shared_ptr<Node> min_filter,
std::shared_ptr<Node> max_filter,
std::shared_ptr<Node> min_freezed_output,
std::shared_ptr<Node> max_freezed_output);
std::shared_ptr<Node> ScaledQuantizedMaxPool(const std::shared_ptr<Node>& arg,
std::shared_ptr<Node> ScaledQuantizedMaxPool(std::shared_ptr<Node> input,
const Shape& window_shape,
const Strides& window_movement_strides,
const Shape& padding_below,
const Shape& padding_above,
const std::shared_ptr<Node> min,
const std::shared_ptr<Node> max);
std::shared_ptr<Node> min,
std::shared_ptr<Node> max);
}
}
......@@ -13,25 +13,22 @@
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
//*******************************************************************************
// Copyright 2016 The TensorFlow Authors. All Rights Reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//==============================================================================
#pragma once
#include <limits>
#include <vector>
#include "ngraph/builder/make_constant.hpp"
#include "ngraph/node.hpp"
#include "ngraph/op/abs.hpp"
#include "ngraph/op/add.hpp"
#include "ngraph/op/broadcast.hpp"
#include "ngraph/op/constant.hpp"
#include "ngraph/op/divide.hpp"
#include "ngraph/op/maximum.hpp"
#include "ngraph/op/minimum.hpp"
#include "ngraph/op/multiply.hpp"
#include "ngraph/op/subtract.hpp"
#include "ngraph/util.hpp"
namespace ngraph
......@@ -40,117 +37,141 @@ namespace ngraph
{
namespace quantization_util
{
template <class T1, class T2, class T3>
void quantization_range_for_multiplication(
float min_a, float max_a, float min_b, float max_b, float* min_c, float* max_c)
std::shared_ptr<Node> max_abs(std::shared_ptr<Node> a, std::shared_ptr<Node> b)
{
// begin code copied and pasted (and modified) from
// github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/quantization_utils.h
float a_one_quant_level = (max_a - min_a) / (std::numeric_limits<T1>::max() -
std::numeric_limits<T1>::min());
float b_one_quant_level = (max_b - min_b) / (std::numeric_limits<T2>::max() -
std::numeric_limits<T2>::min());
float c_one_quant_level = a_one_quant_level * b_one_quant_level;
*min_c = c_one_quant_level * std::numeric_limits<T3>::min();
*max_c = c_one_quant_level * std::numeric_limits<T3>::max();
// end code copied and pasted (and modified) from
// github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/quantization_utils.h
auto abs_a = std::make_shared<op::Abs>(a);
auto abs_b = std::make_shared<op::Abs>(b);
return std::make_shared<op::Maximum>(abs_a, abs_b);
}
float get_scale(const std::shared_ptr<Node> min_input,
const std::shared_ptr<Node> max_input,
const std::shared_ptr<Node> min_filter,
const std::shared_ptr<Node> max_filter,
const std::shared_ptr<Node> min_freezed_output,
const std::shared_ptr<Node> max_freezed_output)
std::pair<std::shared_ptr<Node>, std::shared_ptr<Node>>
quantization_range_for_multiplication(std::shared_ptr<Node> min_a,
std::shared_ptr<Node> max_a,
std::shared_ptr<Node> min_b,
std::shared_ptr<Node> max_b)
{
auto min_input_const_op = std::static_pointer_cast<ngraph::op::Constant>(min_input);
auto max_input_const_op = std::static_pointer_cast<ngraph::op::Constant>(max_input);
auto min_filter_const_op =
std::static_pointer_cast<ngraph::op::Constant>(min_filter);
auto max_filter_const_op =
std::static_pointer_cast<ngraph::op::Constant>(max_filter);
auto min_freezed_output_const_op =
std::static_pointer_cast<ngraph::op::Constant>(min_freezed_output);
auto max_freezed_output_const_op =
std::static_pointer_cast<ngraph::op::Constant>(max_freezed_output);
auto input_min = min_input_const_op->get_vector<float>();
auto input_max = max_input_const_op->get_vector<float>();
auto filter_min = min_filter_const_op->get_vector<float>();
auto filter_max = max_filter_const_op->get_vector<float>();
auto output_min = min_freezed_output_const_op->get_vector<float>();
auto output_max = max_freezed_output_const_op->get_vector<float>();
float min_out_value;
float max_out_value;
quantization_range_for_multiplication<uint8_t, int8_t, int32_t>(input_min[0],
input_max[0],
filter_min[0],
filter_max[0],
&min_out_value,
&max_out_value);
const float max_abs32 = std::max(std::abs(min_out_value), std::abs(max_out_value));
const float max_abs8 = std::max(std::abs(output_min[0]), std::abs(output_max[0]));
auto type = min_a->get_element_type();
if (type != max_a->get_element_type() || type != min_b->get_element_type() ||
type != max_b->get_element_type())
{
throw ngraph_error(
"quantization_range_for_multiplication: min and max must have same type");
}
auto shape = min_a->get_shape();
if (shape != max_a->get_shape() || shape != min_b->get_shape() ||
shape != max_b->get_shape())
{
throw ngraph_error(
"quantization_range_for_multiplication: min and max must have same shape");
}
auto u8_range = make_constant(type,
shape,
std::numeric_limits<uint8_t>::max() -
std::numeric_limits<uint8_t>::min());
auto i8_range = make_constant(type,
shape,
std::numeric_limits<int8_t>::max() -
std::numeric_limits<int8_t>::min());
auto a_one_quant_level = (max_a - min_a) / u8_range;
auto b_one_quant_level = (max_b - min_b) / i8_range;
auto c_one_quant_level = a_one_quant_level * b_one_quant_level;
auto i32_min = make_constant(type, shape, std::numeric_limits<int32_t>::min());
auto i32_max = make_constant(type, shape, std::numeric_limits<int32_t>::max());
auto min_c = c_one_quant_level * i32_min;
auto max_c = c_one_quant_level * i32_max;
return std::pair<std::shared_ptr<Node>, std::shared_ptr<Node>>(min_c, max_c);
}
std::shared_ptr<Node> get_scale(std::shared_ptr<Node> min_input,
std::shared_ptr<Node> max_input,
std::shared_ptr<Node> min_filter,
std::shared_ptr<Node> max_filter,
std::shared_ptr<Node> min_freezed_output,
std::shared_ptr<Node> max_freezed_output)
{
auto type = min_input->get_element_type();
if (type != max_input->get_element_type() ||
type != min_filter->get_element_type() ||
type != max_filter->get_element_type() ||
type != min_freezed_output->get_element_type() ||
type != max_freezed_output->get_element_type())
{
throw ngraph_error("get_scale: min and max must have same type");
}
auto shape = min_input->get_shape();
if (shape != max_input->get_shape() || shape != min_filter->get_shape() ||
shape != max_filter->get_shape() || shape != min_freezed_output->get_shape() ||
shape != max_freezed_output->get_shape())
{
throw ngraph_error("get_scale: min and max must have same shape");
}
auto ranges = quantization_range_for_multiplication(
min_input, max_input, min_filter, max_filter);
auto min_out_value = ranges.first;
auto max_out_value = ranges.second;
auto max_abs32 = max_abs(min_out_value, max_out_value);
auto max_abs8 = max_abs(min_freezed_output, max_freezed_output);
// Output is signed int.
// s32 = f32 * std::pow(2, 31)/ max_abs32;
// s8 = f32 * std::pow(2, 7)/ max_abs8;
// s8 = s32 * std::pow(2, -24) * max_abs32 / max_abs8;
const float scale = static_cast<float>(
(std::pow(2, -24) * static_cast<double>(max_abs32 / max_abs8)));
return scale;
return make_constant(type, shape, std::pow(2, -24)) * (max_abs32 / max_abs8);
}
template <typename T>
static inline T get_quantization_scale(const std::shared_ptr<Node> min_input,
const std::shared_ptr<Node> max_input,
const ngraph::element::Type& type,
bool bump_by_eps = false)
std::shared_ptr<Node> get_scale(std::shared_ptr<Node> input_min_range,
std::shared_ptr<Node> input_max_range,
const ngraph::element::Type& quant_type,
bool bump_by_eps = false)
{
auto min_input_const_op =
std::dynamic_pointer_cast<ngraph::op::Constant>(min_input);
auto max_input_const_op =
std::dynamic_pointer_cast<ngraph::op::Constant>(max_input);
if (min_input_const_op == nullptr)
auto type = input_min_range->get_element_type();
if (type != input_max_range->get_element_type())
{
throw ngraph_error("min input must be constant");
throw ngraph_error("get_scale: min and max must have same type");
}
else if (max_input_const_op == nullptr)
auto shape = input_min_range->get_shape();
if (shape != input_max_range->get_shape())
{
throw ngraph_error("max input must be constant");
throw ngraph_error("get_scale: min and max must have same shape");
}
auto input_min_range = min_input_const_op->get_vector<T>();
auto input_max_range = max_input_const_op->get_vector<T>();
auto min_range = input_min_range;
auto max_range = input_max_range;
T min_range = std::numeric_limits<T>::min();
T max_range = std::numeric_limits<T>::max();
if (bump_by_eps)
{
// If input_min_range and input_max_range are close,
// introduce a slightly larger delta between them.
min_range = std::min(static_cast<T>(0.0f), input_min_range[0]);
const T epsilon = std::max(static_cast<T>(1.0f),
static_cast<T>(std::max(fabs(input_min_range[0]),
fabs(input_max_range[0])))) /
static_cast<T>(100.0f);
max_range = std::max(input_max_range[0], min_range + epsilon);
max_range = std::max(static_cast<T>(0.0f), max_range);
// end code copied and pasted from
// github.com/tensorflow/tensorflow/blob/master/tensorflow/core/kernels/quantize_op.cc
}
else
{
min_range = input_min_range[0];
max_range = input_max_range[0];
auto zero = make_constant(type, shape, 0);
min_range = std::make_shared<op::Minimum>(zero, input_min_range);
auto max_abs_input_range = max_abs(input_min_range, input_max_range);
auto one = make_constant(type, shape, 1);
auto hundred = make_constant(type, shape, 100);
auto epsilon =
std::make_shared<op::Maximum>(one, max_abs_input_range) / hundred;
max_range = std::make_shared<op::Maximum>(input_max_range, min_range + epsilon);
max_range = std::make_shared<op::Maximum>(zero, max_range);
}
const T max_abs = std::max(std::abs(min_range), std::abs(max_range));
const T bitwidth = type.bitwidth();
const T target_range = static_cast<T>(
(type.is_signed() ? std::pow(2, (bitwidth - 1)) : std::pow(2, bitwidth)) - 1);
const T scale_factor = max_abs / target_range;
return scale_factor;
size_t bw = quant_type.bitwidth();
float range = static_cast<float>(
(quant_type.is_signed() ? std::pow(2, (bw - 1)) : std::pow(2, bw)) - 1);
auto max_abs_range = max_abs(min_range, max_range);
auto target_range = make_constant(type, shape, range);
return max_abs_range / target_range;
}
}
}
......
......@@ -45,11 +45,6 @@ op::QuantizedConvolution::QuantizedConvolution(const shared_ptr<Node>& data_batc
auto& data_batch_shape = data_batch->get_shape();
auto& filters_shape = filters->get_shape();
auto scale_const_op = std::static_pointer_cast<ngraph::op::Constant>(scale);
auto scale_val = scale_const_op->get_vector<float>();
this->m_scale = scale_val[0];
set_output_type(0,
element::i8,
util::infer_convolution_output_shape(this,
......
......@@ -41,7 +41,6 @@ namespace ngraph
const Strides& get_data_dilation_strides() const { return m_data_dilation_strides; }
std::shared_ptr<Node> get_filters() { return get_argument(1); }
std::shared_ptr<Node> get_data_batch() { return get_argument(0); }
float get_scale() const { return m_scale; }
virtual std::shared_ptr<Node>
copy_with_new_args(const NodeVector& new_args) const override;
......@@ -51,7 +50,6 @@ namespace ngraph
CoordinateDiff m_padding_below;
CoordinateDiff m_padding_above;
Strides m_data_dilation_strides;
float m_scale;
};
}
}
......@@ -49,10 +49,6 @@ op::QuantizedConvolutionBias::QuantizedConvolutionBias(const shared_ptr<Node>& d
auto& data_batch_shape = data_batch->get_shape();
auto& filters_shape = filters->get_shape();
auto scale_const_op = std::static_pointer_cast<ngraph::op::Constant>(scale);
auto scale_val = scale_const_op->get_vector<float>();
this->m_scale = scale_val[0];
// TODO: call ngraph util
// util::validate_convbias_shapes(data_batch_shape, filters_shape, bias->get_shape());
......
......@@ -47,7 +47,6 @@ namespace ngraph
const CoordinateDiff& get_padding_below() const { return m_padding_below; }
const CoordinateDiff& get_padding_above() const { return m_padding_above; }
const Strides& get_data_dilation_strides() const { return m_data_dilation_strides; }
float get_scale() const { return m_scale; }
std::shared_ptr<Node> get_bias() { return get_argument(2); }
std::shared_ptr<Node> get_filters() { return get_argument(1); }
std::shared_ptr<Node> get_data_batch() { return get_argument(0); }
......@@ -62,7 +61,6 @@ namespace ngraph
CoordinateDiff m_padding_above;
Strides m_data_dilation_strides;
bool m_with_relu;
float m_scale;
};
}
}
......@@ -44,10 +44,6 @@ op::QuantizedConvolutionRelu::QuantizedConvolutionRelu(const std::shared_ptr<Nod
auto& data_batch_shape = data_batch->get_shape();
auto& filters_shape = filters->get_shape();
auto scale_const_op = std::static_pointer_cast<ngraph::op::Constant>(scale);
float scale_val = *(static_cast<float const*>(scale_const_op->get_data_ptr()));
this->m_scale = scale_val;
set_output_type(0,
element::u8,
util::infer_convolution_output_shape(this,
......
......@@ -43,7 +43,6 @@ namespace ngraph
const CoordinateDiff& get_padding_below() const { return m_padding_below; }
const CoordinateDiff& get_padding_above() const { return m_padding_above; }
const Strides& get_data_dilation_strides() const { return m_data_dilation_strides; }
float get_scale() const { return m_scale; }
std::shared_ptr<Node> get_filters() { return get_argument(1); }
std::shared_ptr<Node> get_data_batch() { return get_argument(0); }
virtual std::shared_ptr<Node>
......@@ -55,7 +54,6 @@ namespace ngraph
CoordinateDiff m_padding_below;
CoordinateDiff m_padding_above;
Strides m_data_dilation_strides;
float m_scale;
};
}
}
......@@ -30,6 +30,7 @@
#include "ngraph/op/negative.hpp"
#include "ngraph/op/pad.hpp"
#include "ngraph/op/quantize.hpp"
#include "ngraph/op/relu.hpp"
#include "ngraph/op/reshape.hpp"
#include "ngraph/op/subtract.hpp"
#include "ngraph/pattern/matcher.hpp"
......@@ -45,6 +46,7 @@
#include "ngraph/runtime/reference/negate.hpp"
#include "ngraph/runtime/reference/pad.hpp"
#include "ngraph/runtime/reference/quantize.hpp"
#include "ngraph/runtime/reference/relu.hpp"
#include "ngraph/runtime/reference/reshape.hpp"
#include "ngraph/runtime/reference/subtract.hpp"
......@@ -378,7 +380,8 @@ void ngraph::pass::ConstantFolding::construct_constant_binary()
bool is_supported_unary_op(std::shared_ptr<Node> n)
{
return std::dynamic_pointer_cast<op::Abs>(n) || std::dynamic_pointer_cast<op::Negative>(n);
return std::dynamic_pointer_cast<op::Abs>(n) || std::dynamic_pointer_cast<op::Negative>(n) ||
std::dynamic_pointer_cast<op::Relu>(n);
}
template <class T>
......@@ -398,6 +401,11 @@ shared_ptr<op::Constant> make_constant_unary(shared_ptr<op::Constant> constant,
runtime::reference::negate<T>(
constant->get_vector<T>().data(), out_vec.data(), shape_size(out_shape));
}
else if (std::dynamic_pointer_cast<op::Relu>(unary))
{
runtime::reference::relu<T>(
constant->get_vector<T>().data(), out_vec.data(), shape_size(out_shape));
}
else
{
NGRAPH_ASSERT(false) << "must be consistent with is_supported_unary_op";
......
......@@ -118,6 +118,7 @@
#include "ngraph/op/topk.hpp"
#include "ngraph/pass/algebraic_simplification.hpp"
#include "ngraph/pass/common_function_collection.hpp"
#include "ngraph/pass/constant_folding.hpp"
#include "ngraph/pass/core_fusion.hpp"
#include "ngraph/pass/cse.hpp"
#include "ngraph/pass/dump_sorted.hpp"
......@@ -1049,6 +1050,7 @@ void runtime::cpu::CPU_ExternalFunction::register_common_passes(ngraph::pass::Ma
NodeVector nv_cwi; // We dont need CPUWorkspaceInsertion to return list of indices
pass_manager.register_pass<runtime::cpu::pass::CPUWorkspaceInsertion>(nv_cwi, false);
pass_manager.register_pass<runtime::cpu::pass::CPUAssignment>(this);
pass_manager.register_pass<ngraph::pass::ConstantFolding>();
pass_manager.register_pass<runtime::cpu::pass::CPULayout>(this);
pass_manager.register_pass<ngraph::pass::CommonSubexpressionElimination>(
runtime::cpu::get_cse_handlers_map());
......
......@@ -25,6 +25,7 @@
#include "ngraph/coordinate_diff.hpp"
#include "ngraph/node.hpp"
#include "ngraph/op/constant.hpp"
#include "ngraph/op/convolution.hpp"
#include "ngraph/op/experimental/quantized_conv.hpp"
#include "ngraph/op/experimental/quantized_conv_bias.hpp"
......@@ -226,6 +227,16 @@ namespace ngraph
}
else if (std::is_same<OP, ngraph::op::QuantizedConvolution>())
{
auto qc = dynamic_cast<const ngraph::op::QuantizedConvolution*>(node);
auto scale_const_op =
std::dynamic_pointer_cast<ngraph::op::Constant>(qc->get_arguments()[2]);
if (scale_const_op == nullptr)
{
throw ngraph_error("QuantizedConvolution scale must be a Constant");
}
auto scale_val = scale_const_op->get_vector<float>();
return build_quantized_convolution(
data_desc,
weights_desc,
......@@ -234,12 +245,21 @@ namespace ngraph
window_dilation_strides_adjusted,
convolution->get_padding_below(),
convolution->get_padding_above(),
(dynamic_cast<const ngraph::op::QuantizedConvolution*>(node))
->get_scale(),
scale_val[0],
ops);
}
else if (std::is_same<OP, ngraph::op::QuantizedConvolutionRelu>())
{
auto qcr = dynamic_cast<const ngraph::op::QuantizedConvolutionRelu*>(node);
auto scale_const_op = std::dynamic_pointer_cast<ngraph::op::Constant>(
qcr->get_arguments()[2]);
if (scale_const_op == nullptr)
{
throw ngraph_error("QuantizedConvolutionRelu scale must be a Constant");
}
auto scale_val = scale_const_op->get_vector<float>();
return build_quantized_convolution(
data_desc,
weights_desc,
......@@ -248,12 +268,21 @@ namespace ngraph
window_dilation_strides_adjusted,
convolution->get_padding_below(),
convolution->get_padding_above(),
(dynamic_cast<const ngraph::op::QuantizedConvolutionRelu*>(node))
->get_scale(),
scale_val[0],
ops);
}
else if (std::is_same<OP, ngraph::op::QuantizedConvolutionBias>())
{
auto qcb = dynamic_cast<const ngraph::op::QuantizedConvolutionBias*>(node);
auto scale_const_op = std::dynamic_pointer_cast<ngraph::op::Constant>(
qcb->get_arguments()[3]);
if (scale_const_op == nullptr)
{
throw ngraph_error("QuantizedConvolutionBias scale must be a Constant");
}
auto scale_val = scale_const_op->get_vector<float>();
// conv+bias = cvt_to_int8(scale*(dst + bias))
auto bias_desc = mkldnn_utils::get_input_mkldnn_md(node, 2);
return build_quantized_convolution(
......@@ -265,8 +294,7 @@ namespace ngraph
window_dilation_strides_adjusted,
convolution->get_padding_below(),
convolution->get_padding_above(),
(dynamic_cast<const ngraph::op::QuantizedConvolutionBias*>(node))
->get_scale(),
scale_val[0],
ops);
}
else
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment