Commit 296ee2cf authored by Nishant Patel, committed by Scott Cyphers

Quantize(reorder) bias to int32 (#1933)

* Quantize the bias to int32

* Bias scale fix

* MNIST works

* Quantize Bias

* Introduce Quantize op in the graph to quantize bias & feedback

* Comments and some refactoring

* Add test case with float bias and enable int32 as quantized type in ngraph

* Change shape of scale from Shape{} to Shape{1} in the backend
parent 71cc8bbf
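The core of the change: int8 convolution accumulates in int32, and that accumulator is in units of input_scale * filter_scale, so a float bias has to be quantized with the same combined scale before it can be added, and the accumulator is then requantized back to {u}int8. A minimal, self-contained sketch of that arithmetic (ranges, values and variable names here are illustrative only, not taken from the code below):

// Scalar model of quantized convolution-with-bias (sketch, assumed ranges).
#include <cmath>
#include <cstdint>
#include <iostream>

int main()
{
    // Assumed real-valued ranges: input in [0, 4], filter in [-2, 2].
    float input_scale = 255.0f / 4.0f;  // u8 levels per real unit
    float filter_scale = 127.0f / 2.0f; // i8 levels per real unit

    // One real-valued multiply-accumulate term plus a real-valued bias.
    float x = 3.0f, w = -1.5f, bias = 0.25f;

    // Quantize the operands.
    uint8_t xq = static_cast<uint8_t>(std::lround(x * input_scale));
    int8_t wq = static_cast<int8_t>(std::lround(w * filter_scale));

    // The i32 accumulator is in units of input_scale * filter_scale, so the
    // bias must be quantized with that same combined scale before the add;
    // this is what the op::Quantize inserted by the builder below does for
    // an f32 bias.
    int32_t acc = static_cast<int32_t>(xq) * static_cast<int32_t>(wq);
    int32_t bias_q =
        static_cast<int32_t>(std::lround(bias * input_scale * filter_scale));
    acc += bias_q;

    // Dequantize to compare against the float reference (~ -4.25).
    float approx = acc / (input_scale * filter_scale);
    std::cout << "float: " << x * w + bias << "  quantized: " << approx << "\n";
}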
......@@ -121,13 +121,27 @@ namespace ngraph
std::shared_ptr<Node> max_freezed_output,
const bool with_relu)
{
auto output_et = with_relu ? element::u8 : element::i8;
auto requantization_scale = quantization_util::get_scale(min_input,
max_input,
min_filter,
max_filter,
min_freezed_output,
max_freezed_output);
max_freezed_output,
output_et);
if (bias->get_element_type() != element::i32)
{
auto zero = make_constant(element::i32, min_input->get_shape(), 0);
AxisSet quantization_axes;
auto bias_scale =
quantization_util::get_bias_scale(min_input, max_input, min_filter, max_filter);
op::Quantize::RoundMode round_mode =
op::Quantize::RoundMode::ROUND_NEAREST_TOWARD_EVEN;
bias = make_shared<op::Quantize>(
bias, bias_scale, zero, element::i32, quantization_axes, round_mode);
}
return make_shared<op::QuantizedConvolutionBias>(input,
filters,
bias,
......@@ -160,7 +174,8 @@ namespace ngraph
min_filter,
max_filter,
min_freezed_output,
max_freezed_output);
max_freezed_output,
element::u8);
return make_shared<op::QuantizedConvolutionRelu>(input,
filters,
......@@ -191,7 +206,8 @@ namespace ngraph
min_filter,
max_filter,
min_freezed_output,
max_freezed_output);
max_freezed_output,
element::i8);
return make_shared<op::QuantizedConvolution>(input,
filters,
......
......@@ -92,7 +92,8 @@ namespace ngraph
std::shared_ptr<Node> min_filter,
std::shared_ptr<Node> max_filter,
std::shared_ptr<Node> min_freezed_output,
std::shared_ptr<Node> max_freezed_output)
std::shared_ptr<Node> max_freezed_output,
const ngraph::element::Type& output_type)
{
auto type = min_input->get_element_type();
if (type != max_input->get_element_type() ||
......@@ -121,11 +122,59 @@ namespace ngraph
auto max_abs32 = max_abs(min_out_value, max_out_value);
auto max_abs8 = max_abs(min_freezed_output, max_freezed_output);
// Output is signed int.
// s32 = f32 * std::pow(2, 31)/ max_abs32;
// s8 = f32 * std::pow(2, 7)/ max_abs8;
// s8 = s32 * std::pow(2, -24) * max_abs32 / max_abs8;
return make_constant(type, shape, std::pow(2, -24)) * (max_abs32 / max_abs8);
// The output of int8 convolution is accumulated in int32.
// MKLDNN needs a scale to requantize that output back to {u}int8; which of
// the two it is depends on whether relu is fused.
// Equation to go from f32 to s32 (std::pow(2, 31) / max_abs32 can be thought
// of as the scale used for the quantization):
// 1. s32 = f32 * std::pow(2, 31) / max_abs32;
// Equation to go from f32 to u8.
// 2. u8 = f32 * std::pow(2, 8) / max_abs8;
// Equation to go from f32 to s8.
// 3. s8 = f32 * std::pow(2, 7) / max_abs8;
// Substituting f32 from eq 1 into eq 2:
// 4. u8 = s32 * std::pow(2, -23) * max_abs32 / max_abs8;
// Substituting f32 from eq 1 into eq 3:
// 5. s8 = s32 * std::pow(2, -24) * max_abs32 / max_abs8;
return make_constant(
type, shape, std::pow(2, (output_type == element::i8) ? -24 : -23)) *
(max_abs32 / max_abs8);
}
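// Worked check of the exponent choice above (illustrative only, not part of
// the build): the quantized level counts are 2^31 for s32, 2^8 for u8 and
// 2^7 for s8, so mapping the i32 accumulator to the 8-bit output uses
//   u8: std::pow(2, 8) / std::pow(2, 31) = std::pow(2, -23)
//   s8: std::pow(2, 7) / std::pow(2, 31) = std::pow(2, -24)
// multiplied by max_abs32 / max_abs8, which is what get_scale returns.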
std::shared_ptr<Node> get_bias_scale(std::shared_ptr<Node> min_input,
std::shared_ptr<Node> max_input,
std::shared_ptr<Node> min_filter,
std::shared_ptr<Node> max_filter)
{
auto type = min_input->get_element_type();
if (type != max_input->get_element_type() ||
type != min_filter->get_element_type() ||
type != max_filter->get_element_type())
{
throw ngraph_error("get_bias_scale: min and max must have same type");
}
auto shape = min_input->get_shape();
if (shape != max_input->get_shape() || shape != min_filter->get_shape() ||
shape != max_filter->get_shape())
{
throw ngraph_error("get_bias_scale: min and max must have same shape");
}
auto max_abs_input_range = max_abs(min_input, max_input);
auto max_abs_filter_range = max_abs(min_filter, max_filter);
auto range = make_constant(type,
shape,
std::numeric_limits<uint8_t>::max() *
std::numeric_limits<int8_t>::max());
return range / (max_abs_input_range * max_abs_filter_range);
}
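// Example with the ranges used by the new scaled_QC_with_f32_bias_and_relu
// test below (illustrative only): max_abs(input) = 255, max_abs(filter) = 127,
// so bias_scale = (255 * 127) / (255 * 127) = 1.0 and the f32 bias {5.0}
// quantizes to the i32 bias {5}; the new test can therefore expect the same
// outputs as the int32-bias relu test.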
std::shared_ptr<Node> get_scale(std::shared_ptr<Node> input_min_range,
......
......@@ -108,7 +108,6 @@ namespace ngraph
auto& out0_tensor = external_function->get_tensor_data(out[0].get_name());
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto conv_index =
mkldnn_emitter->build_convolution<ngraph::op::QuantizedConvolutionBias>(
node, args, out);
......
......@@ -254,6 +254,7 @@ size_t MKLDNNEmitter::build_convolution_forward(const mkldnn::memory::desc& inpu
mkldnn::memory::dims(padding_below.begin(), padding_below.end()),
mkldnn::memory::dims(padding_above.begin(), padding_above.end()),
mkldnn::padding_kind::zero},
conv_attr,
executor::global_cpu_engine},
*m_mkldnn_primitives[input_data_index],
......@@ -271,15 +272,16 @@ size_t MKLDNNEmitter::build_convolution_forward(const mkldnn::memory::desc& inpu
return conv_index;
}
size_t MKLDNNEmitter::build_quantized_convolution(const mkldnn::memory::desc& input_data_desc,
const mkldnn::memory::desc& weights_desc,
const mkldnn::memory::desc& result_desc,
const ngraph::Strides& strides,
const ngraph::Strides& dilation_strides,
const ngraph::CoordinateDiff& padding_below,
const ngraph::CoordinateDiff& padding_above,
const float scale,
const mkldnn::post_ops& pops)
size_t
MKLDNNEmitter::build_quantized_convolution_forward(const mkldnn::memory::desc& input_data_desc,
const mkldnn::memory::desc& weights_desc,
const mkldnn::memory::desc& result_desc,
const ngraph::Strides& strides,
const ngraph::Strides& dilation_strides,
const ngraph::CoordinateDiff& padding_below,
const ngraph::CoordinateDiff& padding_above,
const float scale,
const mkldnn::post_ops& pops)
{
size_t input_data_index = build_memory_primitive(input_data_desc);
size_t weights_index = build_memory_primitive(weights_desc);
......@@ -312,60 +314,49 @@ size_t MKLDNNEmitter::build_quantized_convolution(const mkldnn::memory::desc& in
return conv_index;
}
size_t MKLDNNEmitter::build_quantized_convolution(const mkldnn::memory::desc& input_data_desc,
const mkldnn::memory::desc& weights_desc,
const mkldnn::memory::desc& bias_desc,
const mkldnn::memory::desc& result_desc,
const ngraph::Strides& strides,
const ngraph::Strides& dilation_strides,
const ngraph::CoordinateDiff& padding_below,
const ngraph::CoordinateDiff& padding_above,
const float scale,
const mkldnn::post_ops& pops)
size_t
MKLDNNEmitter::build_quantized_convolution_forward(const mkldnn::memory::desc& input_data_desc,
const mkldnn::memory::desc& weights_desc,
const mkldnn::memory::desc& bias_desc,
const mkldnn::memory::desc& result_desc,
const ngraph::Strides& strides,
const ngraph::Strides& dilation_strides,
const ngraph::CoordinateDiff& padding_below,
const ngraph::CoordinateDiff& padding_above,
const float scale,
const mkldnn::post_ops& pops)
{
size_t input_data_index = build_memory_primitive(input_data_desc);
size_t weights_index = build_memory_primitive(weights_desc);
size_t bias_index = build_memory_primitive(bias_desc);
size_t result_index = build_memory_primitive(result_desc);
std::vector<float> output_scale;
output_scale.push_back(scale);
mkldnn::primitive_attr conv_attr;
conv_attr.set_post_ops(pops);
/* Specify the rounding mode */
conv_attr.set_int_output_round_mode(mkldnn::round_mode::round_nearest);
/* Specify the scales array and corresponding mask */
conv_attr.set_output_scales(0, output_scale);
size_t conv_index = 0;
try
{
conv_index = insert_primitive(new mkldnn::convolution_forward(
{{mkldnn::prop_kind::forward,
mkldnn::algorithm::convolution_direct,
input_data_desc,
weights_desc,
bias_desc,
result_desc,
mkldnn::memory::dims(strides.begin(), strides.end()),
mkldnn::memory::dims(dilation_strides.begin(), dilation_strides.end()),
mkldnn::memory::dims(padding_below.begin(), padding_below.end()),
mkldnn::memory::dims(padding_above.begin(), padding_above.end()),
mkldnn::padding_kind::zero},
conv_attr,
executor::global_cpu_engine},
*m_mkldnn_primitives[input_data_index],
*m_mkldnn_primitives[weights_index],
*m_mkldnn_primitives[bias_index],
*m_mkldnn_primitives[result_index]));
m_primitive_deps[conv_index] = {input_data_index, weights_index, bias_index, result_index};
}
catch (const mkldnn::error& e)
{
throw ngraph_error("Could not create convolution " + e.message);
}
size_t conv_index = insert_primitive(new mkldnn::convolution_forward(
{{mkldnn::prop_kind::forward,
mkldnn::algorithm::convolution_direct,
input_data_desc,
weights_desc,
bias_desc,
result_desc,
mkldnn::memory::dims(strides.begin(), strides.end()),
mkldnn::memory::dims(dilation_strides.begin(), dilation_strides.end()),
mkldnn::memory::dims(padding_below.begin(), padding_below.end()),
mkldnn::memory::dims(padding_above.begin(), padding_above.end()),
mkldnn::padding_kind::zero},
conv_attr,
executor::global_cpu_engine},
*m_mkldnn_primitives[input_data_index],
*m_mkldnn_primitives[weights_index],
*m_mkldnn_primitives[bias_index],
*m_mkldnn_primitives[result_index]));
m_primitive_deps[conv_index] = {input_data_index, weights_index, bias_index, result_index};
return conv_index;
}
......
......@@ -108,31 +108,28 @@ namespace ngraph
const ngraph::CoordinateDiff& padding_above,
const mkldnn::post_ops& pops = mkldnn::post_ops());
size_t
build_quantized_convolution(const mkldnn::memory::desc& input_data_desc,
const mkldnn::memory::desc& weights_desc,
const mkldnn::memory::desc& result_desc,
const ngraph::Strides& strides,
const ngraph::Strides& dilation_strides,
const ngraph::CoordinateDiff& padding_below,
const ngraph::CoordinateDiff& padding_above,
const float scale,
const mkldnn::post_ops& pops = mkldnn::post_ops());
size_t build_quantized_convolution_forward(
const mkldnn::memory::desc& input_data_desc,
const mkldnn::memory::desc& weights_desc,
const mkldnn::memory::desc& result_desc,
const ngraph::Strides& strides,
const ngraph::Strides& dilation_strides,
const ngraph::CoordinateDiff& padding_below,
const ngraph::CoordinateDiff& padding_above,
const float scale,
const mkldnn::post_ops& pops = mkldnn::post_ops());
/**
* QuantizedConvolution + bias forward
*/
size_t
build_quantized_convolution(const mkldnn::memory::desc& input_data_desc,
const mkldnn::memory::desc& weights_desc,
const mkldnn::memory::desc& bias_desc,
const mkldnn::memory::desc& result_desc,
const ngraph::Strides& strides,
const ngraph::Strides& dilation_strides,
const ngraph::CoordinateDiff& padding_below,
const ngraph::CoordinateDiff& padding_above,
const float scale,
const mkldnn::post_ops& pops = mkldnn::post_ops());
size_t build_quantized_convolution_forward(
const mkldnn::memory::desc& input_data_desc,
const mkldnn::memory::desc& weights_desc,
const mkldnn::memory::desc& bias_desc,
const mkldnn::memory::desc& result_desc,
const ngraph::Strides& strides,
const ngraph::Strides& dilation_strides,
const ngraph::CoordinateDiff& padding_below,
const ngraph::CoordinateDiff& padding_above,
const float scale,
const mkldnn::post_ops& pops = mkldnn::post_ops());
template <typename OP>
size_t build_convolution(const ngraph::Node* node,
......@@ -237,7 +234,7 @@ namespace ngraph
auto scale_val = scale_const_op->get_vector<float>();
return build_quantized_convolution(
return build_quantized_convolution_forward(
data_desc,
weights_desc,
result_desc,
......@@ -260,7 +257,7 @@ namespace ngraph
auto scale_val = scale_const_op->get_vector<float>();
return build_quantized_convolution(
return build_quantized_convolution_forward(
data_desc,
weights_desc,
result_desc,
......@@ -285,7 +282,7 @@ namespace ngraph
// conv+bias = cvt_to_int8(scale*(dst + bias))
auto bias_desc = mkldnn_utils::get_input_mkldnn_md(node, 2);
return build_quantized_convolution(
return build_quantized_convolution_forward(
data_desc,
weights_desc,
bias_desc,
......
......@@ -282,6 +282,10 @@ mkldnn::memory::desc runtime::cpu::mkldnn_utils::create_default_mkldnn_md(
et = runtime::cpu::mkldnn_utils::get_mkldnn_data_type(node->get_input_element_type(index));
}
if (shape == Shape{})
{
shape = Shape{1};
}
return memory::desc(memory::dims(shape.begin(), shape.end()), et, format);
}
......
......@@ -93,6 +93,10 @@ shared_ptr<Node> runtime::cpu::pass::CPULayout::insert_input_conversions(
auto tv = output.get_tensor_ptr();
auto tvl = dynamic_pointer_cast<runtime::cpu::LayoutDescriptor>(tv->get_tensor_layout());
if (input.get_shape() == Shape{})
{
tvl->set_mkldnn_md(required_mds[index]);
}
if (!tvl)
{
throw ngraph_error(
......
......@@ -28,7 +28,7 @@ const element::Type element::f32(32, true, true, false, "float");
const element::Type element::f64(64, true, true, false, "double");
const element::Type element::i8(8, false, true, true, "int8_t");
const element::Type element::i16(16, false, true, false, "int16_t");
const element::Type element::i32(32, false, true, false, "int32_t");
const element::Type element::i32(32, false, true, true, "int32_t");
const element::Type element::i64(64, false, true, false, "int64_t");
const element::Type element::u8(8, false, false, true, "uint8_t");
const element::Type element::u16(16, false, false, false, "uint16_t");
......
......@@ -208,7 +208,7 @@ TEST(builder, scaled_QC_with_relu)
copy_data(b, b_data);
auto result = backend->create_tensor(element::u8, shape_r);
backend->call_with_validate(f, {result}, {a, b});
EXPECT_EQ((vector<uint8_t>{0, 0, 0, 0, 0, 0, 69, 106, 90}), read_vector<uint8_t>(result));
EXPECT_EQ((vector<uint8_t>{0, 0, 0, 0, 0, 0, 138, 212, 181}), read_vector<uint8_t>(result));
}
TEST(builder, scaled_QC_with_bias)
......@@ -300,7 +300,53 @@ TEST(builder, scaled_QC_with_bias_and_relu)
copy_data(c, c_data);
auto result = backend->create_tensor(element::u8, shape_r);
backend->call_with_validate(f, {result}, {a, b, c});
EXPECT_EQ((vector<uint8_t>{0, 0, 0, 0, 0, 0, 96, 133, 117}), read_vector<uint8_t>(result));
EXPECT_EQ((vector<uint8_t>{0, 0, 0, 0, 0, 0, 191, 255, 234}), read_vector<uint8_t>(result));
}
TEST(builder, scaled_QC_with_f32_bias_and_relu)
{
Shape shape_a{1, 1, 3, 3}; // input shape
Shape shape_b{1, 1, 3, 3}; // filter shape
Shape shape_r{1, 1, 3, 3}; // output shape
vector<uint8_t> a_data = {1, 2, 3, 4, 5, 6, 7, 8, 9};
vector<int8_t> b_data = {1, 2, 1, 0, 0, 0, -1, -2, -1};
vector<float> c_data = {5};
auto A = make_shared<op::Parameter>(element::u8, shape_a);
auto B = make_shared<op::Parameter>(element::i8, shape_b);
auto Bias = make_shared<op::Parameter>(element::f32, Shape{1});
auto C = op::Constant::create(element::f32, Shape{}, {0.0f});
auto D = op::Constant::create(element::f32, Shape{}, {255.0f});
auto E = op::Constant::create(element::f32, Shape{}, {-127.0f});
auto F = op::Constant::create(element::f32, Shape{}, {127.0f});
auto G = op::Constant::create(element::f32, Shape{}, {20.0f});
auto H = op::Constant::create(element::f32, Shape{}, {-24.0f});
auto CV = ngraph::builder::ScaledQuantizedConvolutionBias(A,
B,
Bias,
Strides{1, 1}, // move_strides
Strides{1, 1}, // filter_dilation
CoordinateDiff{1, 1}, // below_pads
CoordinateDiff{1, 1}, // above_pads
Strides{1, 1}, // data_dilation
C,
D,
E,
F,
G,
H,
true);
auto f = make_shared<Function>(NodeVector{CV}, op::ParameterVector{A, B, Bias});
auto backend = runtime::Backend::create("CPU");
// Create some tensors for input/output
auto a = backend->create_tensor(element::u8, shape_a);
copy_data(a, a_data);
auto b = backend->create_tensor(element::i8, shape_b);
copy_data(b, b_data);
auto c = backend->create_tensor(element::f32, Shape{1});
copy_data(c, c_data);
auto result = backend->create_tensor(element::u8, shape_r);
backend->call_with_validate(f, {result}, {a, b, c});
EXPECT_EQ((vector<uint8_t>{0, 0, 0, 0, 0, 0, 191, 255, 234}), read_vector<uint8_t>(result));
}
TEST(builder, scaled_Q_unsigned)
......