Commit b8106133 authored by Jayaram Bobba, committed by Scott Cyphers

More quantized fusion patterns (#2480)

* Add QuantizedConcat

* Remove unused variables and add check for size of mins and maxes vector

* Resolve conflicts

* Merged with master and addressed some PR feedback

* Maxpool and Avgpool fusions. Exclude Q from conv+relu fusion

* Remove single-user check from fusions

* Quantized concat fusion

* workaround: do reshape sinking by default

* style fix

* check scales for QuantizedConcat

* use compare_constants

* remove stale comment

* Handle all concat cases from arg size 2 to 6

* addressed feedback
parent 3863180d
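
A note for readers (added commentary, not part of the commit): all of the rewrites below exploit the fact that Dequantize is the affine map q ↦ s·(q − z) with scale s > 0. Operations that are monotone (MaxPool, Relu) or that merely rearrange elements (Concat) commute with that map whenever the zero point is zero, or is shared by all inputs, which is precisely what the callbacks check before sinking the Dequantize below the op:

\[
\max_i\, s\,(q_i - z) \;=\; s\,\big(\max_i q_i - z\big),
\qquad
\mathrm{relu}(s\,q) \;=\; s\cdot\mathrm{relu}(q)
\quad (s > 0,\; z = 0).
\]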
@@ -21,9 +21,11 @@
#include <unordered_set>
#include "cpu_fusion.hpp"
#include "ngraph/builder/make_constant.hpp"
#include "ngraph/graph_util.hpp"
#include "ngraph/log.hpp"
#include "ngraph/op/add.hpp"
#include "ngraph/op/avg_pool.hpp"
#include "ngraph/op/batch_norm.hpp"
#include "ngraph/op/broadcast.hpp"
#include "ngraph/op/broadcast.hpp"
@@ -35,9 +37,12 @@
#include "ngraph/op/divide.hpp"
#include "ngraph/op/dot.hpp"
#include "ngraph/op/exp.hpp"
#include "ngraph/op/experimental/quantized_avg_pool.hpp"
#include "ngraph/op/experimental/quantized_concat.hpp"
#include "ngraph/op/experimental/quantized_conv.hpp"
#include "ngraph/op/experimental/quantized_conv_bias.hpp"
#include "ngraph/op/experimental/quantized_conv_relu.hpp"
#include "ngraph/op/experimental/quantized_max_pool.hpp"
#include "ngraph/op/get_output_element.hpp"
#include "ngraph/op/max_pool.hpp"
#include "ngraph/op/maximum.hpp"
@@ -1870,7 +1875,7 @@ void ngraph::runtime::cpu::pass::CPUFusion::construct_update_slice()
this->add_matcher(m);
}
- // QuantizedConvolution + Dequantize + Relu + Quantize -> QuantizedConvolutionRelu
// QuantizedConvolution + Dequantize + Relu -> QuantizedConvolutionRelu + Dequantize
void ngraph::runtime::cpu::pass::CPUQuantFusion::construct_qconv_relu(bool with_bias)
{
Shape shape{2, 2, 1, 1};
@@ -1879,9 +1884,6 @@ void ngraph::runtime::cpu::pass::CPUQuantFusion::construct_qconv_relu(bool with_
auto requantization_scale = std::make_shared<pattern::op::Label>(element::f32, Shape{});
auto dq_scale = std::make_shared<pattern::op::Label>(element::f32, Shape{});
auto dq_zp = std::make_shared<pattern::op::Label>(element::i8, Shape{});
- auto q_scale = std::make_shared<pattern::op::Label>(element::f32, Shape{});
- auto q_zp = std::make_shared<pattern::op::Label>(element::u8, Shape{});
- op::Quantize::RoundMode round_mode = op::Quantize::RoundMode::ROUND_NEAREST_TOWARD_EVEN;
std::shared_ptr<ngraph::op::Op> qconv;
if (with_bias)
@@ -1911,31 +1913,16 @@ void ngraph::runtime::cpu::pass::CPUQuantFusion::construct_qconv_relu(bool with_
}
auto dq = std::make_shared<op::Dequantize>(qconv, dq_scale, dq_zp, element::f32, AxisSet{});
auto relu = std::make_shared<op::Relu>(dq);
- auto q =
-     std::make_shared<op::Quantize>(relu, q_scale, q_zp, element::u8, AxisSet{}, round_mode);
pattern::graph_rewrite_callback callback = [with_bias](pattern::Matcher& m) {
NGRAPH_DEBUG << "In a callback for construct_qconv_relu against "
<< m.get_match_root()->get_name();
- auto q_m = std::static_pointer_cast<op::Quantize>(m.get_match_root());
- auto dq_m = std::static_pointer_cast<op::Dequantize>(q_m->get_argument(0)->get_argument(0));
- if (!(ngraph::is_zero(q_m->get_argument(2)) && ngraph::is_zero(dq_m->get_argument(2))))
- {
-     NGRAPH_DEBUG << "Non-zero zero points";
-     return false;
- }
- if (q_m->get_round_mode() != op::Quantize::RoundMode::ROUND_NEAREST_TOWARD_EVEN)
- {
-     NGRAPH_DEBUG << "Unsupported round mode for fused kernel";
-     return false;
- }
auto dq_m = std::static_pointer_cast<op::Dequantize>(m.get_match_root()->get_argument(0));
- if (q_m->get_element_type() != element::u8)
if (!(ngraph::is_zero(dq_m->get_argument(2))))
{
NGRAPH_DEBUG << "Quantize op produces non uint8 output";
NGRAPH_DEBUG << "Non-zero zero point";
return false;
}
@@ -1960,9 +1947,6 @@ void ngraph::runtime::cpu::pass::CPUQuantFusion::construct_qconv_relu(bool with_
{
auto qconv_m =
std::static_pointer_cast<op::QuantizedConvolutionBias>(dq_m->get_argument(0));
- // Rescale to q_m's scales directly
- auto requant_scale =
-     qconv_m->get_argument(3) * dq_m->get_argument(1) / q_m->get_argument(1);
qconv_n = std::make_shared<op::QuantizedConvolutionBias>(
qconv_m->get_argument(0),
qconv_m->get_argument(1),
@@ -1972,16 +1956,13 @@ void ngraph::runtime::cpu::pass::CPUQuantFusion::construct_qconv_relu(bool with_
qconv_m->get_padding_below(),
qconv_m->get_padding_above(),
qconv_m->get_data_dilation_strides(),
- requant_scale,
qconv_m->get_argument(3),
true);
}
else
{
auto qconv_m =
std::static_pointer_cast<op::QuantizedConvolution>(dq_m->get_argument(0));
- // Rescale to q_m's scales directly
- auto requant_scale =
-     qconv_m->get_argument(2) * dq_m->get_argument(1) / q_m->get_argument(1);
qconv_n = std::make_shared<op::QuantizedConvolutionRelu>(
qconv_m->get_argument(0),
qconv_m->get_argument(1),
@@ -1990,24 +1971,158 @@ void ngraph::runtime::cpu::pass::CPUQuantFusion::construct_qconv_relu(bool with_
qconv_m->get_padding_below(),
qconv_m->get_padding_above(),
qconv_m->get_data_dilation_strides(),
- requant_scale);
qconv_m->get_argument(2));
}
- ngraph::replace_node(m.get_match_root(), qconv_n);
auto zp =
builder::make_constant<uint8_t>(element::u8, dq_m->get_argument(1)->get_shape(), 0);
auto dq_n = std::make_shared<op::Dequantize>(
qconv_n, dq_m->get_argument(1), zp, dq_m->get_output_element_type(0), dq_m->get_axes());
ngraph::replace_node(m.get_match_root(), dq_n);
return true;
};
std::shared_ptr<pattern::Matcher> m;
if (with_bias)
{
- m = std::make_shared<pattern::Matcher>(q, callback, "CPUQuantFusion.QConvBiasRelu");
m = std::make_shared<pattern::Matcher>(relu, callback, "CPUQuantFusion.QConvBiasRelu");
}
else
{
- m = std::make_shared<pattern::Matcher>(q, callback, "CPUQuantFusion.QConvRelu");
m = std::make_shared<pattern::Matcher>(relu, callback, "CPUQuantFusion.QConvRelu");
}
this->add_matcher(m);
}
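
To see why the rewritten callback insists on a zero zero-point before sinking the Dequantize below the Relu, here is a minimal standalone check of the underlying identity (plain scalar C++ for illustration; these helpers are not nGraph API):

#include <algorithm>
#include <cassert>
#include <cstdint>

// Scalar model of Dequantize: x = scale * (q - zero_point).
float dequantize(int8_t q, float scale, int8_t zp)
{
    return scale * (q - zp);
}

int main()
{
    const float scale = 0.5f;
    for (int q = -128; q <= 127; ++q)
    {
        // With zero_point == 0 and scale > 0,
        // relu(dequantize(q)) == dequantize(max(q, 0)),
        // which is what justifies QuantizedConvolutionRelu + Dequantize.
        float lhs = std::max(dequantize(static_cast<int8_t>(q), scale, 0), 0.0f);
        float rhs = dequantize(static_cast<int8_t>(std::max(q, 0)), scale, 0);
        assert(lhs == rhs);
    }
    return 0;
}

With a non-zero zero point the identity breaks (the float Relu clamps at 0.0f, not at s·(0 − z)), which is why the callback bails out early.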
// Dequantize + AvgPool -> QuantizedAvgPool + Dequantize
void ngraph::runtime::cpu::pass::CPUQuantFusion::construct_qavg_pool()
{
Shape shape{2, 2, 1, 1};
auto input = std::make_shared<pattern::op::Label>(element::i8, shape);
auto dq_scale = std::make_shared<pattern::op::Label>(element::f32, Shape{});
auto dq_zp = std::make_shared<pattern::op::Label>(element::i8, Shape{});
auto dq = std::make_shared<op::Dequantize>(input, dq_scale, dq_zp, element::f32, AxisSet{});
auto avg_pool = std::make_shared<op::AvgPool>(dq, Shape{1, 1});
pattern::graph_rewrite_callback callback = [](pattern::Matcher& m) {
NGRAPH_DEBUG << "In a callback for construct_qavg_pool against "
<< m.get_match_root()->get_name();
auto avg_pool_m = std::static_pointer_cast<op::AvgPool>(m.get_match_root());
auto dq_m = std::static_pointer_cast<op::Dequantize>(avg_pool_m->get_argument(0));
auto qavg_pool_n = std::make_shared<op::QuantizedAvgPool>(
dq_m->get_argument(0),
avg_pool_m->get_window_shape(),
avg_pool_m->get_window_movement_strides(),
avg_pool_m->get_padding_below(),
avg_pool_m->get_padding_above(),
avg_pool_m->get_include_padding_in_avg_computation());
auto dq_n = std::make_shared<op::Dequantize>(qavg_pool_n,
dq_m->get_argument(1),
dq_m->get_argument(2),
dq_m->get_output_element_type(0),
dq_m->get_axes());
ngraph::replace_node(m.get_match_root(), dq_n);
return true;
};
this->add_matcher(
std::make_shared<pattern::Matcher>(avg_pool, callback, "CPUQuantFusion.QAvgPool"));
}
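
A caveat worth recording (an observation about the rewrite, not text from the commit): unlike max, averaging does not commute exactly with dequantization, because the mean of integers is generally not an integer. At scale 0.5, for example, the window {1, 2} dequantizes to {0.5, 1.0} with float average 0.75, whereas an integer AvgPool kernel has to round the quantized mean 1.5 to a whole number before the final Dequantize, yielding 0.5 or 1.0. The fused graph's output can therefore differ from the unfused one by up to scale/2 per element, depending on the rounding of the underlying QuantizedAvgPool kernel.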
// Dequantize + Maxpool -> QuantizedMaxpool + Dequantize
void ngraph::runtime::cpu::pass::CPUQuantFusion::construct_qmax_pool()
{
Shape shape{2, 2, 1, 1};
auto input = std::make_shared<pattern::op::Label>(element::i8, shape);
auto dq_scale = std::make_shared<pattern::op::Label>(element::f32, Shape{});
auto dq_zp = std::make_shared<pattern::op::Label>(element::i8, Shape{});
auto dq = std::make_shared<op::Dequantize>(input, dq_scale, dq_zp, element::f32, AxisSet{});
auto max_pool = std::make_shared<op::MaxPool>(dq, Shape{1, 1});
pattern::graph_rewrite_callback callback = [](pattern::Matcher& m) {
NGRAPH_DEBUG << "In a callback for construct_qmax_pool against "
<< m.get_match_root()->get_name();
auto max_pool_m = std::static_pointer_cast<op::MaxPool>(m.get_match_root());
auto dq_m = std::static_pointer_cast<op::Dequantize>(max_pool_m->get_argument(0));
auto qmax_pool_n =
std::make_shared<op::QuantizedMaxPool>(dq_m->get_argument(0),
max_pool_m->get_window_shape(),
max_pool_m->get_window_movement_strides(),
max_pool_m->get_padding_below(),
max_pool_m->get_padding_above());
auto dq_n = std::make_shared<op::Dequantize>(qmax_pool_n,
dq_m->get_argument(1),
dq_m->get_argument(2),
dq_m->get_output_element_type(0),
dq_m->get_axes());
ngraph::replace_node(m.get_match_root(), dq_n);
return true;
};
this->add_matcher(
std::make_shared<pattern::Matcher>(max_pool, callback, "CPUQuantFusion.QMaxPool"));
}
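
MaxPool, by contrast, commutes with Dequantize exactly: for s > 0 the map q ↦ s·(q − z) is strictly increasing, so the max of dequantized values is the dequantized max. A tiny standalone illustration (plain C++, assuming a zero zero-point; not nGraph API):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

int main()
{
    const float scale = 0.25f;
    std::vector<int8_t> window{-3, 7, 2, -1};

    // Max over raw quantized values, then dequantize.
    int8_t int_max = *std::max_element(window.begin(), window.end());

    // Dequantize each value, then take the max.
    float float_max = scale * window.front();
    for (int8_t q : window)
    {
        float_max = std::max(float_max, scale * q);
    }

    // Exact equality: no rounding is involved, unlike the AvgPool case.
    assert(float_max == scale * int_max);
    return 0;
}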
// {Dequantize}* + Concat -> QuantizedConcat + Dequantize
void ngraph::runtime::cpu::pass::CPUQuantFusion::construct_qconcat()
{
Shape shape{2, 2, 1, 1};
NodeVector inputs;
NodeVector concats;
// Pattern matcher looks for concats with exact number of inputs
inputs.push_back(std::make_shared<pattern::op::Label>(element::f32, shape));
// Concat2, Concat3, ... Concat6
for (size_t i = 0; i < 5; i++)
{
inputs.push_back(std::make_shared<pattern::op::Label>(element::f32, shape));
concats.push_back(std::make_shared<op::Concat>(inputs, 0));
}
pattern::graph_rewrite_callback callback = [](pattern::Matcher& m) {
NGRAPH_DEBUG << "In a callback for construct_qconcat against "
<< m.get_match_root()->get_name();
auto concat_m = std::static_pointer_cast<op::Concat>(m.get_match_root());
auto dq_m = std::static_pointer_cast<op::Dequantize>(concat_m->get_argument(0));
NodeVector new_args;
for (auto arg : concat_m->get_arguments())
{
if (arg->description() != "Dequantize")
{
return false;
}
// ensure dequant scales are same
if (!ngraph::compare_constants(arg->get_argument(1), dq_m->get_argument(1)))
{
NGRAPH_DEBUG << "QuantizedConcat: Dequantize scale must be same";
return false;
}
new_args.push_back(arg->get_argument(0));
}
auto concat_n =
std::make_shared<op::QuantizedConcat>(new_args, concat_m->get_concatenation_axis());
auto dq_n = std::make_shared<op::Dequantize>(concat_n,
dq_m->get_argument(1),
dq_m->get_argument(2),
dq_m->get_element_type(),
dq_m->get_axes());
ngraph::replace_node(m.get_match_root(), dq_n);
return true;
};
for (size_t i = 0; i < 5; i++)
{
this->add_matcher(std::make_shared<pattern::Matcher>(
concats[i], callback, "CPUQuantFusion.QConcat" + std::to_string(i + 2)));
}
}
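
Why the scale check matters: Concat performs no arithmetic, so a single Dequantize can only be hoisted past it if every input sits on the same quantization grid; with differing scales, concatenating the raw integers under one output scale would silently rescale some inputs (0.5·q and 0.25·q cannot share one scale). Inputs with mismatched scales would need a requantization step first, which this pass does not attempt. Note also that, as written, the callback compares only the scales (argument 1 of each Dequantize) and reuses dq_m's zero point for the fused output; the zero points of the other inputs are not compared.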
void ngraph::runtime::cpu::pass::CPUQuantFusion::construct_dq_q()
{
Shape shape{2, 2, 1, 1};
@@ -2103,7 +2218,6 @@ void ngraph::runtime::cpu::pass::CPUQuantFusion::construct_qconvb_add()
//Add left + right
auto add = skipb_l + skipb_r;
- ;
auto prelu = std::make_shared<op::Relu>(add);
pattern::graph_rewrite_callback callback = [dq_l_label, dq_r_label](pattern::Matcher& m) {
......
@@ -111,12 +111,18 @@ public:
{
construct_qconv_relu(true);
construct_qconv_relu(false);
construct_qavg_pool();
construct_qmax_pool();
construct_qconcat();
construct_qconvb_add();
construct_dq_q();
}
private:
void construct_qconv_relu(bool with_bias);
void construct_qavg_pool();
void construct_qmax_pool();
void construct_qconcat();
void construct_dq_q();
void construct_qconvb_add();
};
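
For context, a hedged sketch of how one could run this pass by hand over a standalone Function (the CPU backend registers CPUQuantFusion internally during compilation; run_quant_fusions is a hypothetical helper, but the pass-manager calls follow the standard nGraph API):

#include <memory>

#include "cpu_fusion.hpp"
#include "ngraph/pass/manager.hpp"

// Illustrative only: apply the quantized fusions to a Function.
void run_quant_fusions(std::shared_ptr<ngraph::Function> f)
{
    ngraph::pass::Manager pass_manager;
    // CPUQuantFusion's constructor (see above) registers every matcher:
    // qconv+relu, qavg_pool, qmax_pool, qconcat, qconvb_add, dq_q.
    pass_manager.register_pass<ngraph::runtime::cpu::pass::CPUQuantFusion>();
    pass_manager.run_passes(f);
}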
@@ -30,6 +30,7 @@
#include "ngraph/op/batch_norm.hpp"
#include "ngraph/op/concat.hpp"
#include "ngraph/op/dequantize.hpp"
#include "ngraph/op/experimental/quantized_concat.hpp"
#include "ngraph/op/experimental/quantized_conv.hpp"
#include "ngraph/op/experimental/quantized_conv_bias.hpp"
#include "ngraph/op/get_output_element.hpp"
@@ -3604,6 +3605,134 @@ TEST(cpu_quant_fusion, qconvb_relu)
EXPECT_TRUE(test::all_close(cpu1_results.at(0), cpu2_results.at(0)));
}
TEST(cpu_quant_fusion, qavg_pool)
{
auto make_function = []() {
Shape shape_input{1, 2, 4, 4};
auto input = std::make_shared<op::Parameter>(element::f32, shape_input);
auto input_scale = op::Constant::create(element::f32, Shape{}, {2.0f});
auto weights_scale = op::Constant::create(element::f32, Shape{}, {2.0f});
auto int8_zero = op::Constant::create(element::i8, Shape{}, {0});
auto uint8_zero = op::Constant::create(element::u8, Shape{}, {0});
op::Quantize::RoundMode round_mode = op::Quantize::RoundMode::ROUND_NEAREST_TOWARD_EVEN;
auto q_input = std::make_shared<op::Quantize>(
input, input_scale, uint8_zero, element::u8, AxisSet{}, round_mode);
auto dq = std::make_shared<op::Dequantize>(
q_input, input_scale, uint8_zero, element::f32, AxisSet{});
auto avg_pool = std::make_shared<op::AvgPool>(dq, Shape{2, 2});
return make_shared<Function>(NodeVector{avg_pool}, ParameterVector{input});
};
auto cpu_f1 = make_function();
auto cpu_f2 = make_function();
test::Uniform<float> rng(4.0f, 4.0f);
vector<vector<float>> args;
for (shared_ptr<op::Parameter> param : cpu_f1->get_parameters())
{
vector<float> tensor_val(shape_size(param->get_shape()));
rng.initialize(tensor_val);
args.push_back(tensor_val);
}
set_environment("NGRAPH_PASS_ENABLES", "CPUQuantFusion:0", 1);
auto cpu1_results = execute(cpu_f1, args, "CPU");
set_environment("NGRAPH_PASS_ENABLES", "CPUQuantFusion:1", 1);
auto cpu2_results = execute(cpu_f2, args, "CPU");
EXPECT_TRUE(test::all_close(cpu1_results.at(0), cpu2_results.at(0)));
}
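
The test strategy here (and in the tests that follow) is numerical rather than structural: two identical functions are built, NGRAPH_PASS_ENABLES toggles CPUQuantFusion off for the first run and on for the second, and the two sets of outputs are compared with all_close. Note the rng range (4.0f, 4.0f): min and max coincide, so every input element is exactly 4.0; presumably this keeps the integer average exact and sidesteps the AvgPool rounding caveat noted earlier.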
TEST(cpu_quant_fusion, qmax_pool)
{
auto make_function = []() {
Shape shape_input{1, 2, 4, 4};
auto input = std::make_shared<op::Parameter>(element::f32, shape_input);
auto input_scale = op::Constant::create(element::f32, Shape{}, {2.0f});
auto weights_scale = op::Constant::create(element::f32, Shape{}, {2.0f});
auto int8_zero = op::Constant::create(element::i8, Shape{}, {0});
auto uint8_zero = op::Constant::create(element::u8, Shape{}, {0});
op::Quantize::RoundMode round_mode = op::Quantize::RoundMode::ROUND_NEAREST_TOWARD_EVEN;
auto q_input = std::make_shared<op::Quantize>(
input, input_scale, uint8_zero, element::u8, AxisSet{}, round_mode);
auto dq = std::make_shared<op::Dequantize>(
q_input, input_scale, uint8_zero, element::f32, AxisSet{});
auto maxpool = std::make_shared<op::MaxPool>(dq, Shape{2, 2});
return make_shared<Function>(NodeVector{maxpool}, ParameterVector{input});
};
auto cpu_f1 = make_function();
auto cpu_f2 = make_function();
test::Uniform<float> rng(1.0f, 10.0f);
vector<vector<float>> args;
for (shared_ptr<op::Parameter> param : cpu_f1->get_parameters())
{
vector<float> tensor_val(shape_size(param->get_shape()));
rng.initialize(tensor_val);
args.push_back(tensor_val);
}
set_environment("NGRAPH_PASS_ENABLES", "CPUQuantFusion:0", 1);
auto cpu1_results = execute(cpu_f1, args, "CPU");
set_environment("NGRAPH_PASS_ENABLES", "CPUQuantFusion:1", 1);
auto cpu2_results = execute(cpu_f2, args, "CPU");
EXPECT_TRUE(test::all_close(cpu1_results.at(0), cpu2_results.at(0)));
}
TEST(cpu_quant_fusion, qconcat)
{
auto make_function = []() {
auto get_input_slice = [](std::shared_ptr<op::Parameter>& input) {
auto input_scale = op::Constant::create(element::f32, Shape{}, {2.0f});
auto int8_zero = op::Constant::create(element::i8, Shape{}, {0});
auto uint8_zero = op::Constant::create(element::u8, Shape{}, {0});
op::Quantize::RoundMode round_mode = op::Quantize::RoundMode::ROUND_NEAREST_TOWARD_EVEN;
auto q_input = std::make_shared<op::Quantize>(
input, input_scale, uint8_zero, element::u8, AxisSet{}, round_mode);
auto dq = std::make_shared<op::Dequantize>(
q_input, input_scale, uint8_zero, element::f32, AxisSet{});
return dq;
};
NodeVector concat_inputs, concats;
ParameterVector inputs;
Shape shape_input{1, 2, 4, 4};
inputs.push_back(std::make_shared<op::Parameter>(element::f32, shape_input));
concat_inputs.push_back(get_input_slice(inputs.back()));
// Concat2 -- Concat7
for (size_t i = 0; i < 6; i++)
{
inputs.push_back(std::make_shared<op::Parameter>(element::f32, shape_input));
concat_inputs.push_back(get_input_slice(inputs.back()));
concats.push_back(std::make_shared<op::Concat>(concat_inputs, 0));
}
return make_shared<Function>(concats, inputs);
};
auto cpu_f1 = make_function();
auto cpu_f2 = make_function();
test::Uniform<float> rng(2.0f, 2.0f);
vector<vector<float>> args;
for (shared_ptr<op::Parameter> param : cpu_f1->get_parameters())
{
vector<float> tensor_val(shape_size(param->get_shape()));
rng.initialize(tensor_val);
args.push_back(tensor_val);
}
set_environment("NGRAPH_PASS_ENABLES", "CPUQuantFusion:0", 1);
auto cpu1_results = execute(cpu_f1, args, "CPU");
set_environment("NGRAPH_PASS_ENABLES", "CPUQuantFusion:1", 1);
auto cpu2_results = execute(cpu_f2, args, "CPU");
// Expect Concat2 -- Concat6 to be fused and not Concat7
ASSERT_EQ(count_ops_of_type<op::QuantizedConcat>(cpu_f2), 5);
EXPECT_TRUE(test::all_close(cpu1_results.at(0), cpu2_results.at(0)));
}
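
Here the structural assertion complements the numerical one: exactly five QuantizedConcat ops are expected because construct_qconcat builds one fixed-arity matcher for each input count from 2 through 6, so the seven-input Concat has no pattern to match and is left unfused.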
TEST(cpu_quant_fusion, dq_q)
{
auto make_function = [](bool match_scales = true, bool match_et = true) {
......