Commit 20c2325c authored by Nishant Patel, committed by Robert Kimball

Add support for Quantized Pooling (Max + Avg) ops via MKL-DNN for the IA backend (codegen + DEX) (#1571)

* Add support for Quantized Pooling (Max + Avg) ops via MKL-DNN for the IA backend (codegen + DEX)

* Add checks for min and max

* Extract the common code shared by codegen and DEX

* Use call_with_validate
parent 1cdae06e
@@ -52,6 +52,8 @@ set(SRC
builder/reduce_function.cpp
builder/reduce_function_window.cpp
builder/replace_slice.cpp
builder/quantized_max_pool.cpp
builder/quantized_avg_pool.cpp
builder/reshape.cpp
builder/reverse.cpp
builder/reverse_sequence.cpp
@@ -83,6 +85,8 @@ set(SRC
op/lstm.cpp
op/matmul_bias.cpp
op/max_pool_with_indices.cpp
op/quantized_max_pool.cpp
op/quantized_avg_pool.cpp
op/rnn.cpp
op/sigmoid_mul.cpp
op/conv_add.cpp
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "ngraph/runtime/cpu/op/quantized_avg_pool.hpp"
#include "ngraph/op/constant.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/mkldnn_invoke.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
using namespace std;
using namespace ngraph;
namespace ngraph
{
namespace runtime
{
namespace cpu
{
template <>
void Builder::BUILDER_DECL(ngraph::op::QuantizedAvgPool)
{
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
auto& functors = external_function->get_functors();
auto& arg_tensor = external_function->get_tensor_data(args[0].get_name());
auto& out_tensor = external_function->get_tensor_data(out[0].get_name());
auto& out1_tensor = external_function->get_tensor_data(out[1].get_name());
auto& out2_tensor = external_function->get_tensor_data(out[2].get_name());
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
vector<float> quant_util;
mkldnn_emitter->build_quantized_avg_pool(node, quant_util);
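// build_quantized_avg_pool fills quant_util as {min, max, primitive_index}:
// indices 0 and 1 feed the min/max outputs, index 2 keys the MKL-DNN primitive.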
auto& deps = mkldnn_emitter->get_primitive_deps(quant_util[2]);
auto functor = [&, quant_util](CPURuntimeContext* ctx) {
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], out_tensor);
*(static_cast<float*>(out1_tensor)) = quant_util[0];
*(static_cast<float*>(out2_tensor)) = quant_util[1];
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, quant_util[2]);
};
functors.emplace_back(functor);
}
else
{
throw ngraph_error("unsupported parameters for QuantizedAvgPool via DEX");
}
}
REGISTER_OP_BUILDER(QuantizedAvgPool);
}
}
}
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "ngraph/runtime/cpu/op/quantized_max_pool.hpp"
#include "ngraph/op/constant.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/mkldnn_invoke.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
using namespace std;
using namespace ngraph;
namespace ngraph
{
namespace runtime
{
namespace cpu
{
template <>
void Builder::BUILDER_DECL(ngraph::op::QuantizedMaxPool)
{
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
auto& functors = external_function->get_functors();
auto& arg_tensor = external_function->get_tensor_data(args[0].get_name());
auto& out_tensor = external_function->get_tensor_data(out[0].get_name());
auto& out1_tensor = external_function->get_tensor_data(out[1].get_name());
auto& out2_tensor = external_function->get_tensor_data(out[2].get_name());
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
vector<float> quant_util;
mkldnn_emitter->build_quantized_max_pool(node, quant_util);
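// quant_util follows the same {min, max, primitive_index} layout as in the
// QuantizedAvgPool builder above.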
auto& deps = mkldnn_emitter->get_primitive_deps(quant_util[2]);
auto functor = [&, quant_util](CPURuntimeContext* ctx) {
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], out_tensor);
*(static_cast<float*>(out1_tensor)) = quant_util[0];
*(static_cast<float*>(out2_tensor)) = quant_util[1];
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, quant_util[2]);
};
functors.emplace_back(functor);
}
else
{
throw ngraph_error("unsupported parameters for QuantizedMaxPool via DEX");
}
}
REGISTER_OP_BUILDER(QuantizedMaxPool);
}
}
}
@@ -109,6 +109,8 @@
#include "ngraph/runtime/cpu/op/lstm.hpp"
#include "ngraph/runtime/cpu/op/matmul_bias.hpp"
#include "ngraph/runtime/cpu/op/max_pool_with_indices.hpp"
#include "ngraph/runtime/cpu/op/quantized_avg_pool.hpp"
#include "ngraph/runtime/cpu/op/quantized_max_pool.hpp"
#include "ngraph/runtime/cpu/op/rnn.hpp"
#include "ngraph/runtime/cpu/op/sigmoid.hpp"
#include "ngraph/runtime/cpu/op/sigmoid_mul.hpp"
@@ -3072,6 +3074,54 @@ namespace ngraph
}
}
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::QuantizedMaxPool)
{
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
vector<float> quant_util;
mkldnn_emitter->build_quantized_max_pool(node, quant_util);
auto& deps = mkldnn_emitter->get_primitive_deps(quant_util[2]);
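// The min/max values are known at compile time, so they are emitted into
// the generated code as literals rather than read at runtime.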
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[0])
<< ", " << args[0].get_name() << ");\n";
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[1])
<< ", " << out[0].get_name() << ");\n";
writer << "*(" << out[1].get_name() << ") = " << quant_util[0] << ";\n";
writer << "*(" << out[2].get_name() << ") = " << quant_util[1] << ";\n";
writer << "cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, "
<< to_string(quant_util[2]) << ");\n";
}
else
{
throw ngraph_error("unsupported parameters for QuantizedMaxPool");
}
}
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::QuantizedAvgPool)
{
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
vector<float> quant_util;
mkldnn_emitter->build_quantized_avg_pool(node, quant_util);
auto& deps = mkldnn_emitter->get_primitive_deps(quant_util[2]);
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[0])
<< ", " << args[0].get_name() << ");\n";
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[1])
<< ", " << out[0].get_name() << ");\n";
writer << "*(" << out[1].get_name() << ") = " << quant_util[0] << ";\n";
writer << "*(" << out[2].get_name() << ") = " << quant_util[1] << ";\n";
writer << "cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, "
<< to_string(quant_util[2]) << ");\n";
}
else
{
throw ngraph_error("unsupported parameters for QuantizedAvgPool");
}
}
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::MaxPoolWithIndices)
{
@@ -149,6 +149,8 @@
#include "ngraph/runtime/cpu/op/lstm.hpp"
#include "ngraph/runtime/cpu/op/matmul_bias.hpp"
#include "ngraph/runtime/cpu/op/max_pool_with_indices.hpp"
#include "ngraph/runtime/cpu/op/quantized_avg_pool.hpp"
#include "ngraph/runtime/cpu/op/quantized_max_pool.hpp"
#include "ngraph/runtime/cpu/op/rnn.hpp"
#include "ngraph/runtime/cpu/op/sigmoid.hpp"
#include "ngraph/runtime/cpu/op/sigmoid_mul.hpp"
@@ -310,6 +312,8 @@ static const runtime::cpu::OpMap dispatcher{
&runtime::cpu::CPU_Emitter::emit<runtime::cpu::op::ConvertLayout>},
{TI(ngraph::op::Not), &runtime::cpu::CPU_Emitter::emit<op::Not>},
{TI(ngraph::op::MaxPool), &runtime::cpu::CPU_Emitter::emit<op::MaxPool>},
{TI(ngraph::op::QuantizedMaxPool), &runtime::cpu::CPU_Emitter::emit<op::QuantizedMaxPool>},
{TI(ngraph::op::QuantizedAvgPool), &runtime::cpu::CPU_Emitter::emit<op::QuantizedAvgPool>},
{TI(ngraph::op::MaxPoolWithIndices), &runtime::cpu::CPU_Emitter::emit<op::MaxPoolWithIndices>},
{TI(ngraph::op::Reverse), &runtime::cpu::CPU_Emitter::emit<op::Reverse>},
{TI(ngraph::op::ReverseSequence), &runtime::cpu::CPU_Emitter::emit<op::ReverseSequence>},
@@ -25,6 +25,8 @@
#include "ngraph/runtime/cpu/mkldnn_invoke.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#include "ngraph/runtime/cpu/op/dequantize.hpp"
#include "ngraph/runtime/cpu/op/quantized_avg_pool.hpp"
#include "ngraph/runtime/cpu/op/quantized_max_pool.hpp"
#include "ngraph/type/element_type.hpp"
using namespace ngraph::runtime::cpu;
@@ -147,6 +149,53 @@ size_t MKLDNNEmitter::build_dequantization(const ngraph::Node* node,
return dequantize_index;
}
void MKLDNNEmitter::build_quantized_max_pool(const ngraph::Node* node,
std::vector<float>& quant_util)
{
auto qmax_pool = static_cast<const ngraph::op::QuantizedMaxPool*>(node);
auto input_desc = mkldnn_utils::get_input_mkldnn_md(node, 0);
auto result_desc = mkldnn_utils::get_output_mkldnn_md(node, 0);
size_t qmax_pool_index = this->build_pooling_forward(mkldnn::algorithm::pooling_max,
input_desc,
result_desc,
qmax_pool->get_window_movement_strides(),
qmax_pool->get_window_shape(),
qmax_pool->get_padding_below(),
qmax_pool->get_padding_above());
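// The min and max inputs are required to be Constants (enforced by the op's
// constructor), so their values can be read here at primitive-build time.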
auto min_const_op = std::static_pointer_cast<ngraph::op::Constant>(qmax_pool->get_argument(1));
auto max_const_op = std::static_pointer_cast<ngraph::op::Constant>(qmax_pool->get_argument(2));
float min = *(static_cast<float const*>(min_const_op->get_data_ptr()));
float max = *(static_cast<float const*>(max_const_op->get_data_ptr()));
quant_util.push_back(min);
quant_util.push_back(max);
quant_util.push_back(qmax_pool_index);
}
void MKLDNNEmitter::build_quantized_avg_pool(const ngraph::Node* node,
std::vector<float>& quant_util)
{
auto qavg_pool = static_cast<const ngraph::op::QuantizedAvgPool*>(node);
auto input_desc = mkldnn_utils::get_input_mkldnn_md(node, 0);
auto result_desc = mkldnn_utils::get_output_mkldnn_md(node, 0);
size_t qavg_pool_index =
this->build_pooling_forward((qavg_pool->get_include_padding_in_avg_computation()
? mkldnn::algorithm::pooling_avg_include_padding
: mkldnn::algorithm::pooling_avg_exclude_padding),
input_desc,
result_desc,
qavg_pool->get_window_movement_strides(),
qavg_pool->get_window_shape(),
qavg_pool->get_padding_below(),
qavg_pool->get_padding_above());
auto min_const_op = std::static_pointer_cast<ngraph::op::Constant>(qavg_pool->get_argument(1));
auto max_const_op = std::static_pointer_cast<ngraph::op::Constant>(qavg_pool->get_argument(2));
float min = *(static_cast<float const*>(min_const_op->get_data_ptr()));
float max = *(static_cast<float const*>(max_const_op->get_data_ptr()));
quant_util.push_back(min);
quant_util.push_back(max);
quant_util.push_back(qavg_pool_index);
}
mkldnn::memory::format MKLDNNEmitter::query_convolution_forward_weight_format(
const mkldnn::memory::desc& input_data_desc,
const mkldnn::memory::desc& weights_desc_any,
@@ -519,6 +519,12 @@ namespace ngraph
const mkldnn::memory::desc& input_desc,
const mkldnn::memory::desc& result_desc);
void build_quantized_max_pool(const ngraph::Node* node,
std::vector<float>& quant_util);
void build_quantized_avg_pool(const ngraph::Node* node,
std::vector<float>& quant_util);
private:
std::vector<mkldnn::primitive*> m_mkldnn_primitives;
std::vector<mkldnn::stream> m_mkldnn_streams;
@@ -235,12 +235,12 @@ mkldnn::memory::desc runtime::cpu::mkldnn_utils::create_default_mkldnn_md(
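// Bugfix: query the element type of the requested port rather than always port 0.
// Quantized pooling ops have mixed output types (u8/i8 pooled data plus f32 min/max),
// so port 0's type is not valid for every output.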
if (output)
{
shape = node->get_output_shape(index);
- et = runtime::cpu::mkldnn_utils::get_mkldnn_data_type(node->get_output_element_type(0));
+ et = runtime::cpu::mkldnn_utils::get_mkldnn_data_type(node->get_output_element_type(index));
}
else
{
shape = node->get_input_shape(index);
- et = runtime::cpu::mkldnn_utils::get_mkldnn_data_type(node->get_input_element_type(0));
+ et = runtime::cpu::mkldnn_utils::get_mkldnn_data_type(node->get_input_element_type(index));
}
return memory::desc(memory::dims(shape.begin(), shape.end()), et, format);
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include "ngraph/graph_util.hpp"
#include "ngraph/op/op.hpp"
namespace ngraph
{
namespace op
{
/// \brief Batched average pooling operation, with optional padding and window stride.
///
class QuantizedAvgPool : public Op
{
public:
/// \brief Constructs a batched average pooling operation.
///
/// \param arg The node producing the input data batch tensor.<br>
/// `[d1, ..., dn]`
/// \param window_shape The window shape.<br>
/// `[n]`
/// \param window_movement_strides The window movement strides.<br>
/// `[n]`
/// \param padding_below The below-padding shape.<br>
/// `[n]`
/// \param padding_above The above-padding shape.<br>
/// `[n]`
/// \param include_padding_in_avg_computation If true then averages include padding
/// elements, each treated as the number zero. If false, padding elements are entirely
/// ignored when computing averages.
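/// \param min The node producing the minimum of the quantized input range.
/// Expected to be a constant; its value is forwarded to output 1.
/// \param max The node producing the maximum of the quantized input range.
/// Expected to be a constant; its value is forwarded to output 2.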
QuantizedAvgPool(const std::shared_ptr<Node>& arg,
const Shape& window_shape,
const Strides& window_movement_strides,
const Shape& padding_below,
const Shape& padding_above,
bool include_padding_in_avg_computation,
const std::shared_ptr<Node> min,
const std::shared_ptr<Node> max);
void validate_and_infer_types() override;
virtual std::shared_ptr<Node>
copy_with_new_args(const NodeVector& new_args) const override;
/// \return The window shape.
const Shape& get_window_shape() const { return m_window_shape; }
/// \return The window movement strides.
const Strides& get_window_movement_strides() const { return m_window_movement_strides; }
/// \return The below-padding shape.
const Shape& get_padding_below() const { return m_padding_below; }
/// \return The above-padding shape.
const Shape& get_padding_above() const { return m_padding_above; }
bool get_include_padding_in_avg_computation() const
{
return m_include_padding_in_avg_computation;
}
protected:
Shape m_window_shape;
Strides m_window_movement_strides;
Shape m_padding_below;
Shape m_padding_above;
bool m_include_padding_in_avg_computation;
};
}
}
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "quantized_max_pool.hpp"
#include "ngraph/function.hpp"
#include "ngraph/op/constant.hpp"
#include "ngraph/util.hpp"
using namespace std;
using namespace ngraph;
op::QuantizedMaxPool::QuantizedMaxPool(const shared_ptr<Node>& arg,
const Shape& window_shape,
const Strides& window_movement_strides,
const Shape& padding_below,
const Shape& padding_above,
const shared_ptr<Node> min,
const shared_ptr<Node> max)
: Op("QuantizedMaxPool", check_single_output_args({arg, min, max}))
, m_window_shape(window_shape)
, m_window_movement_strides(window_movement_strides)
, m_padding_below(padding_below)
, m_padding_above(padding_above)
{
constructor_validate_and_infer_types();
if (arg->get_element_type() != element::u8 && arg->get_element_type() != element::i8)
{
throw ngraph_error("Dequantization supported only for i8/u8!");
}
if (min->get_element_type() != max->get_element_type())
{
throw ngraph_error("Min's element type isn't equal to max's!");
}
if (!(std::dynamic_pointer_cast<op::Constant>(min) &&
std::dynamic_pointer_cast<op::Constant>(max)))
{
throw ngraph_error("Min and max have to be constants!");
}
}
void op::QuantizedMaxPool::validate_and_infer_types()
{
auto& arg_shape = get_input_shape(0);
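// Default any unspecified strides/padding to 1 and 0 over each spatial dimension.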
if (0 == m_window_movement_strides.size() && arg_shape.size() > 2)
{
m_window_movement_strides = Strides(arg_shape.size() - 2, 1);
}
if (0 == m_padding_below.size() && arg_shape.size() > 2)
{
m_padding_below = Shape(arg_shape.size() - 2, 0);
}
if (0 == m_padding_above.size() && arg_shape.size() > 2)
{
m_padding_above = Shape(arg_shape.size() - 2, 0);
}
//
// Make sure batch size and channel count are not zero, and that we have at least one spatial
// dimension (in other words, that arg has shape NCDi for some Di of rank>0, N != 0, C != 0).
//
NODE_VALIDATION_ASSERT(this, arg_shape.size() >= 3)
<< "Data input shape does not have rank of at least 3 (data input shape: " << arg_shape
<< ").";
size_t batch_size = arg_shape[0];
NODE_VALIDATION_ASSERT(this, batch_size != 0)
<< "Data batch size is zero (data input shape: " << arg_shape << ").";
size_t channel_count = arg_shape[1];
NODE_VALIDATION_ASSERT(this, channel_count != 0)
<< "Channel count is zero (data input shape: " << arg_shape << ").";
size_t spatial_dimension_count = arg_shape.size() - 2;
//
// Make sure window shape, window movement strides, and padding have same rank as Di.
//
NODE_VALIDATION_ASSERT(this, m_window_shape.size() == spatial_dimension_count)
<< "Window shape rank does not match number of spatial dimensions (window shape: "
<< m_window_shape << ", data input shape: " << arg_shape << ").";
NODE_VALIDATION_ASSERT(this, m_window_movement_strides.size() == spatial_dimension_count)
<< "Window movement stride rank does not match number of spatial dimensions (window "
"movement strides: "
<< m_window_movement_strides << ", data input shape: " << arg_shape << ").";
NODE_VALIDATION_ASSERT(this, m_padding_below.size() == spatial_dimension_count)
<< "Below-padding rank does not match number of spatial dimensions (padding below: "
<< m_padding_below << ", data input shape: " << arg_shape << ").";
NODE_VALIDATION_ASSERT(this, m_padding_above.size() == spatial_dimension_count)
<< "Above-padding rank does not match number of spatial dimensions (padding above: "
<< m_padding_above << ", data input shape: " << arg_shape << ").";
//
// Extract input item shape Di and make sure all dimensions are larger than 0.
//
Shape input_item_virtual_shape;
for (size_t i = 0; i < spatial_dimension_count; i++)
{
size_t dim_size = arg_shape[1 + 1 + i];
size_t virtual_dim_size = m_padding_below[i] + dim_size + m_padding_above[i];
input_item_virtual_shape.push_back(virtual_dim_size);
}
for (size_t i = 0; i < spatial_dimension_count; i++)
{
NODE_VALIDATION_ASSERT(this, input_item_virtual_shape[i] != 0)
<< "Data input spatial dimension " << i
<< " has zero length even after padding (virtual shape of input item: "
<< input_item_virtual_shape << ").";
}
//
// Make sure window shape dimensions are all larger than 0.
//
for (size_t i = 0; i < spatial_dimension_count; i++)
{
NODE_VALIDATION_ASSERT(this, m_window_shape[i] != 0)
<< "Window shape dimension " << i
<< " has zero length (window shape: " << m_window_shape << ").";
}
//
// Make sure the pooling window fits within the spatial dimensions.
//
for (size_t i = 0; i < spatial_dimension_count; i++)
{
NODE_VALIDATION_ASSERT(this, m_window_shape[i] <= input_item_virtual_shape[i])
<< "Window shape after padding is larger than the spatial dimensions (window shape: "
<< m_window_shape << ", virtual shape of input item: " << input_item_virtual_shape
<< ").";
}
//
// Compute output item shape Do, checking at the same time that all window movement strides are larger than 0.
//
Shape output_item_shape;
for (size_t i = 0; i < spatial_dimension_count; i++)
{
NODE_VALIDATION_ASSERT(this, m_window_movement_strides[i] != 0)
<< "Window movement strides dimension " << i
<< " has zero length (window movement strides: " << m_window_movement_strides << ").";
output_item_shape.push_back(ceil_div(input_item_virtual_shape[i] - m_window_shape[i] + 1,
m_window_movement_strides[i]));
}
//
// Construct result shape: NCDo.
//
Shape result_shape(1 + 1 + spatial_dimension_count);
result_shape[0] = batch_size;
result_shape[1] = channel_count;
copy(output_item_shape.begin(), output_item_shape.end(), result_shape.begin() + 2);
set_output_size(3);
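// Output 0 is the pooled data; outputs 1 and 2 forward the (min, max) range as f32 scalars.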
set_output_type(0, get_input_element_type(0), result_shape);
//TODO(nbpatel): Change to Shape{} once the mkldnn version is updated.
set_output_type(1, element::f32, Shape{1});
set_output_type(2, element::f32, Shape{1});
}
shared_ptr<Node> op::QuantizedMaxPool::copy_with_new_args(const NodeVector& new_args) const
{
check_new_args_count(this, new_args);
return make_shared<QuantizedMaxPool>(new_args.at(0),
m_window_shape,
m_window_movement_strides,
m_padding_below,
m_padding_above,
new_args.at(1),
new_args.at(2));
}
/*******************************************************************************
* Copyright 2017-2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#pragma once
#include "ngraph/graph_util.hpp"
#include "ngraph/op/op.hpp"
namespace ngraph
{
namespace op
{
class QuantizedMaxPool : public Op
{
public:
/// \brief Constructs a batched max pooling operation.
///
/// \param arg The node producing the input data batch tensor.
/// \param window_shape The window shape.
/// \param window_movement_strides The window movement strides.
/// \param padding_below The below-padding shape.
/// \param padding_above The above-padding shape.
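/// \param min The node producing the minimum of the quantized input range (must be a Constant; forwarded to output 1).
/// \param max The node producing the maximum of the quantized input range (must be a Constant; forwarded to output 2).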
QuantizedMaxPool(const std::shared_ptr<Node>& arg,
const Shape& window_shape,
const Strides& window_movement_strides,
const Shape& padding_below,
const Shape& padding_above,
const std::shared_ptr<Node> min,
const std::shared_ptr<Node> max);
void validate_and_infer_types() override;
virtual std::shared_ptr<Node>
copy_with_new_args(const NodeVector& new_args) const override;
const Shape& get_window_shape() const { return m_window_shape; }
const Strides& get_window_movement_strides() const { return m_window_movement_strides; }
const Shape& get_padding_below() const { return m_padding_below; }
const Shape& get_padding_above() const { return m_padding_above; }
protected:
Shape m_window_shape;
Strides m_window_movement_strides;
Shape m_padding_below;
Shape m_padding_above;
};
}
}
@@ -45,6 +45,8 @@
#include "ngraph/runtime/cpu/op/group_conv.hpp"
#include "ngraph/runtime/cpu/op/lstm.hpp"
#include "ngraph/runtime/cpu/op/max_pool_with_indices.hpp"
#include "ngraph/runtime/cpu/op/quantized_avg_pool.hpp"
#include "ngraph/runtime/cpu/op/quantized_max_pool.hpp"
#include "ngraph/runtime/cpu/op/rnn.hpp"
#include "ngraph/runtime/cpu/op/sigmoid.hpp"
@@ -670,6 +672,33 @@ namespace ngraph
}
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::QuantizedMaxPool)
{
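// MKL-DNN quantized pooling requires u8/i8 input; nodes that do not match
// are left unassigned and rejected later by the builders/emitters.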
if (node->get_input_element_type(0) == element::u8 ||
node->get_input_element_type(0) == element::i8)
{
auto quantized_mp = static_cast<op::QuantizedMaxPool*>(node);
auto op_annotations =
std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
op_annotations->set_mkldnn_op(true);
quantized_mp->set_op_annotations(op_annotations);
}
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::QuantizedAvgPool)
{
if (node->get_input_element_type(0) == element::u8 ||
node->get_input_element_type(0) == element::i8)
{
auto quantized_ap = static_cast<op::QuantizedAvgPool*>(node);
auto op_annotations =
std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
op_annotations->set_mkldnn_op(true);
quantized_ap->set_op_annotations(op_annotations);
}
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::BoundedRelu)
{
@@ -754,6 +783,10 @@ static const runtime::cpu::pass::AssignOpMap s_dispatcher{
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::SigmoidBackprop>},
{TI(ngraph::op::Lstm), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Lstm>},
{TI(ngraph::op::Rnn), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Rnn>},
{TI(ngraph::op::QuantizedMaxPool),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::QuantizedMaxPool>},
{TI(ngraph::op::QuantizedAvgPool),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::QuantizedAvgPool>},
{TI(ngraph::op::Softmax), &runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Softmax>},
{TI(ngraph::op::ConvolutionAdd),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::ConvolutionAdd>},
@@ -24,6 +24,8 @@
#include "ngraph/op/constant.hpp"
#include "ngraph/op/get_output_element.hpp"
#include "ngraph/runtime/cpu/op/dequantize.hpp"
#include "ngraph/runtime/cpu/op/quantized_avg_pool.hpp"
#include "ngraph/runtime/cpu/op/quantized_max_pool.hpp"
#include "util/all_close.hpp"
#include "util/all_close_f.hpp"
#include "util/ndarray.hpp"
@@ -34,6 +36,134 @@
using namespace std;
using namespace ngraph;
TEST(quantize_cpu, quantize_max_pool_2d_unsigned)
{
vector<uint8_t> a_data = {0, 1, 0, 2, 1, 0, 3, 2, 0, 0, 2, 0, 0, 0, 1};
Shape shape_a{1, 1, 3, 5};
Shape window_shape{2, 3};
auto window_movement_strides = Strides{1, 1};
Shape padding_below{0, 0};
Shape padding_above{0, 0};
auto A = make_shared<op::Parameter>(element::u8, shape_a);
Shape shape_r{1, 1, 2, 3};
auto B = op::Constant::create(element::f32, Shape{1}, {0.0f});
auto C = op::Constant::create(element::f32, Shape{1}, {255.0f});
auto QMP = make_shared<op::QuantizedMaxPool>(
A, window_shape, window_movement_strides, padding_below, padding_above, B, C);
auto output_data = std::make_shared<op::GetOutputElement>(QMP, 0);
auto output_min = std::make_shared<op::GetOutputElement>(QMP, 1);
auto output_max = std::make_shared<op::GetOutputElement>(QMP, 2);
auto f = make_shared<Function>(NodeVector{output_data, output_min, output_max},
op::ParameterVector{A});
auto backend = runtime::Backend::create("CPU");
// Create some tensors for input/output
auto a = backend->create_tensor(element::u8, shape_a);
copy_data(a, a_data);
auto result = backend->create_tensor(element::u8, shape_r);
auto result_min = backend->create_tensor(element::f32, Shape{1});
auto result_max = backend->create_tensor(element::f32, Shape{1});
backend->call_with_validate(f, {result, result_min, result_max}, {a});
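// Max over each 2x3 window of the 3x5 input gives {3, 3, 2, 3, 3, 2};
// the {0, 255} range constants pass through to outputs 1 and 2.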
EXPECT_EQ((vector<uint8_t>{3, 3, 2, 3, 3, 2}), read_vector<uint8_t>(result));
EXPECT_EQ((vector<float>{0.0}), read_vector<float>(result_min));
EXPECT_EQ((vector<float>{255.0}), read_vector<float>(result_max));
}
TEST(quantize_cpu, quantize_max_pool_2d_signed)
{
vector<int8_t> a_data = {0, 1, 0, -2, 1, 0, -3, 2, 0, 0, 2, 0, 0, 0, 1};
Shape shape_a{1, 1, 3, 5};
Shape window_shape{2, 3};
auto window_movement_strides = Strides{1, 1};
Shape padding_below{0, 0};
Shape padding_above{0, 0};
auto A = make_shared<op::Parameter>(element::i8, shape_a);
Shape shape_r{1, 1, 2, 3};
auto B = op::Constant::create(element::f32, Shape{1}, {0.0f});
auto C = op::Constant::create(element::f32, Shape{1}, {127.0f});
auto QMP = make_shared<op::QuantizedMaxPool>(
A, window_shape, window_movement_strides, padding_below, padding_above, B, C);
auto output_data = std::make_shared<op::GetOutputElement>(QMP, 0);
auto output_min = std::make_shared<op::GetOutputElement>(QMP, 1);
auto output_max = std::make_shared<op::GetOutputElement>(QMP, 2);
auto f = make_shared<Function>(NodeVector{output_data, output_min, output_max},
op::ParameterVector{A});
auto backend = runtime::Backend::create("CPU");
// Create some tensors for input/output
auto a = backend->create_tensor(element::i8, shape_a);
copy_data(a, a_data);
auto result = backend->create_tensor(element::i8, shape_r);
auto result_min = backend->create_tensor(element::f32, Shape{1});
auto result_max = backend->create_tensor(element::f32, Shape{1});
backend->call_with_validate(f, {result, result_min, result_max}, {a});
EXPECT_EQ((vector<int8_t>{2, 2, 2, 2, 2, 2}), read_vector<int8_t>(result));
EXPECT_EQ((vector<float>{0.0}), read_vector<float>(result_min));
EXPECT_EQ((vector<float>{127.0}), read_vector<float>(result_max));
}
TEST(quantize_cpu, quantize_avg_pool_2d_unsigned)
{
vector<uint8_t> a_data = {0, 1, 0, 2, 1, 0, 3, 2, 0, 0, 2, 0, 0, 0, 1};
Shape shape_a{1, 1, 3, 5};
Shape window_shape{2, 3};
auto window_movement_strides = Strides{1, 1};
Shape padding_below{0, 0};
Shape padding_above{0, 0};
auto A = make_shared<op::Parameter>(element::u8, shape_a);
Shape shape_r{1, 1, 2, 3};
auto B = op::Constant::create(element::f32, Shape{1}, {0.0f});
auto C = op::Constant::create(element::f32, Shape{1}, {255.0f});
auto QAP = make_shared<op::QuantizedAvgPool>(
A, window_shape, window_movement_strides, padding_below, padding_above, false, B, C);
auto output_data = std::make_shared<op::GetOutputElement>(QAP, 0);
auto output_min = std::make_shared<op::GetOutputElement>(QAP, 1);
auto output_max = std::make_shared<op::GetOutputElement>(QAP, 2);
auto f = make_shared<Function>(NodeVector{output_data, output_min, output_max},
op::ParameterVector{A});
auto backend = runtime::Backend::create("CPU");
// Create some tensors for input/output
auto a = backend->create_tensor(element::u8, shape_a);
copy_data(a, a_data);
auto result = backend->create_tensor(element::u8, shape_r);
auto result_min = backend->create_tensor(element::f32, Shape{1});
auto result_max = backend->create_tensor(element::f32, Shape{1});
backend->call_with_validate(f, {result, result_min, result_max}, {a});
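// Each 2x3 window sums to 6, 8, 5, 7, 5, 3; MKL-DNN rounds the six-element
// averages to {1, 1, 1, 1, 1, 0}.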
EXPECT_EQ((vector<uint8_t>{1, 1, 1, 1, 1, 0}), read_vector<uint8_t>(result));
EXPECT_EQ((vector<float>{0.0}), read_vector<float>(result_min));
EXPECT_EQ((vector<float>{255.0}), read_vector<float>(result_max));
}
TEST(quantize_cpu, quantize_avg_pool_2d_signed)
{
vector<int8_t> a_data = {10, 1, 0, -2, 1, 0, -3, 4, 0, 0, 2, 0, 0, 0, 1};
Shape shape_a{1, 1, 3, 5};
Shape window_shape{2, 3};
auto window_movement_strides = Strides{1, 1};
Shape padding_below{0, 0};
Shape padding_above{0, 0};
auto A = make_shared<op::Parameter>(element::i8, shape_a);
Shape shape_r{1, 1, 2, 3};
auto B = op::Constant::create(element::f32, Shape{1}, {0.0f});
auto C = op::Constant::create(element::f32, Shape{1}, {127.0f});
auto QAP = make_shared<op::QuantizedAvgPool>(
A, window_shape, window_movement_strides, padding_below, padding_above, false, B, C);
auto output_data = std::make_shared<op::GetOutputElement>(QAP, 0);
auto output_min = std::make_shared<op::GetOutputElement>(QAP, 1);
auto output_max = std::make_shared<op::GetOutputElement>(QAP, 2);
auto f = make_shared<Function>(NodeVector{output_data, output_min, output_max},
op::ParameterVector{A});
auto backend = runtime::Backend::create("CPU");
// Create some tensors for input/output
auto a = backend->create_tensor(element::i8, shape_a);
copy_data(a, a_data);
auto result = backend->create_tensor(element::i8, shape_r);
auto result_min = backend->create_tensor(element::f32, Shape{1});
auto result_max = backend->create_tensor(element::f32, Shape{1});
backend->call_with_validate(f, {result, result_min, result_max}, {a});
EXPECT_EQ((vector<int8_t>{2, 0, 0, 0, 0, 1}), read_vector<int8_t>(result));
EXPECT_EQ((vector<float>{0.0}), read_vector<float>(result_min));
EXPECT_EQ((vector<float>{127.0}), read_vector<float>(result_max));
}
template <typename T>
void DequantizeTest(int input, float min, float max, float expected_output)
{