Commit 1a6f8487 authored by Ashok Emani, committed by Scott Cyphers

QuantizedDot and QuantizedDotBias ops CPU impl (#2592)

* QuantizedDot and QuantizedDotBias ops CPU impl

* add builders and unittests

* fix CI issue

* fix GPU emitter build

* Remove duplicate line.
parent 60ca608c
@@ -140,6 +140,10 @@ set (SRC
op/experimental/quantized_max_pool.hpp
op/experimental/shape_of.cpp
op/experimental/shape_of.hpp
op/experimental/quantized_dot.cpp
op/experimental/quantized_dot.hpp
op/experimental/quantized_dot_bias.cpp
op/experimental/quantized_dot_bias.hpp
op/floor.cpp
op/floor.hpp
op/get_output_element.cpp
@@ -399,5 +399,69 @@ namespace ngraph
with_relu);
return make_shared<op::Convert>(qconv, element::u8);
}
}
}
std::shared_ptr<Node> ScaledQuantizedDotBias(std::shared_ptr<Node> input,
std::shared_ptr<Node> filters,
std::shared_ptr<Node> bias,
std::shared_ptr<Node> min_input,
std::shared_ptr<Node> max_input,
std::shared_ptr<Node> min_filter,
std::shared_ptr<Node> max_filter,
std::shared_ptr<Node> min_freezed_output,
std::shared_ptr<Node> max_freezed_output,
const bool requantize,
const bool with_relu)
{
auto requantization_scale =
quantization_util::get_dot_scale(min_input,
max_input,
min_filter,
max_filter,
min_freezed_output,
max_freezed_output,
input->get_element_type(),
with_relu ? element::u8 : element::i8,
requantize);
if (bias->get_element_type() != element::i32)
{
auto zero = make_constant(element::i32, min_input->get_shape(), 0);
AxisSet quantization_axes;
auto bias_scale =
quantization_util::get_bias_scale(min_input, max_input, min_filter, max_filter);
op::Quantize::RoundMode round_mode =
op::Quantize::RoundMode::ROUND_NEAREST_TOWARD_EVEN;
bias = make_shared<op::Quantize>(
bias, bias_scale, zero, element::i32, quantization_axes, round_mode);
}
return make_shared<op::QuantizedDotBias>(
input, filters, bias, requantization_scale, requantize, with_relu);
}
std::shared_ptr<Node> ScaledQuantizedDot(std::shared_ptr<Node> input,
std::shared_ptr<Node> filters,
std::shared_ptr<Node> min_input,
std::shared_ptr<Node> max_input,
std::shared_ptr<Node> min_filter,
std::shared_ptr<Node> max_filter,
std::shared_ptr<Node> min_freezed_output,
std::shared_ptr<Node> max_freezed_output,
const bool requantize,
const bool with_relu)
{
auto requantization_scale =
quantization_util::get_dot_scale(min_input,
max_input,
min_filter,
max_filter,
min_freezed_output,
max_freezed_output,
input->get_element_type(),
with_relu ? element::u8 : element::i8,
requantize);
return make_shared<op::QuantizedDot>(
input, filters, requantization_scale, requantize, with_relu);
}
} // namespace builder
} // namespace ngraph
@@ -24,6 +24,8 @@
#include "ngraph/op/experimental/quantized_conv.hpp"
#include "ngraph/op/experimental/quantized_conv_bias.hpp"
#include "ngraph/op/experimental/quantized_conv_relu.hpp"
#include "ngraph/op/experimental/quantized_dot.hpp"
#include "ngraph/op/experimental/quantized_dot_bias.hpp"
#include "ngraph/op/experimental/quantized_max_pool.hpp"
#include "ngraph/op/quantize.hpp"
@@ -151,5 +153,29 @@ namespace ngraph
std::shared_ptr<Node> min_freezed_output_conv_2,
std::shared_ptr<Node> max_freezed_output_conv_2,
const bool with_relu);
}
}
std::shared_ptr<Node> ScaledQuantizedDotBias(std::shared_ptr<Node> input,
std::shared_ptr<Node> filters,
std::shared_ptr<Node> bias,
std::shared_ptr<Node> min_input,
std::shared_ptr<Node> max_input,
std::shared_ptr<Node> min_filter,
std::shared_ptr<Node> max_filter,
std::shared_ptr<Node> min_freezed_output,
std::shared_ptr<Node> max_freezed_output,
const bool requantize = true,
const bool with_relu = false);
std::shared_ptr<Node> ScaledQuantizedDot(std::shared_ptr<Node> input,
std::shared_ptr<Node> filters,
std::shared_ptr<Node> min_input,
std::shared_ptr<Node> max_input,
std::shared_ptr<Node> min_filter,
std::shared_ptr<Node> max_filter,
std::shared_ptr<Node> min_freezed_output,
std::shared_ptr<Node> max_freezed_output,
const bool requantize = true,
const bool with_relu = false);
} // namespace builder
} // namespace ngraph
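For reference, a minimal usage sketch of the bias-taking builder entry point declared above (shapes are illustrative and mirror the unit tests added at the end of this commit; every min/max argument is an f32 scalar node, and "using namespace std" / "using namespace ngraph" are assumed as in the tests):

    auto A    = make_shared<op::Parameter>(element::u8, Shape{4, 3});  // quantized input, {M, K}
    auto B    = make_shared<op::Parameter>(element::i8, Shape{3, 3});  // quantized weights, {N, K}
    auto Bias = make_shared<op::Parameter>(element::i32, Shape{3});    // one bias per output row, {N}
    auto min_input  = make_shared<op::Parameter>(element::f32, Shape{1});
    auto max_input  = make_shared<op::Parameter>(element::f32, Shape{1});
    auto min_filter = make_shared<op::Parameter>(element::f32, Shape{1});
    auto max_filter = make_shared<op::Parameter>(element::f32, Shape{1});
    auto min_freezed_output = make_shared<op::Parameter>(element::f32, Shape{1});
    auto max_freezed_output = make_shared<op::Parameter>(element::f32, Shape{1});
    auto qdot_bias = ngraph::builder::ScaledQuantizedDotBias(A,
                                                             B,
                                                             Bias,
                                                             min_input,
                                                             max_input,
                                                             min_filter,
                                                             max_filter,
                                                             min_freezed_output,
                                                             max_freezed_output,
                                                             /*requantize=*/true,
                                                             /*with_relu=*/false);
    // If Bias is not already i32, the builder quantizes it to i32 (using get_bias_scale)
    // before constructing the QuantizedDotBias node.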
@@ -276,6 +276,47 @@ namespace ngraph
}
}
}
}
}
}
std::shared_ptr<Node> get_dot_scale(std::shared_ptr<Node> min_input,
std::shared_ptr<Node> max_input,
std::shared_ptr<Node> min_filter,
std::shared_ptr<Node> max_filter,
std::shared_ptr<Node> min_freezed_output,
std::shared_ptr<Node> max_freezed_output,
const ngraph::element::Type& input_type,
const ngraph::element::Type& output_type,
const bool requantize = true)
{
auto type = min_input->get_element_type();
if (type != max_input->get_element_type() ||
type != min_filter->get_element_type() ||
type != max_filter->get_element_type() ||
type != min_freezed_output->get_element_type() ||
type != max_freezed_output->get_element_type())
{
throw ngraph_error("get_dot_scale: min and max must have same type");
}
auto shape = min_input->get_shape();
if (shape != max_input->get_shape() || shape != min_filter->get_shape() ||
shape != max_filter->get_shape() || shape != min_freezed_output->get_shape() ||
shape != max_freezed_output->get_shape())
{
throw ngraph_error("get_dot_scale: min and max must have same shape");
}
auto data_scale = get_scale(min_input, max_input, input_type);
auto weight_scale = get_scale(min_filter, max_filter, element::i8);
auto out_scale = get_scale(min_freezed_output, max_freezed_output, output_type);
if (requantize)
{
return data_scale * weight_scale / out_scale;
}
else
{
return data_scale * weight_scale;
}
}
} // namespace quantization_util
} // namespace builder
} // namespace ngraph
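In effect, get_dot_scale above reduces to the following (a sketch; each scale comes from get_scale on the corresponding min/max pair):

    requantization_scale = data_scale * weight_scale / out_scale   // requantize == true
    requantization_scale = data_scale * weight_scale               // requantize == false

i.e. the product of the input and filter scales, optionally divided by the frozen output scale so the accumulated result can be mapped back into the requested output type.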
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <functional>
#include <memory>
#include <utility>
#include "ngraph/shape.hpp"
#include "quantized_dot.hpp"
using namespace std;
using namespace ngraph;
op::QuantizedDot::QuantizedDot(const shared_ptr<Node>& data,
const shared_ptr<Node>& weights,
const shared_ptr<Node>& scale,
bool requantize,
bool with_relu)
: Op("QuantizedDot", check_single_output_args({data, weights, scale}))
, m_requantize(requantize)
, m_with_relu(with_relu)
{
constructor_validate_and_infer_types();
auto& data_shape = data->get_shape();
auto& weights_shape = weights->get_shape();
NODE_VALIDATION_CHECK(this,
data_shape.size() == 2 && weights_shape.size() == 2 &&
data_shape[1] == weights_shape[1],
"only valid tensors of rank 2 supported. data shape ",
data_shape,
" weights shape ",
weights_shape);
auto output_et = requantize ? (with_relu ? element::u8 : element::i8) : element::f32;
set_output_type(0, output_et, Shape{data_shape[0], weights_shape[0]});
}
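A concrete reading of the validation and type inference above (a sketch with illustrative shapes; defaults taken from the header below):

    // data {M, K} dot weights {N, K} -> output {M, N}
    auto data    = make_shared<op::Parameter>(element::u8, Shape{4, 3});  // {M, K}
    auto weights = make_shared<op::Parameter>(element::i8, Shape{3, 3});  // {N, K}
    auto scale   = make_shared<op::Parameter>(element::f32, Shape{1});    // requantization scale
    auto qdot    = make_shared<op::QuantizedDot>(data, weights, scale);   // requantize=true, with_relu=false
    // qdot->get_shape() == Shape{4, 3} and qdot->get_element_type() == element::i8;
    // with_relu=true gives u8, requantize=false gives f32.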
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <utility>
#include "ngraph/op/op.hpp"
namespace ngraph
{
namespace op
{
class QuantizedDot : public Op
{
public:
QuantizedDot(const std::shared_ptr<Node>& data,
const std::shared_ptr<Node>& weights,
const std::shared_ptr<Node>& scale,
bool requantize = true,
bool with_relu = false);
virtual std::shared_ptr<Node>
copy_with_new_args(const NodeVector& new_args) const override
{
check_new_args_count(this, new_args);
return std::make_shared<QuantizedDot>(
new_args.at(0), new_args.at(1), new_args.at(2), m_requantize, m_with_relu);
}
bool with_relu() const { return m_with_relu; }
bool requantize() const { return m_requantize; }
protected:
bool m_requantize;
bool m_with_relu;
};
} // namespace op
} // namespace ngraph
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <functional>
#include <memory>
#include <utility>
#include "ngraph/shape.hpp"
#include "quantized_dot_bias.hpp"
using namespace std;
using namespace ngraph;
op::QuantizedDotBias::QuantizedDotBias(const shared_ptr<Node>& data,
const shared_ptr<Node>& weights,
const shared_ptr<Node>& bias,
const shared_ptr<Node>& scale,
bool requantize,
bool with_relu)
: Op("QuantizedDotBias", check_single_output_args({data, weights, bias, scale}))
, m_requantize(requantize)
, m_with_relu(with_relu)
{
constructor_validate_and_infer_types();
auto& data_shape = data->get_shape();
auto& weights_shape = weights->get_shape();
auto& bias_shape = bias->get_shape();
NODE_VALIDATION_CHECK(this,
data_shape.size() == 2 && weights_shape.size() == 2 &&
data_shape[1] == weights_shape[1],
"only valid tensors of rank 2 supported. data ",
data_shape,
" weights ",
weights_shape);
NODE_VALIDATION_CHECK(this,
bias_shape.size() == 1 && bias_shape[0] == weights_shape[0],
"invalid bias ",
bias_shape);
auto output_et = requantize ? (with_relu ? element::u8 : element::i8) : element::f32;
set_output_type(0, output_et, Shape{data_shape[0], weights_shape[0]});
}
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include <utility>
#include "ngraph/op/op.hpp"
namespace ngraph
{
namespace op
{
class QuantizedDotBias : public Op
{
public:
QuantizedDotBias(const std::shared_ptr<Node>& data,
const std::shared_ptr<Node>& weights,
const std::shared_ptr<Node>& bias,
const std::shared_ptr<Node>& scale,
bool requantize = true,
bool with_relu = false);
virtual std::shared_ptr<Node>
copy_with_new_args(const NodeVector& new_args) const override
{
check_new_args_count(this, new_args);
return std::make_shared<QuantizedDotBias>(new_args.at(0),
new_args.at(1),
new_args.at(2),
new_args.at(3),
m_requantize,
m_with_relu);
}
bool with_relu() const { return m_with_relu; }
bool requantize() const { return m_requantize; }
protected:
bool m_requantize;
bool m_with_relu;
};
} // namespace op
} // namespace ngraph
@@ -115,6 +115,8 @@ NGRAPH_OP(QuantizedConvolutionBiasAdd, ngraph::op)
NGRAPH_OP(QuantizedConvolutionBiasSignedAdd, ngraph::op)
NGRAPH_OP(QuantizedConvolutionRelu, ngraph::op)
NGRAPH_OP(QuantizedConvolution, ngraph::op)
NGRAPH_OP(QuantizedDotBias, ngraph::op)
NGRAPH_OP(QuantizedDot, ngraph::op)
NGRAPH_OP(QuantizedMaxPool, ngraph::op)
NGRAPH_OP(Relu, ngraph::op)
NGRAPH_OP(ReluBackprop, ngraph::op)
@@ -42,8 +42,6 @@ set(SRC
builder/concat.cpp
builder/convert.cpp
builder/convert_layout.cpp
builder/quantized_conv.cpp
builder/quantized_concat.cpp
builder/convolution.cpp
builder/dot.cpp
builder/embedding_lookup.cpp
@@ -63,6 +61,8 @@ set(SRC
builder/quantization.cpp
builder/quantized_avg_pool.cpp
builder/quantized_conv.cpp
builder/quantized_concat.cpp
builder/quantized_dot.cpp
builder/quantized_max_pool.cpp
builder/reshape.cpp
builder/reverse.cpp
//*****************************************************************************
// Copyright 2017-2019 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "ngraph/op/experimental/quantized_dot.hpp"
#include "ngraph/op/constant.hpp"
#include "ngraph/op/experimental/quantized_dot_bias.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/cpu_executor.hpp"
#include "ngraph/runtime/cpu/mkldnn_invoke.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
using namespace std;
using namespace ngraph;
namespace ngraph
{
namespace runtime
{
namespace cpu
{
template <>
void Builder::BUILDER_DECL(ngraph::op::QuantizedDotBias)
{
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
auto& functors = external_function->get_functors();
auto& arg0_tensor = external_function->get_tensor_data(args[0].get_name());
auto& arg1_tensor = external_function->get_tensor_data(args[1].get_name());
auto& arg2_tensor = external_function->get_tensor_data(args[2].get_name());
auto& arg3_tensor = external_function->get_tensor_data(args[3].get_name());
auto& out0_tensor = external_function->get_tensor_data(out[0].get_name());
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto scales_size = shape_size(args[3].get_shape());
auto ip_desc =
mkldnn_emitter
->get_inner_product_forward_desc<ngraph::op::QuantizedDotBias>(node);
auto ip_attr =
mkldnn_emitter
->get_inner_product_forward_attr<ngraph::op::QuantizedDotBias>(node);
size_t ip_index = mkldnn_emitter->inner_product_forward_init(true);
auto& deps = mkldnn_emitter->get_primitive_deps(ip_index);
auto functor = [&, scales_size, ip_desc, ip_attr, deps, ip_index](
CPURuntimeContext* ctx, CPUExecutionContext* ectx) mutable {
if (ctx->first_iteration)
{
vector<float> dyn_scales;
dyn_scales.assign(static_cast<float*>(arg3_tensor),
static_cast<float*>(arg3_tensor) + scales_size);
ip_attr.set_output_scales(0, dyn_scales);
mkldnn_emitter->build_inner_product_forward<true>(
ip_desc, ip_attr, executor::global_cpu_engine, ip_index);
}
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], arg1_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[2], arg2_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[3], out0_tensor);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, ip_index);
};
functors.emplace_back(functor);
}
else
{
throw ngraph_error("unsupported parameters for QuantizedDotBias via DEX");
}
}
template <>
void Builder::BUILDER_DECL(ngraph::op::QuantizedDot)
{
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
auto& functors = external_function->get_functors();
auto& arg0_tensor = external_function->get_tensor_data(args[0].get_name());
auto& arg1_tensor = external_function->get_tensor_data(args[1].get_name());
auto& arg2_tensor = external_function->get_tensor_data(args[2].get_name());
auto& out0_tensor = external_function->get_tensor_data(out[0].get_name());
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto scales_size = shape_size(args[2].get_shape());
auto ip_desc =
mkldnn_emitter->get_inner_product_forward_desc<ngraph::op::QuantizedDot>(
node);
auto ip_attr =
mkldnn_emitter->get_inner_product_forward_attr<ngraph::op::QuantizedDot>(
node);
size_t ip_index = mkldnn_emitter->inner_product_forward_init(false);
auto& deps = mkldnn_emitter->get_primitive_deps(ip_index);
auto functor = [&, scales_size, ip_desc, ip_attr, deps, ip_index](
CPURuntimeContext* ctx, CPUExecutionContext* ectx) mutable {
if (ctx->first_iteration)
{
vector<float> dyn_scales;
dyn_scales.assign(static_cast<float*>(arg2_tensor),
static_cast<float*>(arg2_tensor) + scales_size);
ip_attr.set_output_scales(0, dyn_scales);
mkldnn_emitter->build_inner_product_forward<false>(
ip_desc, ip_attr, executor::global_cpu_engine, ip_index);
}
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], arg1_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[2], out0_tensor);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, ip_index);
};
functors.emplace_back(functor);
}
else
{
throw ngraph_error("unsupported parameters for QuantizedDot via DEX");
}
}
REGISTER_OP_BUILDER(QuantizedDotBias);
REGISTER_OP_BUILDER(QuantizedDot);
}
}
}
@@ -55,6 +55,8 @@
#include "ngraph/op/experimental/quantized_concat.hpp"
#include "ngraph/op/experimental/quantized_conv_bias.hpp"
#include "ngraph/op/experimental/quantized_conv_relu.hpp"
#include "ngraph/op/experimental/quantized_dot.hpp"
#include "ngraph/op/experimental/quantized_dot_bias.hpp"
#include "ngraph/op/experimental/quantized_max_pool.hpp"
#include "ngraph/op/floor.hpp"
#include "ngraph/op/get_output_element.hpp"
@@ -2498,6 +2500,60 @@ namespace ngraph
}
}
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::QuantizedDotBias)
{
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto qip_index =
mkldnn_emitter->build_inner_product<ngraph::op::QuantizedDotBias>(
node, args, out);
auto& deps = mkldnn_emitter->get_primitive_deps(qip_index);
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[0])
<< ", " << args[0].get_name() << ");\n";
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[1])
<< ", " << args[1].get_name() << ");\n";
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[2])
<< ", " << args[2].get_name() << ");\n";
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[3])
<< ", " << out[0].get_name() << ");\n";
writer << "cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, "
<< to_string(qip_index) << ");\n";
}
else
{
throw ngraph_error("QuantizedDotBias is only supported with MKLDNN kernel.");
}
}
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::QuantizedDot)
{
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto qip_index = mkldnn_emitter->build_inner_product<ngraph::op::QuantizedDot>(
node, args, out);
auto& deps = mkldnn_emitter->get_primitive_deps(qip_index);
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[0])
<< ", " << args[0].get_name() << ");\n";
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[1])
<< ", " << args[1].get_name() << ");\n";
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[2])
<< ", " << out[0].get_name() << ");\n";
writer << "cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, "
<< to_string(qip_index) << ");\n";
}
else
{
throw ngraph_error("unsupported parameters for QuantizedDot");
}
}
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::ConvolutionBias)
{
@@ -72,6 +72,8 @@
#include "ngraph/op/experimental/quantized_conv.hpp"
#include "ngraph/op/experimental/quantized_conv_bias.hpp"
#include "ngraph/op/experimental/quantized_conv_relu.hpp"
#include "ngraph/op/experimental/quantized_dot.hpp"
#include "ngraph/op/experimental/quantized_dot_bias.hpp"
#include "ngraph/op/experimental/quantized_max_pool.hpp"
#include "ngraph/op/floor.hpp"
#include "ngraph/op/get_output_element.hpp"
@@ -353,6 +355,8 @@ static const runtime::cpu::OpMap dispatcher{
&runtime::cpu::CPU_Emitter::emit<op::QuantizedConvolutionBiasAdd>},
{TI(ngraph::op::QuantizedConvolutionBiasSignedAdd),
&runtime::cpu::CPU_Emitter::emit<op::QuantizedConvolutionBiasSignedAdd>},
{TI(ngraph::op::QuantizedDotBias), &runtime::cpu::CPU_Emitter::emit<op::QuantizedDotBias>},
{TI(ngraph::op::QuantizedDot), &runtime::cpu::CPU_Emitter::emit<op::QuantizedDot>},
{TI(ngraph::op::ConvolutionRelu), &runtime::cpu::CPU_Emitter::emit<op::ConvolutionRelu>},
{TI(ngraph::op::QuantizedConvolution),
&runtime::cpu::CPU_Emitter::emit<op::QuantizedConvolution>},
@@ -1988,6 +1988,24 @@ size_t MKLDNNEmitter::convolution_forward_init(bool with_bias)
return m_mkldnn_primitives.size() - 1;
}
size_t MKLDNNEmitter::inner_product_forward_init(bool with_bias)
{
size_t size = m_mkldnn_primitives.size();
if (with_bias)
{
// Inputs, Weights, Bias, Results, inner_product
m_mkldnn_primitives.resize(size + 5, nullptr);
m_primitive_deps[m_mkldnn_primitives.size() - 1] = {size, size + 1, size + 2, size + 3};
}
else
{
// Inputs, Weights, Results, inner_product
m_mkldnn_primitives.resize(size + 4, nullptr);
m_primitive_deps[m_mkldnn_primitives.size() - 1] = {size, size + 1, size + 2};
}
return m_mkldnn_primitives.size() - 1;
}
size_t MKLDNNEmitter::reserve_primitive_space(size_t count, bool new_workspace)
{
size_t size = m_mkldnn_primitives.size();
@@ -2002,3 +2020,76 @@ size_t MKLDNNEmitter::reserve_primitive_space(size_t count, bool new_workspace)
}
return m_mkldnn_primitives.size() - 1;
}
size_t MKLDNNEmitter::build_quantized_inner_product_forward(
const mkldnn::memory::desc& input_data_desc,
const mkldnn::memory::desc& weights_desc,
const mkldnn::memory::desc& bias_desc,
const mkldnn::memory::desc& result_desc,
const float scale,
const mkldnn::post_ops& pops)
{
size_t input_data_index = build_memory_primitive(input_data_desc);
size_t weights_index = build_memory_primitive(weights_desc);
size_t bias_index = build_memory_primitive(bias_desc);
size_t result_index = build_memory_primitive(result_desc);
std::vector<float> output_scale;
output_scale.push_back(scale);
// mkldnn inner_product attr
mkldnn::primitive_attr ip_attr;
ip_attr.set_post_ops(pops);
/* Specify the rounding mode */
ip_attr.set_int_output_round_mode(mkldnn::round_mode::round_nearest);
/* Specify the scales array and corresponding mask */
ip_attr.set_output_scales(0, output_scale);
// mkldnn inner_product
size_t ip_index =
insert_primitive(new mkldnn::inner_product_forward({{
mkldnn::prop_kind::forward_scoring,
input_data_desc,
weights_desc,
bias_desc,
result_desc,
},
ip_attr,
executor::global_cpu_engine},
*m_mkldnn_primitives[input_data_index],
*m_mkldnn_primitives[weights_index],
*m_mkldnn_primitives[bias_index],
*m_mkldnn_primitives[result_index]));
m_primitive_deps[ip_index] = {input_data_index, weights_index, bias_index, result_index};
return ip_index;
}
size_t MKLDNNEmitter::build_quantized_inner_product_forward(
const mkldnn::memory::desc& input_data_desc,
const mkldnn::memory::desc& weights_desc,
const mkldnn::memory::desc& result_desc,
const float scale,
const mkldnn::post_ops& pops)
{
size_t input_data_index = build_memory_primitive(input_data_desc);
size_t weights_index = build_memory_primitive(weights_desc);
size_t result_index = build_memory_primitive(result_desc);
std::vector<float> output_scale;
output_scale.push_back(scale);
// mkldnn inner_product attr
mkldnn::primitive_attr ip_attr;
ip_attr.set_post_ops(pops);
/* Specify the rounding mode */
ip_attr.set_int_output_round_mode(mkldnn::round_mode::round_nearest);
/* Specify the scales array and corresponding mask */
ip_attr.set_output_scales(0, output_scale);
// mkldnn inner_product
size_t ip_index = insert_primitive(new mkldnn::inner_product_forward(
{{
mkldnn::prop_kind::forward_scoring, input_data_desc, weights_desc, result_desc,
},
ip_attr,
executor::global_cpu_engine},
*m_mkldnn_primitives[input_data_index],
*m_mkldnn_primitives[weights_index],
*m_mkldnn_primitives[result_index]));
m_primitive_deps[ip_index] = {input_data_index, weights_index, result_index};
return ip_index;
}
@@ -37,6 +37,8 @@
#include "ngraph/op/experimental/quantized_conv.hpp"
#include "ngraph/op/experimental/quantized_conv_bias.hpp"
#include "ngraph/op/experimental/quantized_conv_relu.hpp"
#include "ngraph/op/experimental/quantized_dot.hpp"
#include "ngraph/op/experimental/quantized_dot_bias.hpp"
#include "ngraph/op/experimental/quantized_max_pool.hpp"
#include "ngraph/op/lrn.hpp"
#include "ngraph/op/max_pool.hpp"
@@ -154,6 +156,21 @@ namespace ngraph
const float scale,
const mkldnn::post_ops& pops = mkldnn::post_ops());
size_t build_quantized_inner_product_forward(
const mkldnn::memory::desc& input_data_desc,
const mkldnn::memory::desc& weights_desc,
const mkldnn::memory::desc& result_desc,
const float scale,
const mkldnn::post_ops& pops = mkldnn::post_ops());
size_t build_quantized_inner_product_forward(
const mkldnn::memory::desc& input_data_desc,
const mkldnn::memory::desc& weights_desc,
const mkldnn::memory::desc& bias_desc,
const mkldnn::memory::desc& result_desc,
const float scale,
const mkldnn::post_ops& pops = mkldnn::post_ops());
// TODO (nbpatel) Templatize the return type when we have double scales
template <typename OP>
std::vector<float> extract_scale_value(const ngraph::Node* node, int index)
@@ -288,6 +305,56 @@ namespace ngraph
}
}
template <typename OP>
size_t build_inner_product(const ngraph::Node* node,
const std::vector<TensorViewWrapper>& args,
const std::vector<TensorViewWrapper>& out)
{
auto data_desc = mkldnn_utils::get_input_mkldnn_md(node, 0);
auto weights_desc = mkldnn_utils::get_input_mkldnn_md(node, 1);
// MKLDNN relies on named formats for kernel selection
if (weights_desc.data.format == mkldnn_nchw)
{
weights_desc.data.format = mkldnn_oihw;
}
if (weights_desc.data.format == mkldnn_ncdhw)
{
weights_desc.data.format = mkldnn_oidhw;
}
auto result_desc = mkldnn_utils::get_output_mkldnn_md(node, 0);
mkldnn::post_ops ops;
if (has_relu<OP>(node))
{
const float ops_scale = 1.f;
const float ops_alpha = -0.f; // relu negative slope
const float ops_beta = 0.f;
ops.append_eltwise(
ops_scale, mkldnn::algorithm::eltwise_relu, ops_alpha, ops_beta);
}
if (std::is_same<OP, ngraph::op::QuantizedDot>())
{
auto scale_val = extract_scale_value<OP>(node, 2);
return build_quantized_inner_product_forward(
data_desc, weights_desc, result_desc, scale_val[0], ops);
}
else if (std::is_same<OP, ngraph::op::QuantizedDotBias>())
{
auto scale_val = extract_scale_value<OP>(node, 3);
auto bias_desc = mkldnn_utils::get_input_mkldnn_md(node, 2);
return build_quantized_inner_product_forward(
data_desc, weights_desc, bias_desc, result_desc, scale_val[0], ops);
}
else
{
throw ngraph_error("unsupported inner_product");
}
}
mkldnn::memory::format query_convolution_forward_weight_format(
const mkldnn::memory::desc& input_data_desc,
const mkldnn::memory::desc& weights_desc_any,
@@ -961,11 +1028,13 @@ namespace ngraph
{
size_t index = 0;
if (std::is_same<OP, ngraph::op::QuantizedConvolution>() ||
std::is_same<OP, ngraph::op::QuantizedDot>() ||
std::is_same<OP, ngraph::op::QuantizedConvolutionRelu>())
{
index = 2;
}
else if (std::is_same<OP, ngraph::op::QuantizedConvolutionBias>())
else if (std::is_same<OP, ngraph::op::QuantizedConvolutionBias>() ||
std::is_same<OP, ngraph::op::QuantizedDotBias>())
{
index = 3;
}
@@ -1024,6 +1093,7 @@ namespace ngraph
std::is_same<OP, ngraph::op::QuantizedConvolutionBias>() ||
std::is_same<OP, ngraph::op::QuantizedConvolutionBiasAdd>() ||
std::is_same<OP, ngraph::op::QuantizedConvolutionBiasSignedAdd>() ||
std::is_same<OP, ngraph::op::QuantizedDotBias>() ||
std::is_same<OP, ngraph::op::GroupConvolutionBias>())
{
return true;
@@ -1051,6 +1121,20 @@ namespace ngraph
}
}
template <typename OP>
bool is_quantized_inner_product()
{
if (std::is_same<OP, ngraph::op::QuantizedDot>() ||
std::is_same<OP, ngraph::op::QuantizedDotBias>())
{
return true;
}
else
{
return false;
}
}
template <typename OP>
mkldnn::rnn_forward::desc
get_rnn_forward_desc(const ngraph::Node* node,
@@ -1250,6 +1334,7 @@ namespace ngraph
}
size_t convolution_forward_init(bool with_bias = false);
size_t inner_product_forward_init(bool with_bias = false);
template <bool with_bias>
void build_convolution_forward(const mkldnn::convolution_forward::desc& desc,
@@ -1298,6 +1383,106 @@ namespace ngraph
m_mkldnn_primitives[conv_idx] = prim;
}
template <typename OP>
mkldnn::inner_product_forward::desc
get_inner_product_forward_desc(const ngraph::Node* node)
{
auto data_desc = mkldnn_utils::get_input_mkldnn_md(node, 0);
auto weights_desc = mkldnn_utils::get_input_mkldnn_md(node, 1);
// MKLDNN relies on named formats for kernel selection
if (weights_desc.data.format == mkldnn_nchw)
weights_desc.data.format = mkldnn_oihw;
if (weights_desc.data.format == mkldnn_ncdhw)
weights_desc.data.format = mkldnn_oidhw;
auto result_desc = mkldnn_utils::get_output_mkldnn_md(node, 0);
if (has_bias<OP>())
{
auto bias_desc = mkldnn_utils::get_input_mkldnn_md(node, 2);
return mkldnn::inner_product_forward::desc(mkldnn::prop_kind::forward,
data_desc,
weights_desc,
bias_desc,
result_desc);
}
else
{
return mkldnn::inner_product_forward::desc(
mkldnn::prop_kind::forward, data_desc, weights_desc, result_desc);
}
}
template <typename OP>
mkldnn::primitive_attr get_inner_product_forward_attr(const ngraph::Node* node)
{
mkldnn::post_ops ops;
if (has_relu<OP>(node))
{
const float ops_scale = 1.f;
const float ops_alpha = -0.f; // relu negative slope
const float ops_beta = 0.f;
ops.append_eltwise(
ops_scale, mkldnn::algorithm::eltwise_relu, ops_alpha, ops_beta);
}
mkldnn::primitive_attr ip_attr;
ip_attr.set_post_ops(ops);
if (is_quantized_inner_product<OP>())
{
ip_attr.set_int_output_round_mode(mkldnn::round_mode::round_nearest);
ip_attr.set_output_scales(0, get_output_scale<OP, float>(node));
}
return ip_attr;
}
template <bool with_bias>
void build_inner_product_forward(const mkldnn::inner_product_forward::desc& desc,
const mkldnn::primitive_attr& attr,
const mkldnn::engine& engine,
size_t ip_idx)
{
size_t input_idx, weights_idx, results_idx, bias_idx;
input_idx = m_primitive_deps[ip_idx][0];
weights_idx = m_primitive_deps[ip_idx][1];
m_mkldnn_primitives[input_idx] =
new mkldnn::memory({{desc.data.src_desc}, engine}, nullptr);
m_mkldnn_primitives[weights_idx] =
new mkldnn::memory({{desc.data.weights_desc}, engine}, nullptr);
if (with_bias)
{
bias_idx = m_primitive_deps[ip_idx][2];
results_idx = m_primitive_deps[ip_idx][3];
m_mkldnn_primitives[bias_idx] =
new mkldnn::memory({{desc.data.bias_desc}, engine}, nullptr);
}
else
{
results_idx = m_primitive_deps[ip_idx][2];
}
m_mkldnn_primitives[results_idx] =
new mkldnn::memory({{desc.data.dst_desc}, engine}, nullptr);
mkldnn::primitive* prim;
if (with_bias)
{
prim = new mkldnn::inner_product_forward({desc, attr, engine},
*m_mkldnn_primitives[input_idx],
*m_mkldnn_primitives[weights_idx],
*m_mkldnn_primitives[bias_idx],
*m_mkldnn_primitives[results_idx]);
}
else
{
prim = new mkldnn::inner_product_forward({desc, attr, engine},
*m_mkldnn_primitives[input_idx],
*m_mkldnn_primitives[weights_idx],
*m_mkldnn_primitives[results_idx]);
}
m_mkldnn_primitives[ip_idx] = prim;
}
template <typename OP>
mkldnn::convolution_backward_data::desc
get_convolution_backward_data_desc(const ngraph::Node* node)
@@ -38,6 +38,8 @@
#include "ngraph/op/experimental/quantized_conv.hpp"
#include "ngraph/op/experimental/quantized_conv_bias.hpp"
#include "ngraph/op/experimental/quantized_conv_relu.hpp"
#include "ngraph/op/experimental/quantized_dot.hpp"
#include "ngraph/op/experimental/quantized_dot_bias.hpp"
#include "ngraph/op/experimental/quantized_max_pool.hpp"
#include "ngraph/op/get_output_element.hpp"
#include "ngraph/op/lrn.hpp"
@@ -741,6 +743,18 @@ namespace ngraph
quantized_conv_bias->set_op_annotations(op_annotations);
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::QuantizedDotBias)
{
runtime::cpu::mkldnn_utils::assign_mkldnn_kernel(node);
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::QuantizedDot)
{
runtime::cpu::mkldnn_utils::assign_mkldnn_kernel(node);
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::Dequantize)
{
@@ -931,6 +945,10 @@ static const runtime::cpu::pass::AssignOpMap s_dispatcher{
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Dequantize>},
{TI(ngraph::op::QuantizedConcat),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::QuantizedConcat>},
{TI(ngraph::op::QuantizedDot),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::QuantizedDot>},
{TI(ngraph::op::QuantizedDotBias),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::QuantizedDotBias>},
{TI(ngraph::op::GetOutputElement),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::GetOutputElement>},
};
@@ -39,6 +39,8 @@
#include "ngraph/op/experimental/quantized_conv.hpp"
#include "ngraph/op/experimental/quantized_conv_bias.hpp"
#include "ngraph/op/experimental/quantized_conv_relu.hpp"
#include "ngraph/op/experimental/quantized_dot.hpp"
#include "ngraph/op/experimental/quantized_dot_bias.hpp"
#include "ngraph/op/experimental/quantized_max_pool.hpp"
#include "ngraph/op/get_output_element.hpp"
#include "ngraph/op/lrn.hpp"
@@ -453,6 +455,82 @@ namespace ngraph
o_mds.push_back(prim_desc.dst_primitive_desc().desc());
}
template <typename T, bool use_bias>
void InnerProductLayout(std::shared_ptr<ngraph::Node> node,
vector<memory::desc>& i_mds,
vector<memory::desc>& o_mds)
{
auto arg0_shape = node->get_input_shape(0);
auto arg1_shape = node->get_input_shape(1);
auto result_shape = node->get_output_shape(0);
memory::data_type et =
mkldnn_utils::get_mkldnn_data_type(node->get_input_element_type(0));
memory::data_type et_weights = runtime::cpu::mkldnn_utils::get_mkldnn_data_type(
node->get_input_element_type(1));
memory::data_type et_result = runtime::cpu::mkldnn_utils::get_mkldnn_data_type(
node->get_output_element_type(0));
engine cpu_engine(engine::cpu, 0);
memory::dims mkldnn_arg0_shape(arg0_shape.begin(), arg0_shape.end());
memory::dims mkldnn_arg1_shape(arg1_shape.begin(), arg1_shape.end());
memory::dims mkldnn_result_shape(result_shape.begin(), result_shape.end());
const memory::desc input_data_desc(mkldnn_arg0_shape, et, memory::format::any);
const memory::desc weights_desc(
mkldnn_arg1_shape, et_weights, memory::format::any);
const memory::desc result_desc(
mkldnn_result_shape, et_result, memory::format::any);
std::unique_ptr<inner_product_forward::desc> fwd_desc{nullptr};
if (use_bias)
{
memory::data_type et_bias =
mkldnn_utils::get_mkldnn_data_type(node->get_input_element_type(2));
auto arg2_shape = node->get_input_shape(2);
memory::dims mkldnn_arg2_shape(arg2_shape.begin(), arg2_shape.end());
const memory::desc bias_desc(
mkldnn_arg2_shape, et_bias, memory::format::any);
try
{
fwd_desc.reset(new inner_product_forward::desc(prop_kind::forward,
input_data_desc,
weights_desc,
bias_desc, // with bias
result_desc));
}
catch (const mkldnn::error& e)
{
throw ngraph_error(
"setting layouts on inner_product failed with MKLDNN error: " +
e.message);
}
}
else
{
try
{
fwd_desc.reset(new inner_product_forward::desc(
prop_kind::forward, input_data_desc, weights_desc, result_desc));
}
catch (const mkldnn::error& e)
{
throw ngraph_error(
"setting layouts on inner_product failed with MKLDNN error: " +
e.message);
}
}
inner_product_forward::primitive_desc prim_desc(*fwd_desc, cpu_engine);
i_mds.push_back(prim_desc.src_primitive_desc().desc());
i_mds.push_back(prim_desc.weights_primitive_desc().desc());
if (use_bias)
{
i_mds.push_back(prim_desc.bias_primitive_desc().desc());
}
o_mds.push_back(prim_desc.dst_primitive_desc().desc());
}
template <>
void CPULayout::LAYOUT_DECL(ngraph::op::QuantizedConvolution)
{
@@ -628,6 +706,52 @@ namespace ngraph
}
}
template <>
void CPULayout::LAYOUT_DECL(ngraph::op::QuantizedDotBias)
{
if (mkldnn_utils::use_mkldnn_kernel(node.get()))
{
vector<memory::desc> i_mds;
vector<memory::desc> o_mds;
InnerProductLayout<ngraph::op::QuantizedDotBias, true>(node, i_mds, o_mds);
auto scale_input_md = mkldnn_utils::create_default_mkldnn_md(
node.get(), 3, false, memory::format::x);
i_mds.push_back(scale_input_md);
node = insert_input_conversions(external_function, node, i_mds);
set_output_layouts(node, o_mds);
}
else
{
set_native_layouts(external_function, node);
}
}
template <>
void CPULayout::LAYOUT_DECL(ngraph::op::QuantizedDot)
{
if (mkldnn_utils::use_mkldnn_kernel(node.get()))
{
vector<memory::desc> i_mds;
vector<memory::desc> o_mds;
InnerProductLayout<ngraph::op::QuantizedDot, false>(node, i_mds, o_mds);
auto scale_input_md = mkldnn_utils::create_default_mkldnn_md(
node.get(), 2, false, memory::format::x);
i_mds.push_back(scale_input_md);
node = insert_input_conversions(external_function, node, i_mds);
set_output_layouts(node, o_mds);
}
else
{
set_native_layouts(external_function, node);
}
}
template <>
void CPULayout::LAYOUT_DECL(ngraph::op::ConvolutionRelu)
{
@@ -2166,6 +2290,10 @@ static const runtime::cpu::pass::LayoutOpMap s_dispatcher{
&runtime::cpu::pass::CPULayout::layout<ngraph::op::GroupConvolutionBias>},
{TI(ngraph::op::QuantizedConcat),
&runtime::cpu::pass::CPULayout::layout<ngraph::op::QuantizedConcat>},
{TI(ngraph::op::QuantizedDotBias),
&runtime::cpu::pass::CPULayout::layout<ngraph::op::QuantizedDotBias>},
{TI(ngraph::op::QuantizedDot),
&runtime::cpu::pass::CPULayout::layout<ngraph::op::QuantizedDot>},
};
bool runtime::cpu::pass::CPULayout::run_on_call_graph(const std::list<std::shared_ptr<Node>>& nodes)
@@ -1007,6 +1007,8 @@ private:
case OP_TYPEID::QuantizedConvolutionRelu:
case OP_TYPEID::QuantizedConvolution:
case OP_TYPEID::QuantizedMaxPool:
case OP_TYPEID::QuantizedDotBias:
case OP_TYPEID::QuantizedDot:
{
throw unsupported_op("Unsupported op '" + node.description() + "'.");
}
@@ -62,6 +62,8 @@
#include "ngraph/op/experimental/quantized_conv.hpp"
#include "ngraph/op/experimental/quantized_conv_bias.hpp"
#include "ngraph/op/experimental/quantized_conv_relu.hpp"
#include "ngraph/op/experimental/quantized_dot.hpp"
#include "ngraph/op/experimental/quantized_dot_bias.hpp"
#include "ngraph/op/experimental/quantized_max_pool.hpp"
#include "ngraph/op/experimental/shape_of.hpp"
#include "ngraph/op/floor.hpp"
@@ -921,6 +923,16 @@ std::string runtime::gpu::GPU_Emitter::emit_QuantizedConvolutionRelu(EMIT_ARGS)
throw unsupported_op("Unsupported op '" + node->description() + "'");
}
std::string runtime::gpu::GPU_Emitter::emit_QuantizedDot(EMIT_ARGS)
{
throw unsupported_op("Unsupported op '" + node->description() + "'");
}
std::string runtime::gpu::GPU_Emitter::emit_QuantizedDotBias(EMIT_ARGS)
{
throw unsupported_op("Unsupported op '" + node->description() + "'");
}
std::string runtime::gpu::GPU_Emitter::emit_QuantizedMaxPool(EMIT_ARGS)
{
throw unsupported_op("Unsupported op '" + node->description() + "'");
@@ -2003,6 +2003,8 @@ shared_ptr<runtime::Executable>
case OP_TYPEID::QuantizedConvolutionBiasSignedAdd:
case OP_TYPEID::QuantizedConvolutionRelu:
case OP_TYPEID::QuantizedConvolution:
case OP_TYPEID::QuantizedDot:
case OP_TYPEID::QuantizedDotBias:
case OP_TYPEID::QuantizedMaxPool:
case OP_TYPEID::ReplaceSlice:
case OP_TYPEID::GenerateMask:
@@ -1019,6 +1019,8 @@ private:
case OP_TYPEID::QuantizedConvolutionRelu:
case OP_TYPEID::QuantizedConvolution:
case OP_TYPEID::QuantizedMaxPool:
case OP_TYPEID::QuantizedDotBias:
case OP_TYPEID::QuantizedDot:
{
throw unsupported_op("Unsupported op '" + node.description() +
"' in Interpreter back end.");
@@ -52,6 +52,8 @@
#include "ngraph/op/experimental/quantized_conv.hpp"
#include "ngraph/op/experimental/quantized_conv_bias.hpp"
#include "ngraph/op/experimental/quantized_conv_relu.hpp"
#include "ngraph/op/experimental/quantized_dot.hpp"
#include "ngraph/op/experimental/quantized_dot_bias.hpp"
#include "ngraph/op/experimental/quantized_max_pool.hpp"
#include "ngraph/op/experimental/shape_of.hpp"
#include "ngraph/op/floor.hpp"
@@ -1030,6 +1032,10 @@ static shared_ptr<ngraph::Function>
data_dilation_strides.get<std::vector<size_t>>());
break;
}
case OP_TYPEID::QuantizedDotBias: { break;
}
case OP_TYPEID::QuantizedDot: { break;
}
case OP_TYPEID::QuantizedMaxPool:
{
auto window_shape = node_js.at("window_shape").get<vector<size_t>>();
@@ -1645,6 +1651,10 @@ static json write(const Node& n, bool binary_constant_data)
node["data_dilation_strides"] = tmp->get_data_dilation_strides();
break;
}
case OP_TYPEID::QuantizedDotBias: { break;
}
case OP_TYPEID::QuantizedDot: { break;
}
case OP_TYPEID::QuantizedMaxPool:
{
auto tmp = dynamic_cast<const op::QuantizedMaxPool*>(&n);
@@ -1181,3 +1181,207 @@ TEST(builder, scaled_quantize_concat_unsigned_varying)
EXPECT_EQ((vector<uint8_t>{5, 1, 0, 0, 2, 4, 1, 3, 5, 1, 5, 100, 6, 8, 10, 7, 9, 50}),
read_vector<uint8_t>(result));
}
// QuantizedDot
TEST(builder, dynamic_scaled_QD)
{
Shape shape_a{4, 3}; // input shape
vector<uint8_t> a_data = {209, 122, 39, 11, 33, 243, 250, 216, 159, 18, 181, 187};
Shape shape_b{3, 3}; // filter shape
vector<int8_t> b_data = {11, 15, 80, 50, -6, -3, -6, 78, 113};
Shape shape_r{4, 3}; // output shape
auto make_function = [shape_a, shape_b](bool requantize, bool with_relu) {
auto A = make_shared<op::Parameter>(element::u8, shape_a);
auto B = make_shared<op::Parameter>(element::i8, shape_b);
auto C = make_shared<op::Parameter>(element::f32, Shape{1});
auto D = make_shared<op::Parameter>(element::f32, Shape{1});
auto E = make_shared<op::Parameter>(element::f32, Shape{1});
auto F = make_shared<op::Parameter>(element::f32, Shape{1});
auto G = make_shared<op::Parameter>(element::f32, Shape{1});
auto H = make_shared<op::Parameter>(element::f32, Shape{1});
auto CV =
ngraph::builder::ScaledQuantizedDot(A, B, C, D, E, F, G, H, requantize, with_relu);
return make_shared<Function>(NodeVector{CV}, ParameterVector{A, B, C, D, E, F, G, H});
};
auto backend = runtime::Backend::create("CPU");
// Create some tensors for input/output
auto a = backend->create_tensor(element::u8, shape_a);
copy_data(a, a_data);
auto b = backend->create_tensor(element::i8, shape_b);
copy_data(b, b_data);
auto d = backend->create_tensor(element::f32, Shape{1});
copy_data(d, vector<float>{-127.0f});
auto e = backend->create_tensor(element::f32, Shape{1});
copy_data(e, vector<float>{127.0f});
auto e_a = backend->create_tensor(element::f32, Shape{1});
copy_data(e_a, vector<float>{0.1f});
auto g = backend->create_tensor(element::f32, Shape{1});
copy_data(g, vector<float>{0.9f});
auto h = backend->create_tensor(element::f32, Shape{1});
copy_data(h, vector<float>{37.618633f});
auto i = backend->create_tensor(element::f32, Shape{1});
copy_data(i, vector<float>{2.236754f});
// QuantizedDot (no requantize, no relu)
auto f_nrequantize = make_function(false, false);
auto f_nrequantize_r = backend->create_tensor(element::f32, shape_r);
auto f_nrequantize_handle = backend->compile(f_nrequantize);
f_nrequantize_handle->call_with_validate({f_nrequantize_r}, {a, b, d, e, e_a, g, h, i});
EXPECT_EQ((vector<float>{25.584705352783203,
33.88588333129883,
44.71411895751953,
70.78588104248047,
-1.3305882215499878,
105.76588439941406,
66.03529357910156,
37.86000061035156,
117.58235168457031,
63.0811767578125,
-2.6364705562591553,
124.02706146240234}),
read_vector<float>(f_nrequantize_r));
// QuantizedDot with relu
auto f_nrequantize_relu = make_function(false, true);
auto f_nrequantize_relu_r = backend->create_tensor(element::f32, shape_r);
auto f_nrequantize_relu_handle = backend->compile(f_nrequantize_relu);
f_nrequantize_relu_handle->call_with_validate({f_nrequantize_relu_r},
{a, b, d, e, e_a, g, h, i});
EXPECT_EQ((vector<float>{25.584705352783203,
33.88588333129883,
44.71411895751953,
70.78588104248047,
-0.0,
105.76588439941406,
66.03529357910156,
37.86000061035156,
117.58235168457031,
63.0811767578125,
-0.0,
124.02706146240234}),
read_vector<float>(f_nrequantize_relu_r));
// QuantizedDot with requantize and no relu
auto f_requantize = make_function(true, false);
auto f_requantize_r = backend->create_tensor(element::i8, shape_r);
auto handle = backend->compile(f_requantize);
handle->call_with_validate({f_requantize_r}, {a, b, d, e, e_a, g, h, i});
EXPECT_EQ((vector<int8_t>{86, 114, 127, 127, -4, 127, 127, 127, 127, 127, -9, 127}),
read_vector<int8_t>(f_requantize_r));
// QuantizedDot with requantize and relu
auto f_requantize_relu = make_function(true, true);
auto f_requantize_relu_r = backend->create_tensor(element::u8, shape_r);
auto f_requantize_relu_handle = backend->compile(f_requantize_relu);
f_requantize_relu_handle->call_with_validate({f_requantize_relu_r}, {a, b, d, e, e_a, g, h, i});
EXPECT_EQ((vector<uint8_t>{173, 230, 255, 255, 0, 255, 255, 255, 255, 255, 0, 255}),
read_vector<uint8_t>(f_requantize_relu_r));
}
// QuantizedDotBias
TEST(builder, dynamic_scaled_QD_with_bias)
{
Shape shape_a{4, 3}; // input shape
vector<uint8_t> a_data = {209, 122, 39, 11, 33, 243, 250, 216, 159, 18, 181, 187};
Shape shape_b{3, 3}; // filter shape
vector<int8_t> b_data = {11, 15, 80, 50, -6, -3, -6, 78, 113};
Shape shape_c{3}; // bias shape
vector<int32_t> c_data = {192, 49, 23};
Shape shape_r{4, 3}; // output shape
auto make_function = [shape_a, shape_b, shape_c](bool requantize, bool with_relu) {
auto A = make_shared<op::Parameter>(element::u8, shape_a);
auto B = make_shared<op::Parameter>(element::i8, shape_b);
auto Bias = make_shared<op::Parameter>(element::i32, shape_c);
auto C = make_shared<op::Parameter>(element::f32, Shape{1});
auto D = make_shared<op::Parameter>(element::f32, Shape{1});
auto E = make_shared<op::Parameter>(element::f32, Shape{1});
auto F = make_shared<op::Parameter>(element::f32, Shape{1});
auto G = make_shared<op::Parameter>(element::f32, Shape{1});
auto H = make_shared<op::Parameter>(element::f32, Shape{1});
auto CV = ngraph::builder::ScaledQuantizedDotBias(
A, B, Bias, C, D, E, F, G, H, requantize, with_relu);
return make_shared<Function>(NodeVector{CV}, ParameterVector{A, B, Bias, C, D, E, F, G, H});
};
auto backend = runtime::Backend::create("CPU");
// Create some tensors for input/output
auto a = backend->create_tensor(element::u8, shape_a);
copy_data(a, a_data);
auto b = backend->create_tensor(element::i8, shape_b);
copy_data(b, b_data);
auto c = backend->create_tensor(element::i32, Shape{3});
copy_data(c, c_data);
auto d = backend->create_tensor(element::f32, Shape{1});
copy_data(d, vector<float>{-127.0f});
auto e = backend->create_tensor(element::f32, Shape{1});
copy_data(e, vector<float>{127.0f});
auto e_a = backend->create_tensor(element::f32, Shape{1});
copy_data(e_a, vector<float>{0.1f});
auto g = backend->create_tensor(element::f32, Shape{1});
copy_data(g, vector<float>{0.9f});
auto h = backend->create_tensor(element::f32, Shape{1});
copy_data(h, vector<float>{37.618633f});
auto i = backend->create_tensor(element::f32, Shape{1});
copy_data(i, vector<float>{2.236754f});
// QuantizedDotBias (no requantize, no relu)
auto f_nrequantize = make_function(false, false);
auto f_nrequantize_r = backend->create_tensor(element::f32, shape_r);
auto f_nrequantize_handle = backend->compile(f_nrequantize);
f_nrequantize_handle->call_with_validate({f_nrequantize_r}, {a, b, c, d, e, e_a, g, h, i});
EXPECT_EQ((vector<float>{26.262351989746094,
34.05882263183594,
44.79529571533203,
71.46353149414062,
-1.1576470136642456,
105.84706115722656,
66.71294403076172,
38.03293991088867,
117.66352844238281,
63.75882339477539,
-2.463529348373413,
124.10823822021484}),
read_vector<float>(f_nrequantize_r));
// QuantizedDotBias with relu
auto f_nrequantize_relu = make_function(false, true);
auto f_nrequantize_relu_r = backend->create_tensor(element::f32, shape_r);
auto f_nrequantize_relu_handle = backend->compile(f_nrequantize_relu);
f_nrequantize_relu_handle->call_with_validate({f_nrequantize_relu_r},
{a, b, c, d, e, e_a, g, h, i});
EXPECT_EQ((vector<float>{26.262351989746094,
34.05882263183594,
44.79529571533203,
71.46353149414062,
-0.0,
105.84706115722656,
66.71294403076172,
38.03293991088867,
117.66352844238281,
63.75882339477539,
-0.0,
124.10823822021484}),
read_vector<float>(f_nrequantize_relu_r));
// QuantizedDotBias with requantize and no relu
auto f_requantize = make_function(true, false);
auto f_requantize_r = backend->create_tensor(element::i8, shape_r);
auto handle = backend->compile(f_requantize);
handle->call_with_validate({f_requantize_r}, {a, b, c, d, e, e_a, g, h, i});
EXPECT_EQ((vector<int8_t>{89, 115, 127, 127, -4, 127, 127, 127, 127, 127, -8, 127}),
read_vector<int8_t>(f_requantize_r));
// QuantizedDotBias with requantize and relu
auto f_requantize_relu = make_function(true, true);
auto f_requantize_relu_r = backend->create_tensor(element::u8, shape_r);
auto f_requantize_relu_handle = backend->compile(f_requantize_relu);
f_requantize_relu_handle->call_with_validate({f_requantize_relu_r},
{a, b, c, d, e, e_a, g, h, i});
EXPECT_EQ((vector<uint8_t>{178, 231, 255, 255, 0, 255, 255, 255, 255, 255, 0, 255}),
read_vector<uint8_t>(f_requantize_relu_r));
}