Commit 780b56bf authored by Nishant Patel, committed by Robert Kimball

Add Quantized conv+bias (#1703)

* Add conv+bias

* Add test case for QuantizedConv2DWithBiasAndRelu and address feedback
parent a432b1a7
......@@ -95,6 +95,7 @@ set(SRC
op/quantized_max_pool.cpp
op/quantized_avg_pool.cpp
op/quantized_conv_relu.cpp
op/quantized_conv_bias.cpp
op/rnn.cpp
op/sigmoid_mul.cpp
op/conv_add.cpp
......
......@@ -19,6 +19,7 @@
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/mkldnn_invoke.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_bias.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_relu.hpp"
using namespace std;
......@@ -39,7 +40,7 @@ namespace ngraph
auto& functors = external_function->get_functors();
auto& arg0_tensor = external_function->get_tensor_data(args[0].get_name());
auto& arg1_tensor = external_function->get_tensor_data(args[1].get_name());
auto& out_tensor = external_function->get_tensor_data(out[0].get_name());
auto& out0_tensor = external_function->get_tensor_data(out[0].get_name());
auto& out1_tensor = external_function->get_tensor_data(out[1].get_name());
auto& out2_tensor = external_function->get_tensor_data(out[2].get_name());
......@@ -56,7 +57,7 @@ namespace ngraph
CPURuntimeContext* ctx) {
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], arg1_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[2], out_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[2], out0_tensor);
*(static_cast<float*>(out1_tensor)) = min_freezed_output;
*(static_cast<float*>(out2_tensor)) = max_freezed_output;
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, conv_index);
......@@ -79,7 +80,7 @@ namespace ngraph
auto& functors = external_function->get_functors();
auto& arg0_tensor = external_function->get_tensor_data(args[0].get_name());
auto& arg1_tensor = external_function->get_tensor_data(args[1].get_name());
auto& out_tensor = external_function->get_tensor_data(out[0].get_name());
auto& out0_tensor = external_function->get_tensor_data(out[0].get_name());
auto& out1_tensor = external_function->get_tensor_data(out[1].get_name());
auto& out2_tensor = external_function->get_tensor_data(out[2].get_name());
......@@ -96,7 +97,7 @@ namespace ngraph
CPURuntimeContext* ctx) {
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], arg1_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[2], out_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[2], out0_tensor);
*(static_cast<float*>(out1_tensor)) = min_freezed_output;
*(static_cast<float*>(out2_tensor)) = max_freezed_output;
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, conv_index);
......@@ -109,8 +110,53 @@ namespace ngraph
"unsupported parameters for QuantizedConvolutionRelu via DEX");
}
}
// DEX builder for QuantizedConvolutionBias.
//
// Builds the MKLDNN convolution primitive for the node and registers a functor
// that, on every invocation, binds the three inputs (args 0..2) and the data
// output (out 0) to the primitive's memory dependencies, stores the frozen
// output range into outputs 1 and 2, and runs the primitive.
// Throws ngraph_error when the node is not assigned to an MKLDNN kernel.
template <>
void Builder::BUILDER_DECL(ngraph::op::QuantizedConvolutionBias)
{
    // No fallback kernel is provided here: bail out early if MKLDNN was not selected.
    if (!runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
    {
        throw ngraph_error(
            "unsupported parameters for QuantizedConvolutionBias via DEX");
    }

    const auto* qconv_bias_op =
        static_cast<const ngraph::op::QuantizedConvolutionBias*>(node);
    auto& functors = external_function->get_functors();

    // Tensor slots for the inputs ...
    auto& data_tensor = external_function->get_tensor_data(args[0].get_name());
    auto& filter_tensor = external_function->get_tensor_data(args[1].get_name());
    auto& bias_tensor = external_function->get_tensor_data(args[2].get_name());
    // ... and for the three outputs (data, min, max).
    auto& result_tensor = external_function->get_tensor_data(out[0].get_name());
    auto& result_min_tensor = external_function->get_tensor_data(out[1].get_name());
    auto& result_max_tensor = external_function->get_tensor_data(out[2].get_name());

    auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
    auto primitive_index =
        mkldnn_emitter->build_convolution<ngraph::op::QuantizedConvolutionBias>(
            node, args, out);
    auto& primitive_deps = mkldnn_emitter->get_primitive_deps(primitive_index);

    // The frozen range is read once at build time and copied into the functor.
    float frozen_min = qconv_bias_op->get_freezed_output_min();
    float frozen_max = qconv_bias_op->get_freezed_output_max();

    auto run_qconv_bias = [&, primitive_index, frozen_min, frozen_max](
        CPURuntimeContext* ctx) {
        cpu::mkldnn_utils::set_memory_ptr(ctx, primitive_deps[0], data_tensor);
        cpu::mkldnn_utils::set_memory_ptr(ctx, primitive_deps[1], filter_tensor);
        cpu::mkldnn_utils::set_memory_ptr(ctx, primitive_deps[2], bias_tensor);
        cpu::mkldnn_utils::set_memory_ptr(ctx, primitive_deps[3], result_tensor);
        *(static_cast<float*>(result_min_tensor)) = frozen_min;
        *(static_cast<float*>(result_max_tensor)) = frozen_max;
        cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, primitive_index);
    };
    functors.emplace_back(run_qconv_bias);
}
REGISTER_OP_BUILDER(QuantizedConvolution);
REGISTER_OP_BUILDER(QuantizedConvolutionRelu);
REGISTER_OP_BUILDER(QuantizedConvolutionBias);
}
}
}
......@@ -109,6 +109,7 @@
#include "ngraph/runtime/cpu/op/matmul_bias.hpp"
#include "ngraph/runtime/cpu/op/max_pool_with_indices.hpp"
#include "ngraph/runtime/cpu/op/quantized_avg_pool.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_bias.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_relu.hpp"
#include "ngraph/runtime/cpu/op/quantized_max_pool.hpp"
#include "ngraph/runtime/cpu/op/rnn.hpp"
......@@ -2990,6 +2991,42 @@ namespace ngraph
}
}
// Code-generation emitter for QuantizedConvolutionBias.
//
// Builds the MKLDNN primitive for the node and writes the statements that bind
// the three inputs and the data output to the primitive's dependencies, store
// the frozen output min/max through the pointers named by outputs 1 and 2, and
// invoke the primitive. Throws ngraph_error when the node is not assigned to an
// MKLDNN kernel.
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::QuantizedConvolutionBias)
{
    if (!runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
    {
        throw ngraph_error(
            "QuantizedConvolutionBias is only supported with MKLDNN kernel.");
    }

    auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
    auto primitive_index =
        mkldnn_emitter->build_convolution<ngraph::op::QuantizedConvolutionBias>(
            node, args, out);
    auto& primitive_deps = mkldnn_emitter->get_primitive_deps(primitive_index);

    // Inputs 0..2 are bound to dependencies 0..2, in the same order.
    for (size_t input = 0; input < 3; ++input)
    {
        writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, "
               << to_string(primitive_deps[input]) << ", " << args[input].get_name()
               << ");\n";
    }
    // The data output is bound to the fourth dependency.
    writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(primitive_deps[3])
           << ", " << out[0].get_name() << ");\n";

    // Outputs 1 and 2 receive the frozen output range stored on the op.
    const auto* qconv_bias_op =
        static_cast<const ngraph::op::QuantizedConvolutionBias*>(node);
    writer << "*(" << out[1].get_name()
           << ") = " << qconv_bias_op->get_freezed_output_min() << ";\n";
    writer << "*(" << out[2].get_name()
           << ") = " << qconv_bias_op->get_freezed_output_max() << ";\n";

    writer << "cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, "
           << to_string(primitive_index) << ");\n";
}
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::ConvolutionBias)
{
......
......@@ -152,6 +152,7 @@
#include "ngraph/runtime/cpu/op/quantize.hpp"
#include "ngraph/runtime/cpu/op/quantized_avg_pool.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_bias.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_relu.hpp"
#include "ngraph/runtime/cpu/op/quantized_max_pool.hpp"
#include "ngraph/runtime/cpu/op/rnn.hpp"
......@@ -308,6 +309,8 @@ static const runtime::cpu::OpMap dispatcher{
&runtime::cpu::CPU_Emitter::emit<op::ConvolutionBackpropData>},
{TI(ngraph::op::GroupConvolution), &runtime::cpu::CPU_Emitter::emit<op::GroupConvolution>},
{TI(ngraph::op::ConvolutionBias), &runtime::cpu::CPU_Emitter::emit<op::ConvolutionBias>},
{TI(ngraph::op::QuantizedConvolutionBias),
&runtime::cpu::CPU_Emitter::emit<op::QuantizedConvolutionBias>},
{TI(ngraph::op::ConvolutionRelu), &runtime::cpu::CPU_Emitter::emit<op::ConvolutionRelu>},
{TI(ngraph::op::QuantizedConvolution),
&runtime::cpu::CPU_Emitter::emit<op::QuantizedConvolution>},
......
......@@ -315,6 +315,63 @@ size_t MKLDNNEmitter::build_quantized_convolution(const mkldnn::memory::desc& in
return conv_index;
}
// Creates a quantized forward convolution primitive that also consumes a bias.
//
// Registers memory primitives for data, weights, bias and result, configures
// the primitive attributes (post-ops, round-to-nearest, a single output scale
// with mask 0), then inserts the convolution primitive and records its four
// memory dependencies in the order {data, weights, bias, result}.
// Returns the index of the inserted primitive; an mkldnn::error raised while
// creating it is rethrown as ngraph_error.
size_t MKLDNNEmitter::build_quantized_convolution(const mkldnn::memory::desc& input_data_desc,
                                                  const mkldnn::memory::desc& weights_desc,
                                                  const mkldnn::memory::desc& bias_desc,
                                                  const mkldnn::memory::desc& result_desc,
                                                  const ngraph::Strides& strides,
                                                  const ngraph::Strides& dilation_strides,
                                                  const ngraph::CoordinateDiff& padding_below,
                                                  const ngraph::CoordinateDiff& padding_above,
                                                  const float scale,
                                                  const mkldnn::post_ops& pops)
{
    const size_t src_index = build_memory_primitive(input_data_desc);
    const size_t wei_index = build_memory_primitive(weights_desc);
    const size_t bia_index = build_memory_primitive(bias_desc);
    const size_t dst_index = build_memory_primitive(result_desc);

    // One scale applied to the whole output (mask 0).
    const std::vector<float> output_scales(1, scale);

    mkldnn::primitive_attr attributes;
    attributes.set_post_ops(pops);
    attributes.set_int_output_round_mode(mkldnn::round_mode::round_nearest);
    attributes.set_output_scales(0, output_scales);

    size_t conv_index = 0;
    try
    {
        const mkldnn::convolution_forward::desc fwd_desc(
            mkldnn::prop_kind::forward,
            mkldnn::algorithm::convolution_direct,
            input_data_desc,
            weights_desc,
            bias_desc,
            result_desc,
            mkldnn::memory::dims(strides.begin(), strides.end()),
            mkldnn::memory::dims(dilation_strides.begin(), dilation_strides.end()),
            mkldnn::memory::dims(padding_below.begin(), padding_below.end()),
            mkldnn::memory::dims(padding_above.begin(), padding_above.end()),
            mkldnn::padding_kind::zero);
        const mkldnn::convolution_forward::primitive_desc fwd_prim_desc(
            fwd_desc, attributes, mkldnn_utils::global_cpu_engine);

        conv_index =
            insert_primitive(new mkldnn::convolution_forward(fwd_prim_desc,
                                                             *m_mkldnn_primitives[src_index],
                                                             *m_mkldnn_primitives[wei_index],
                                                             *m_mkldnn_primitives[bia_index],
                                                             *m_mkldnn_primitives[dst_index]));
        m_primitive_deps[conv_index] = {src_index, wei_index, bia_index, dst_index};
    }
    catch (const mkldnn::error& e)
    {
        throw ngraph_error("Could not create convolution " + e.message);
    }
    return conv_index;
}
size_t MKLDNNEmitter::build_convolution_forward(const mkldnn::memory::desc& input_data_desc,
const mkldnn::memory::desc& weights_desc,
const mkldnn::memory::desc& bias_desc,
......
......@@ -33,6 +33,7 @@
#include "ngraph/runtime/cpu/op/conv_bias.hpp"
#include "ngraph/runtime/cpu/op/conv_relu.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_bias.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_relu.hpp"
#include "ngraph/runtime/cpu/quantization_util.hpp"
#include "ngraph/shape.hpp"
......@@ -118,6 +119,21 @@ namespace ngraph
const float scale,
const mkldnn::post_ops& pops = mkldnn::post_ops());
/**
 * QuantizedConvolution + bias forward.
 *
 * Overload of build_quantized_convolution that takes an additional bias
 * memory descriptor between the weights and result descriptors.
 *
 * \param input_data_desc  memory descriptor of the data input
 * \param weights_desc     memory descriptor of the filters
 * \param bias_desc        memory descriptor of the bias
 * \param result_desc      memory descriptor of the convolution result
 * \param strides          window movement strides
 * \param dilation_strides window dilation strides
 * \param padding_below    padding applied below each spatial axis
 * \param padding_above    padding applied above each spatial axis
 * \param scale            output scale
 * \param pops             post-ops for the primitive (defaults to none)
 * \return index of the created convolution primitive
 */
size_t
build_quantized_convolution(const mkldnn::memory::desc& input_data_desc,
const mkldnn::memory::desc& weights_desc,
const mkldnn::memory::desc& bias_desc,
const mkldnn::memory::desc& result_desc,
const ngraph::Strides& strides,
const ngraph::Strides& dilation_strides,
const ngraph::CoordinateDiff& padding_below,
const ngraph::CoordinateDiff& padding_above,
const float scale,
const mkldnn::post_ops& pops = mkldnn::post_ops());
template <typename OP>
size_t build_convolution(const ngraph::Node* node,
const std::vector<TensorViewWrapper>& args,
......@@ -177,6 +193,12 @@ namespace ngraph
{
return true;
}
if (dynamic_cast<const ngraph::op::QuantizedConvolutionBias*>(node))
{
return (dynamic_cast<const ngraph::op::QuantizedConvolutionBias*>(node))
->with_relu();
}
return false;
};
......@@ -234,6 +256,25 @@ namespace ngraph
scale,
ops);
}
else if (std::is_same<OP, ngraph::op::QuantizedConvolutionBias>())
{
// conv+bias = cvt_to_int8(scale*(dst + bias))
const float scale =
quantization_util::get_scale<ngraph::op::QuantizedConvolutionBias>(
node);
auto bias_desc = mkldnn_utils::get_input_mkldnn_md(node, 2);
return build_quantized_convolution(
data_desc,
weights_desc,
bias_desc,
result_desc,
convolution->get_window_movement_strides(),
window_dilation_strides_adjusted,
convolution->get_padding_below(),
convolution->get_padding_above(),
scale,
ops);
}
else
{
return build_convolution_forward(data_desc,
......
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <numeric>
#include "conv_bias.hpp"
#include "quantized_conv_bias.hpp"
#include "ngraph/op/constant.hpp"
#include "ngraph/op/get_output_element.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv.hpp"
#include "ngraph/util.hpp"
using namespace std;
using namespace ngraph;
// Builds a QuantizedConvolutionBias from an existing QuantizedConvolution and a
// bias node.
//
// The new op takes the convolution's data and filters (arguments 0 and 1), the
// bias, and then the convolution's arguments 2..7 as its own inputs 3..8.
// Strides, padding, dilation and the cached min/max ranges are copied from
// qconv. Output 0 is u8 when with_relu is set and i8 otherwise; outputs 1 and 2
// are f32 tensors of Shape{1}.
op::QuantizedConvolutionBias::QuantizedConvolutionBias(
    const shared_ptr<op::QuantizedConvolution>& qconv,
    const shared_ptr<Node>& bias,
    const bool with_relu)
    : Op("QuantizedConvolutionBias",
         check_single_output_args({qconv->get_argument(0),
                                   qconv->get_argument(1),
                                   bias,
                                   qconv->get_argument(2),
                                   qconv->get_argument(3),
                                   qconv->get_argument(4),
                                   qconv->get_argument(5),
                                   qconv->get_argument(6),
                                   qconv->get_argument(7)}))
    , m_window_movement_strides(qconv->get_window_movement_strides())
    , m_window_dilation_strides(qconv->get_window_dilation_strides())
    , m_padding_below(qconv->get_padding_below())
    , m_padding_above(qconv->get_padding_above())
    , m_data_dilation_strides(qconv->get_data_dilation_strides())
    , m_with_relu(with_relu)
{
    constructor_validate_and_infer_types();

    // Carry over the quantization ranges cached on the source convolution.
    this->m_input_min = qconv->get_input_min();
    this->m_input_max = qconv->get_input_max();
    this->m_filter_min = qconv->get_filter_min();
    this->m_filter_max = qconv->get_filter_max();
    this->m_freezed_output_min = qconv->get_freezed_output_min();
    this->m_freezed_output_max = qconv->get_freezed_output_max();

    util::validate_convbias_shapes(qconv->get_argument(0)->get_shape(),
                                   qconv->get_argument(1)->get_shape(),
                                   bias->get_shape());

    auto output_et = with_relu ? element::u8 : element::i8;
    set_output_size(3);
    // Ask for the shape of output 0 explicitly rather than via get_shape().
    // NOTE(review): QuantizedConvolution appears to expose result/min/max outputs,
    // and Node::get_shape() is presumably only valid on single-output nodes —
    // confirm. get_output_shape(0) is equivalent in the single-output case.
    set_output_type(0, output_et, qconv->get_output_shape(0));
    set_output_type(1, element::f32, Shape{1});
    set_output_type(2, element::f32, Shape{1});
}
// Builds a QuantizedConvolutionBias from explicit inputs and convolution
// attributes.
//
// Inputs are ordered {data_batch, filters, bias, min_input, max_input,
// min_filter, max_filter, min_freezed_output, max_freezed_output}. The six
// range inputs must be op::Constant nodes; their first element is read as a
// float and cached on the op. Output 0 is u8 when with_relu is set and i8
// otherwise, with the shape inferred from the convolution parameters; outputs
// 1 and 2 are f32 tensors of Shape{1}.
//
// Throws ngraph_error if any range input is not an op::Constant.
op::QuantizedConvolutionBias::QuantizedConvolutionBias(
    const shared_ptr<Node>& data_batch,
    const shared_ptr<Node>& filters,
    const shared_ptr<Node>& bias,
    const Strides& window_movement_strides,
    const Strides& window_dilation_strides,
    const CoordinateDiff& padding_below,
    const CoordinateDiff& padding_above,
    const Strides& data_dilation_strides,
    const std::shared_ptr<Node> min_input,
    const std::shared_ptr<Node> max_input,
    const std::shared_ptr<Node> min_filter,
    const std::shared_ptr<Node> max_filter,
    const std::shared_ptr<Node> min_freezed_output,
    const std::shared_ptr<Node> max_freezed_output,
    const bool with_relu)
    : Op("QuantizedConvolutionBias",
         check_single_output_args({data_batch,
                                   filters,
                                   bias,
                                   min_input,
                                   max_input,
                                   min_filter,
                                   max_filter,
                                   min_freezed_output,
                                   max_freezed_output}))
    , m_window_movement_strides(window_movement_strides)
    , m_window_dilation_strides(window_dilation_strides)
    , m_padding_below(padding_below)
    , m_padding_above(padding_above)
    , m_data_dilation_strides(data_dilation_strides)
    , m_with_relu(with_relu)
{
    constructor_validate_and_infer_types();

    auto& data_batch_shape = data_batch->get_shape();
    auto& filters_shape = filters->get_shape();

    // Reads the leading float of a range input. The cast is checked: reading the
    // data pointer through an unchecked downcast of a non-Constant node would be
    // undefined behaviour, so fail loudly instead.
    // NOTE(review): the payload is read as float without checking the element
    // type, exactly as before — confirm callers always pass f32 constants.
    auto read_range_value = [](const std::shared_ptr<Node>& range_node) -> float {
        auto range_constant = std::dynamic_pointer_cast<ngraph::op::Constant>(range_node);
        if (!range_constant)
        {
            throw ngraph_error(
                "QuantizedConvolutionBias requires Constant nodes for its min/max range inputs");
        }
        return *(static_cast<float const*>(range_constant->get_data_ptr()));
    };

    this->m_input_min = read_range_value(min_input);
    this->m_input_max = read_range_value(max_input);
    this->m_filter_min = read_range_value(min_filter);
    this->m_filter_max = read_range_value(max_filter);
    this->m_freezed_output_min = read_range_value(min_freezed_output);
    this->m_freezed_output_max = read_range_value(max_freezed_output);

    util::validate_convbias_shapes(data_batch_shape, filters_shape, bias->get_shape());

    auto output_et = with_relu ? element::u8 : element::i8;
    set_output_size(3);
    set_output_type(0,
                    output_et,
                    util::infer_convolution_output_shape(this,
                                                         data_batch_shape,
                                                         filters_shape,
                                                         window_movement_strides,
                                                         window_dilation_strides,
                                                         padding_below,
                                                         padding_above,
                                                         data_dilation_strides,
                                                         0, /* batch_axis_data, */
                                                         1, /* input_channel_axis_data, */
                                                         1, /* input_channel_axis_filters, */
                                                         0, /* output_channel_axis_filters, */
                                                         0, /* batch_axis_result, */
                                                         1  /* output_channel_axis_result, */
                                                         ));
    set_output_type(1, element::f32, Shape{1});
    set_output_type(2, element::f32, Shape{1});
}
// Clones this op onto a new set of inputs.
//
// new_args must hold exactly nine nodes in input order: data, filters, bias and
// the six min/max range nodes. Convolution attributes and the relu flag are
// taken from this instance. Throws ngraph_error on any other argument count.
shared_ptr<Node> op::QuantizedConvolutionBias::copy_with_new_args(const NodeVector& new_args) const
{
    const size_t expected_arg_count = 9;
    if (new_args.size() != expected_arg_count)
    {
        throw ngraph_error("Incorrect number of new arguments");
    }

    const auto& data_batch = new_args.at(0);
    const auto& filters = new_args.at(1);
    const auto& bias = new_args.at(2);

    return make_shared<QuantizedConvolutionBias>(data_batch,
                                                 filters,
                                                 bias,
                                                 get_window_movement_strides(),
                                                 get_window_dilation_strides(),
                                                 get_padding_below(),
                                                 get_padding_above(),
                                                 get_data_dilation_strides(),
                                                 new_args.at(3), // min_input
                                                 new_args.at(4), // max_input
                                                 new_args.at(5), // min_filter
                                                 new_args.at(6), // max_filter
                                                 new_args.at(7), // min_freezed_output
                                                 new_args.at(8), // max_freezed_output
                                                 m_with_relu);
}
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include "ngraph/op/op.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv.hpp"
namespace ngraph
{
    namespace op
    {
        /// \brief Quantized convolution + bias forward prop for batched convolution
        ///        operation, optionally fused with relu.
        ///
        /// Inputs, in order: data batch, filters, bias, min/max of the input,
        /// min/max of the filters, and the frozen min/max of the output.
        /// Outputs: the quantized result (u8 when relu is fused, i8 otherwise)
        /// followed by two f32 tensors of Shape{1}.
        class QuantizedConvolutionBias : public Op
        {
        public:
            /// \brief Constructs the op from an existing quantized convolution.
            ///
            /// \param qconv     convolution whose inputs, attributes and ranges are reused
            /// \param bias      bias node
            /// \param with_relu whether relu is fused into the op
            QuantizedConvolutionBias(const std::shared_ptr<op::QuantizedConvolution>& qconv,
                                     const std::shared_ptr<Node>& bias,
                                     const bool with_relu = false);

            /// \brief Constructs the op from explicit inputs and attributes.
            ///
            /// \param data_batch              data batch node
            /// \param filters                 filters node
            /// \param bias                    bias node
            /// \param window_movement_strides window movement strides
            /// \param window_dilation_strides window dilation strides
            /// \param padding_below           padding-below sizes
            /// \param padding_above           padding-above sizes
            /// \param data_dilation_strides   data dilation strides
            /// \param min_input               minimum of the input range
            /// \param max_input               maximum of the input range
            /// \param min_filter              minimum of the filter range
            /// \param max_filter              maximum of the filter range
            /// \param min_freezed_output      minimum of the frozen output range
            /// \param max_freezed_output      maximum of the frozen output range
            /// \param with_relu               whether relu is fused into the op
            QuantizedConvolutionBias(const std::shared_ptr<Node>& data_batch,
                                     const std::shared_ptr<Node>& filters,
                                     const std::shared_ptr<Node>& bias,
                                     const Strides& window_movement_strides,
                                     const Strides& window_dilation_strides,
                                     const CoordinateDiff& padding_below,
                                     const CoordinateDiff& padding_above,
                                     const Strides& data_dilation_strides,
                                     const std::shared_ptr<Node> min_input,
                                     const std::shared_ptr<Node> max_input,
                                     const std::shared_ptr<Node> min_filter,
                                     const std::shared_ptr<Node> max_filter,
                                     const std::shared_ptr<Node> min_freezed_output,
                                     const std::shared_ptr<Node> max_freezed_output,
                                     const bool with_relu = false);

            const Strides& get_window_movement_strides() const { return m_window_movement_strides; }
            const Strides& get_window_dilation_strides() const { return m_window_dilation_strides; }
            const CoordinateDiff& get_padding_below() const { return m_padding_below; }
            const CoordinateDiff& get_padding_above() const { return m_padding_above; }
            const Strides& get_data_dilation_strides() const { return m_data_dilation_strides; }
            float get_input_min() const { return m_input_min; }
            float get_input_max() const { return m_input_max; }
            float get_filter_min() const { return m_filter_min; }
            float get_filter_max() const { return m_filter_max; }
            float get_freezed_output_min() const { return m_freezed_output_min; }
            float get_freezed_output_max() const { return m_freezed_output_max; }
            /// \return the bias input (argument 2)
            std::shared_ptr<Node> get_bias() { return get_argument(2); }
            /// \return the filters input (argument 1)
            std::shared_ptr<Node> get_filters() { return get_argument(1); }
            /// \return the data batch input (argument 0)
            std::shared_ptr<Node> get_data_batch() { return get_argument(0); }
            /// \return true when relu is fused into the op
            bool with_relu() const { return m_with_relu; }
            virtual std::shared_ptr<Node>
                copy_with_new_args(const NodeVector& new_args) const override;

        protected:
            Strides m_window_movement_strides;
            Strides m_window_dilation_strides;
            CoordinateDiff m_padding_below;
            CoordinateDiff m_padding_above;
            Strides m_data_dilation_strides;
            bool m_with_relu = false;
            // Cached quantization ranges. The constructors assign these in their
            // bodies; the initializers keep the members from ever holding
            // indeterminate values before that point.
            float m_input_min = 0.0f;
            float m_input_max = 0.0f;
            float m_filter_min = 0.0f;
            float m_filter_max = 0.0f;
            float m_freezed_output_min = 0.0f;
            float m_freezed_output_max = 0.0f;
        };
    }
}
......@@ -50,6 +50,7 @@
#include "ngraph/runtime/cpu/op/quantize.hpp"
#include "ngraph/runtime/cpu/op/quantized_avg_pool.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_bias.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_relu.hpp"
#include "ngraph/runtime/cpu/op/quantized_max_pool.hpp"
#include "ngraph/runtime/cpu/op/rnn.hpp"
......@@ -786,6 +787,20 @@ namespace ngraph
quantize->set_op_annotations(op_annotations);
}
}
// Marks a QuantizedConvolutionBias node as an MKLDNN op when input 0 is u8 and
// input 1 is i8; any other combination leaves the node untouched.
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::QuantizedConvolutionBias)
{
    if (!(node->get_input_element_type(0) == element::u8) ||
        !(node->get_input_element_type(1) == element::i8))
    {
        return;
    }

    auto mkldnn_annotations = std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
    mkldnn_annotations->set_mkldnn_op(true);

    auto qconv_bias_op = static_cast<op::QuantizedConvolutionBias*>(node);
    qconv_bias_op->set_op_annotations(mkldnn_annotations);
}
}
}
}
......@@ -856,6 +871,8 @@ static const runtime::cpu::pass::AssignOpMap s_dispatcher{
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::DequantizeCPU>},
{TI(ngraph::op::QuantizedConvolutionRelu),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::QuantizedConvolutionRelu>},
{TI(ngraph::op::QuantizedConvolutionBias),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::QuantizedConvolutionBias>},
};
bool runtime::cpu::pass::CPUAssignment::run_on_call_graph(
......
......@@ -58,6 +58,7 @@
#include "ngraph/runtime/cpu/op/quantize.hpp"
#include "ngraph/runtime/cpu/op/quantized_avg_pool.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_bias.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_relu.hpp"
#include "ngraph/runtime/cpu/op/quantized_max_pool.hpp"
#include "ngraph/runtime/cpu/op/rnn.hpp"
......@@ -482,6 +483,51 @@ namespace ngraph
}
}
// Layout assignment for QuantizedConvolutionBias.
//
// When the node runs on an MKLDNN kernel, ConvolutionLayout fills in the
// descriptors for the convolution inputs and the data output; the six range
// inputs (3..8) and the two range outputs (1..2) are appended with the default
// descriptor in format::x. Otherwise native layouts are used.
template <>
void CPULayout::LAYOUT_DECL(ngraph::op::QuantizedConvolutionBias)
{
    if (!mkldnn_utils::use_mkldnn_kernel(node.get()))
    {
        set_native_layouts(external_function, node);
        return;
    }

    vector<memory::desc> input_mds;
    vector<memory::desc> output_mds;
    ConvolutionLayout<ngraph::op::QuantizedConvolutionBias, true, false>(
        node, input_mds, output_mds);

    // Range inputs: min/max of input, filter and frozen output.
    for (size_t input_index = 3; input_index <= 8; ++input_index)
    {
        input_mds.push_back(mkldnn_utils::create_default_mkldnn_md(
            node.get(), input_index, false, memory::format::x));
    }

    // Range outputs: min and max of the result.
    for (size_t output_index = 1; output_index <= 2; ++output_index)
    {
        output_mds.push_back(mkldnn_utils::create_default_mkldnn_md(
            node.get(), output_index, true, memory::format::x));
    }

    node = insert_input_conversions(external_function, node, input_mds);
    set_output_layouts(node, output_mds);
}
template <>
void CPULayout::LAYOUT_DECL(ngraph::op::ConvolutionRelu)
{
......@@ -1891,6 +1937,8 @@ static const runtime::cpu::pass::LayoutOpMap s_dispatcher{
{TI(ngraph::op::QuantizeCPU), &runtime::cpu::pass::CPULayout::layout<ngraph::op::QuantizeCPU>},
{TI(ngraph::op::QuantizedConvolutionRelu),
&runtime::cpu::pass::CPULayout::layout<ngraph::op::QuantizedConvolutionRelu>},
{TI(ngraph::op::QuantizedConvolutionBias),
&runtime::cpu::pass::CPULayout::layout<ngraph::op::QuantizedConvolutionBias>},
};
bool runtime::cpu::pass::CPULayout::run_on_call_graph(const std::list<std::shared_ptr<Node>>& nodes)
......
......@@ -27,6 +27,7 @@
#include "ngraph/runtime/cpu/op/quantize.hpp"
#include "ngraph/runtime/cpu/op/quantized_avg_pool.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_bias.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_relu.hpp"
#include "ngraph/runtime/cpu/op/quantized_max_pool.hpp"
#include "util/all_close.hpp"
......@@ -429,3 +430,111 @@ TEST(quantize_cpu, quantizedConv2D_fused_relu)
EXPECT_EQ((vector<float>{20.0}), read_vector<float>(result_min));
EXPECT_EQ((vector<float>{-24.0}), read_vector<float>(result_max));
}
// Runs QuantizedConvolutionBias (no relu) on the CPU backend with a 3x4 u8
// input, a 3x3 i8 filter and a single i32 bias value, and checks the i8 result
// together with the min/max outputs (expected to equal the frozen output range).
TEST(quantize_cpu, quantizedConv2D_with_bias)
{
    const Shape data_shape{1, 1, 3, 4};
    const Shape filter_shape{1, 1, 3, 3};
    const Shape bias_shape{1};
    const Shape result_shape{1, 1, 3, 4};

    const vector<uint8_t> data_values = {1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4};
    const vector<int8_t> filter_values = {1, 2, 3, 4, 5, 0, 0, 1, 2};
    const vector<int32_t> bias_values = {5};

    auto data_param = make_shared<op::Parameter>(element::u8, data_shape);
    auto filter_param = make_shared<op::Parameter>(element::i8, filter_shape);
    auto bias_param = make_shared<op::Parameter>(element::i32, bias_shape);

    // Every range input is a one-element f32 constant.
    auto make_range = [](float value) {
        return op::Constant::create(element::f32, Shape{1}, {value});
    };
    auto input_min = make_range(0.0f);
    auto input_max = make_range(255.0f);
    auto filter_min = make_range(-127.0f);
    auto filter_max = make_range(127.0f);
    auto frozen_output_min = make_range(22.0f);
    auto frozen_output_max = make_range(90.0f);

    auto qconv_bias =
        make_shared<op::QuantizedConvolutionBias>(data_param,
                                                  filter_param,
                                                  bias_param,
                                                  Strides{1, 1},        // move_strides
                                                  Strides{1, 1},        // filter_dilation
                                                  CoordinateDiff{1, 1}, // below_pads
                                                  CoordinateDiff{1, 1}, // above_pads
                                                  Strides{1, 1},        // data_dilation
                                                  input_min,
                                                  input_max,
                                                  filter_min,
                                                  filter_max,
                                                  frozen_output_min,
                                                  frozen_output_max);

    auto out_data = std::make_shared<op::GetOutputElement>(qconv_bias, 0);
    auto out_min = std::make_shared<op::GetOutputElement>(qconv_bias, 1);
    auto out_max = std::make_shared<op::GetOutputElement>(qconv_bias, 2);
    auto function = make_shared<Function>(NodeVector{out_data, out_min, out_max},
                                          op::ParameterVector{data_param, filter_param, bias_param});

    auto backend = runtime::Backend::create("CPU");

    // Input tensors
    auto data_tensor = backend->create_tensor(element::u8, data_shape);
    copy_data(data_tensor, data_values);
    auto filter_tensor = backend->create_tensor(element::i8, filter_shape);
    copy_data(filter_tensor, filter_values);
    auto bias_tensor = backend->create_tensor(element::i32, bias_shape);
    copy_data(bias_tensor, bias_values);

    // Output tensors
    auto result = backend->create_tensor(element::i8, result_shape);
    auto result_min = backend->create_tensor(element::f32, Shape{1});
    auto result_max = backend->create_tensor(element::f32, Shape{1});

    backend->call_with_validate(
        function, {result, result_min, result_max}, {data_tensor, filter_tensor, bias_tensor});

    EXPECT_EQ((vector<int8_t>{38, 55, 50, 52, 61, 109, 127, 68, 54, 81, 68, 62}),
              read_vector<int8_t>(result));
    EXPECT_EQ((vector<float>{22.0}), read_vector<float>(result_min));
    EXPECT_EQ((vector<float>{90.0}), read_vector<float>(result_max));
}
// Runs QuantizedConvolutionBias with fused relu on the CPU backend with a 3x3
// u8 input, a 3x3 i8 filter and a single i32 bias value, and checks the u8
// result together with the min/max outputs (expected to equal the frozen
// output range constants passed in).
TEST(quantize_cpu, quantizedConv2D_with_bias_and_relu)
{
    const Shape data_shape{1, 1, 3, 3};
    const Shape filter_shape{1, 1, 3, 3};
    const Shape bias_shape{1};
    const Shape result_shape{1, 1, 3, 3};

    const vector<uint8_t> data_values = {1, 2, 3, 4, 5, 6, 7, 8, 9};
    const vector<int8_t> filter_values = {1, 2, 1, 0, 0, 0, -1, -2, -1};
    const vector<int32_t> bias_values = {5};

    auto data_param = make_shared<op::Parameter>(element::u8, data_shape);
    auto filter_param = make_shared<op::Parameter>(element::i8, filter_shape);
    auto bias_param = make_shared<op::Parameter>(element::i32, bias_shape);

    // Every range input is a one-element f32 constant.
    auto make_range = [](float value) {
        return op::Constant::create(element::f32, Shape{1}, {value});
    };
    auto input_min = make_range(0.0f);
    auto input_max = make_range(255.0f);
    auto filter_min = make_range(-127.0f);
    auto filter_max = make_range(127.0f);
    auto frozen_output_min = make_range(20.0f);
    auto frozen_output_max = make_range(-24.0f);

    auto qconv_bias_relu =
        make_shared<op::QuantizedConvolutionBias>(data_param,
                                                  filter_param,
                                                  bias_param,
                                                  Strides{1, 1},        // move_strides
                                                  Strides{1, 1},        // filter_dilation
                                                  CoordinateDiff{1, 1}, // below_pads
                                                  CoordinateDiff{1, 1}, // above_pads
                                                  Strides{1, 1},        // data_dilation
                                                  input_min,
                                                  input_max,
                                                  filter_min,
                                                  filter_max,
                                                  frozen_output_min,
                                                  frozen_output_max,
                                                  true);

    auto out_data = std::make_shared<op::GetOutputElement>(qconv_bias_relu, 0);
    auto out_min = std::make_shared<op::GetOutputElement>(qconv_bias_relu, 1);
    auto out_max = std::make_shared<op::GetOutputElement>(qconv_bias_relu, 2);
    auto function = make_shared<Function>(NodeVector{out_data, out_min, out_max},
                                          op::ParameterVector{data_param, filter_param, bias_param});

    auto backend = runtime::Backend::create("CPU");

    // Input tensors
    auto data_tensor = backend->create_tensor(element::u8, data_shape);
    copy_data(data_tensor, data_values);
    auto filter_tensor = backend->create_tensor(element::i8, filter_shape);
    copy_data(filter_tensor, filter_values);
    auto bias_tensor = backend->create_tensor(element::i32, bias_shape);
    copy_data(bias_tensor, bias_values);

    // Output tensors
    auto result = backend->create_tensor(element::u8, result_shape);
    auto result_min = backend->create_tensor(element::f32, Shape{1});
    auto result_max = backend->create_tensor(element::f32, Shape{1});

    backend->call_with_validate(
        function, {result, result_min, result_max}, {data_tensor, filter_tensor, bias_tensor});

    EXPECT_EQ((vector<uint8_t>{0, 0, 0, 0, 0, 0, 96, 133, 117}), read_vector<uint8_t>(result));
    EXPECT_EQ((vector<float>{20.0}), read_vector<float>(result_min));
    EXPECT_EQ((vector<float>{-24.0}), read_vector<float>(result_max));
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment