Commit 4dabd001 authored by Nishant Patel, committed by Robert Kimball

Add Quantized conv+relu (#1664)

parent f8a084ac
@@ -27,7 +27,6 @@ set(SRC
cpu_tensor_view.cpp
cpu_tracing.cpp
cpu_visualize_tree.cpp
quantization_util.cpp
builder/add.cpp
builder/allreduce.cpp
builder/avg_pool.cpp
@@ -95,6 +94,7 @@ set(SRC
op/max_pool_with_indices.cpp
op/quantized_max_pool.cpp
op/quantized_avg_pool.cpp
op/quantized_conv_relu.cpp
op/rnn.cpp
op/sigmoid_mul.cpp
op/conv_add.cpp
......
@@ -19,6 +19,7 @@
#include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/mkldnn_invoke.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_relu.hpp"
using namespace std;
using namespace ngraph;
@@ -67,7 +68,49 @@ namespace ngraph
throw ngraph_error("unsupported parameters for QuantizedConvolution via DEX");
}
}
template <>
void Builder::BUILDER_DECL(ngraph::op::QuantizedConvolutionRelu)
{
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
auto qconvolution_relu =
static_cast<const ngraph::op::QuantizedConvolutionRelu*>(node);
auto& functors = external_function->get_functors();
auto& arg0_tensor = external_function->get_tensor_data(args[0].get_name());
auto& arg1_tensor = external_function->get_tensor_data(args[1].get_name());
auto& out_tensor = external_function->get_tensor_data(out[0].get_name());
auto& out1_tensor = external_function->get_tensor_data(out[1].get_name());
auto& out2_tensor = external_function->get_tensor_data(out[2].get_name());
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto conv_index =
mkldnn_emitter->build_convolution<ngraph::op::QuantizedConvolutionRelu>(
node, args, out);
auto& deps = mkldnn_emitter->get_primitive_deps(conv_index);
float min_freezed_output = qconvolution_relu->get_freezed_output_min();
float max_freezed_output = qconvolution_relu->get_freezed_output_max();
auto functor = [&, conv_index, min_freezed_output, max_freezed_output](
CPURuntimeContext* ctx) {
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], arg1_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[2], out_tensor);
*(static_cast<float*>(out1_tensor)) = min_freezed_output;
*(static_cast<float*>(out2_tensor)) = max_freezed_output;
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, conv_index);
};
functors.emplace_back(functor);
}
else
{
throw ngraph_error(
"unsupported parameters for QuantizedConvolutionRelu via DEX");
}
}
REGISTER_OP_BUILDER(QuantizedConvolution);
REGISTER_OP_BUILDER(QuantizedConvolutionRelu);
}
}
}
@@ -109,6 +109,7 @@
#include "ngraph/runtime/cpu/op/matmul_bias.hpp"
#include "ngraph/runtime/cpu/op/max_pool_with_indices.hpp"
#include "ngraph/runtime/cpu/op/quantized_avg_pool.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_relu.hpp"
#include "ngraph/runtime/cpu/op/quantized_max_pool.hpp"
#include "ngraph/runtime/cpu/op/rnn.hpp"
#include "ngraph/runtime/cpu/op/sigmoid.hpp"
@@ -2657,6 +2658,38 @@ namespace ngraph
}
}
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::QuantizedConvolutionRelu)
{
auto qconvolution_relu =
static_cast<const ngraph::op::QuantizedConvolutionRelu*>(node);
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto conv_index =
mkldnn_emitter->build_convolution<ngraph::op::QuantizedConvolutionRelu>(
node, args, out);
auto& deps = mkldnn_emitter->get_primitive_deps(conv_index);
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[0])
<< ", " << args[0].get_name() << ");\n";
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[1])
<< ", " << args[1].get_name() << ");\n";
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[2])
<< ", " << out[0].get_name() << ");\n";
writer << "*(" << out[1].get_name()
<< ") = " << qconvolution_relu->get_freezed_output_min() << ";\n";
writer << "*(" << out[2].get_name()
<< ") = " << qconvolution_relu->get_freezed_output_max() << ";\n";
writer << "cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, "
<< to_string(conv_index) << ");\n";
}
else
{
throw ngraph_error("unsupported parameters for QuantizedConvolutionRelu");
}
}
template <>
void CPU_Emitter::EMITTER_DECL(ngraph::op::QuantizedConvolution)
{
......
@@ -152,6 +152,7 @@
#include "ngraph/runtime/cpu/op/quantize.hpp"
#include "ngraph/runtime/cpu/op/quantized_avg_pool.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_relu.hpp"
#include "ngraph/runtime/cpu/op/quantized_max_pool.hpp"
#include "ngraph/runtime/cpu/op/rnn.hpp"
#include "ngraph/runtime/cpu/op/sigmoid.hpp"
@@ -310,6 +311,8 @@ static const runtime::cpu::OpMap dispatcher{
{TI(ngraph::op::ConvolutionRelu), &runtime::cpu::CPU_Emitter::emit<op::ConvolutionRelu>},
{TI(ngraph::op::QuantizedConvolution),
&runtime::cpu::CPU_Emitter::emit<op::QuantizedConvolution>},
{TI(ngraph::op::QuantizedConvolutionRelu),
&runtime::cpu::CPU_Emitter::emit<op::QuantizedConvolutionRelu>},
{TI(ngraph::op::ConvolutionBiasAdd), &runtime::cpu::CPU_Emitter::emit<op::ConvolutionBiasAdd>},
// conv+bias backprop for data shares the same implementation as ConvolutionBackpropData
{TI(ngraph::op::ConvolutionBiasBackpropFiltersBias),
......
@@ -281,7 +281,8 @@ size_t MKLDNNEmitter::build_quantized_convolution(const mkldnn::memory::desc& in
const ngraph::Strides& dilation_strides,
const ngraph::CoordinateDiff& padding_below,
const ngraph::CoordinateDiff& padding_above,
const float scale)
const float scale,
const mkldnn::post_ops& pops)
{
size_t input_data_index = build_memory_primitive(input_data_desc);
size_t weights_index = build_memory_primitive(weights_desc);
@@ -289,6 +290,7 @@ size_t MKLDNNEmitter::build_quantized_convolution(const mkldnn::memory::desc& in
std::vector<float> output_scale;
output_scale.push_back(scale);
mkldnn::primitive_attr conv_attr;
conv_attr.set_post_ops(pops);
/* Specify the rounding mode */
conv_attr.set_int_output_round_mode(mkldnn::round_mode::round_nearest);
/* Specify the scales array and corresponding mask */
......
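
The new pops parameter is how the fused ReLU reaches the quantized convolution primitive via conv_attr.set_post_ops(pops). The call site that assembles it is elided from this hunk; with the MKL-DNN 0.x post-ops API it would presumably look like the following sketch (function name and values are illustrative, not taken from this commit):

#include <mkldnn.hpp>

// Sketch (not from this commit): assemble the ReLU post-op that
// build_quantized_convolution now accepts through its pops parameter.
static mkldnn::post_ops make_relu_post_ops()
{
    mkldnn::post_ops pops;
    const float ops_scale = 1.f; // scale applied to the post-op output
    const float ops_alpha = 0.f; // ReLU negative slope
    const float ops_beta = 0.f;  // unused by eltwise_relu
    pops.append_eltwise(ops_scale, mkldnn::algorithm::eltwise_relu, ops_alpha, ops_beta);
    return pops;
}
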
@@ -33,6 +33,7 @@
#include "ngraph/runtime/cpu/op/conv_bias.hpp"
#include "ngraph/runtime/cpu/op/conv_relu.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_relu.hpp"
#include "ngraph/runtime/cpu/quantization_util.hpp"
#include "ngraph/shape.hpp"
#include "ngraph/strides.hpp"
@@ -106,14 +107,16 @@ namespace ngraph
const ngraph::CoordinateDiff& padding_above,
const mkldnn::post_ops& pops = mkldnn::post_ops());
size_t build_quantized_convolution(const mkldnn::memory::desc& input_data_desc,
const mkldnn::memory::desc& weights_desc,
const mkldnn::memory::desc& result_desc,
const ngraph::Strides& strides,
const ngraph::Strides& dilation_strides,
const ngraph::CoordinateDiff& padding_below,
const ngraph::CoordinateDiff& padding_above,
const float scale);
size_t
build_quantized_convolution(const mkldnn::memory::desc& input_data_desc,
const mkldnn::memory::desc& weights_desc,
const mkldnn::memory::desc& result_desc,
const ngraph::Strides& strides,
const ngraph::Strides& dilation_strides,
const ngraph::CoordinateDiff& padding_below,
const ngraph::CoordinateDiff& padding_above,
const float scale,
const mkldnn::post_ops& pops = mkldnn::post_ops());
template <typename OP>
size_t build_convolution(const ngraph::Node* node,
@@ -170,6 +173,10 @@ namespace ngraph
{
return true;
}
if (dynamic_cast<const ngraph::op::QuantizedConvolutionRelu*>(node))
{
return true;
}
return false;
};
@@ -198,7 +205,24 @@ namespace ngraph
}
else if (std::is_same<OP, ngraph::op::QuantizedConvolution>())
{
const float scale = quantization_util::get_scale(node);
const float scale =
quantization_util::get_scale<ngraph::op::QuantizedConvolution>(node);
return build_quantized_convolution(
data_desc,
weights_desc,
result_desc,
convolution->get_window_movement_strides(),
window_dilation_strides_adjusted,
convolution->get_padding_below(),
convolution->get_padding_above(),
scale,
ops);
}
else if (std::is_same<OP, ngraph::op::QuantizedConvolutionRelu>())
{
const float scale =
quantization_util::get_scale<ngraph::op::QuantizedConvolutionRelu>(
node);
return build_quantized_convolution(
data_desc,
weights_desc,
@@ -207,7 +231,8 @@ namespace ngraph
window_dilation_strides_adjusted,
convolution->get_padding_below(),
convolution->get_padding_above(),
scale);
scale,
ops);
}
else
{
......
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include <numeric>
#include "ngraph/op/constant.hpp"
#include "ngraph/op/get_output_element.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_relu.hpp"
#include "ngraph/util.hpp"
using namespace std;
using namespace ngraph;
op::QuantizedConvolutionRelu::QuantizedConvolutionRelu(
const std::shared_ptr<op::QuantizedConvolution>& qconv)
: Op("QuantizedConvolutionRelu",
check_single_output_args({qconv->get_argument(0),
qconv->get_argument(1),
qconv->get_argument(2),
qconv->get_argument(3),
qconv->get_argument(4),
qconv->get_argument(5),
qconv->get_argument(6),
qconv->get_argument(7)}))
, m_window_movement_strides(qconv->get_window_movement_strides())
, m_window_dilation_strides(qconv->get_window_dilation_strides())
, m_padding_below(qconv->get_padding_below())
, m_padding_above(qconv->get_padding_above())
, m_data_dilation_strides(qconv->get_data_dilation_strides())
{
constructor_validate_and_infer_types();
this->m_input_min = qconv->get_input_min();
this->m_input_max = qconv->get_input_max();
this->m_filter_min = qconv->get_filter_min();
this->m_filter_max = qconv->get_filter_max();
this->m_freezed_output_min = qconv->get_freezed_output_min();
this->m_freezed_output_max = qconv->get_freezed_output_max();
set_output_size(3);
set_output_type(0, element::u8, qconv->get_shape());
set_output_type(1, element::f32, Shape{1});
set_output_type(2, element::f32, Shape{1});
}
op::QuantizedConvolutionRelu::QuantizedConvolutionRelu(
const std::shared_ptr<Node>& data_batch,
const std::shared_ptr<Node>& filters,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const CoordinateDiff& padding_below,
const CoordinateDiff& padding_above,
const Strides& data_dilation_strides,
const std::shared_ptr<Node> min_input,
const std::shared_ptr<Node> max_input,
const std::shared_ptr<Node> min_filter,
const std::shared_ptr<Node> max_filter,
const std::shared_ptr<Node> min_freezed_output,
const std::shared_ptr<Node> max_freezed_output)
: Op("QuantizedConvolutionRelu",
check_single_output_args({data_batch,
filters,
min_input,
max_input,
min_filter,
max_filter,
min_freezed_output,
max_freezed_output}))
, m_window_movement_strides(window_movement_strides)
, m_window_dilation_strides(window_dilation_strides)
, m_padding_below(padding_below)
, m_padding_above(padding_above)
, m_data_dilation_strides(data_dilation_strides)
{
constructor_validate_and_infer_types();
auto& data_batch_shape = data_batch->get_shape();
auto& filters_shape = filters->get_shape();
auto min_input_const_op = std::static_pointer_cast<ngraph::op::Constant>(min_input);
auto max_input_const_op = std::static_pointer_cast<ngraph::op::Constant>(max_input);
auto min_filter_const_op = std::static_pointer_cast<ngraph::op::Constant>(min_filter);
auto max_filter_const_op = std::static_pointer_cast<ngraph::op::Constant>(max_filter);
auto min_freezed_output_const_op =
std::static_pointer_cast<ngraph::op::Constant>(min_freezed_output);
auto max_freezed_output_const_op =
std::static_pointer_cast<ngraph::op::Constant>(max_freezed_output);
float input_min = *(static_cast<float const*>(min_input_const_op->get_data_ptr()));
float input_max = *(static_cast<float const*>(max_input_const_op->get_data_ptr()));
float filter_min = *(static_cast<float const*>(min_filter_const_op->get_data_ptr()));
float filter_max = *(static_cast<float const*>(max_filter_const_op->get_data_ptr()));
float output_min = *(static_cast<float const*>(min_freezed_output_const_op->get_data_ptr()));
float output_max = *(static_cast<float const*>(max_freezed_output_const_op->get_data_ptr()));
this->m_input_min = input_min;
this->m_input_max = input_max;
this->m_filter_min = filter_min;
this->m_filter_max = filter_max;
this->m_freezed_output_min = output_min;
this->m_freezed_output_max = output_max;
set_output_size(3);
set_output_type(0,
element::u8,
util::infer_convolution_output_shape(this,
data_batch_shape,
filters_shape,
window_movement_strides,
window_dilation_strides,
padding_below,
padding_above,
data_dilation_strides,
0, /* batch_axis_data, */
1, /* input_channel_axis_data, */
1, /* input_channel_axis_filters, */
0, /* output_channel_axis_filters, */
0, /* batch_axis_result, */
1 /* output_channel_axis_result, */
));
set_output_type(1, element::f32, Shape{1});
set_output_type(2, element::f32, Shape{1});
}
std::shared_ptr<Node>
op::QuantizedConvolutionRelu::copy_with_new_args(const NodeVector& new_args) const
{
if (new_args.size() != 8)
{
throw ngraph_error("Incorrect number of new arguments");
}
return std::shared_ptr<Node>(new QuantizedConvolutionRelu(new_args.at(0),
new_args.at(1),
get_window_movement_strides(),
get_window_dilation_strides(),
get_padding_below(),
get_padding_above(),
get_data_dilation_strides(),
new_args.at(2),
new_args.at(3),
new_args.at(4),
new_args.at(5),
new_args.at(6),
new_args.at(7)));
}
//*****************************************************************************
// Copyright 2017-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#pragma once
#include "ngraph/op/op.hpp"
#include "ngraph/runtime/cpu/op/conv_bias.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv.hpp"
namespace ngraph
{
namespace op
{
/// \brief Fused Relu(Convolution) forward prop for a batched quantized convolution operation.
class QuantizedConvolutionRelu : public Op
{
public:
QuantizedConvolutionRelu(const std::shared_ptr<op::QuantizedConvolution>& qconv);
QuantizedConvolutionRelu(const std::shared_ptr<Node>& data_batch,
const std::shared_ptr<Node>& filters,
const Strides& window_movement_strides,
const Strides& window_dilation_strides,
const CoordinateDiff& padding_below,
const CoordinateDiff& padding_above,
const Strides& data_dilation_strides,
const std::shared_ptr<Node> min_input,
const std::shared_ptr<Node> max_input,
const std::shared_ptr<Node> min_filter,
const std::shared_ptr<Node> max_filter,
const std::shared_ptr<Node> min_freezed_output,
const std::shared_ptr<Node> max_freezed_output);
const Strides& get_window_movement_strides() const { return m_window_movement_strides; }
const Strides& get_window_dilation_strides() const { return m_window_dilation_strides; }
const CoordinateDiff& get_padding_below() const { return m_padding_below; }
const CoordinateDiff& get_padding_above() const { return m_padding_above; }
const Strides& get_data_dilation_strides() const { return m_data_dilation_strides; }
float get_input_min() const { return m_input_min; }
float get_input_max() const { return m_input_max; }
float get_filter_min() const { return m_filter_min; }
float get_filter_max() const { return m_filter_max; }
float get_freezed_output_min() const { return m_freezed_output_min; }
float get_freezed_output_max() const { return m_freezed_output_max; }
std::shared_ptr<Node> get_filters() { return get_argument(1); }
std::shared_ptr<Node> get_data_batch() { return get_argument(0); }
virtual std::shared_ptr<Node>
copy_with_new_args(const NodeVector& new_args) const override;
protected:
Strides m_window_movement_strides;
Strides m_window_dilation_strides;
CoordinateDiff m_padding_below;
CoordinateDiff m_padding_above;
Strides m_data_dilation_strides;
float m_input_min;
float m_input_max;
float m_filter_min;
float m_filter_max;
float m_freezed_output_min;
float m_freezed_output_max;
};
}
}
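
The constructor that takes an existing QuantizedConvolution hints at the intended fusion flow: match a Relu applied to output 0 of a QuantizedConvolution and rebuild it as the fused op. This commit does not include such a pass; a hypothetical sketch of the rewrite step, with qconv and relu_match assumed to come from a pattern matcher (includes for the ops and ngraph::replace_node assumed):

// Hypothetical fusion rewrite; names are illustrative, not from this commit.
static void fuse_qconv_relu(const std::shared_ptr<ngraph::op::QuantizedConvolution>& qconv,
                            const std::shared_ptr<ngraph::Node>& relu_match)
{
    auto qconv_relu = std::make_shared<ngraph::op::QuantizedConvolutionRelu>(qconv);
    // Output 0 of the fused op stands in for the matched Relu's single output;
    // outputs 1 and 2 (the frozen min/max) are re-exposed via GetOutputElement.
    auto data = std::make_shared<ngraph::op::GetOutputElement>(qconv_relu, 0);
    ngraph::replace_node(relu_match, data);
}
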
@@ -50,6 +50,7 @@
#include "ngraph/runtime/cpu/op/quantize.hpp"
#include "ngraph/runtime/cpu/op/quantized_avg_pool.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_relu.hpp"
#include "ngraph/runtime/cpu/op/quantized_max_pool.hpp"
#include "ngraph/runtime/cpu/op/rnn.hpp"
#include "ngraph/runtime/cpu/op/sigmoid.hpp"
@@ -759,6 +760,20 @@ namespace ngraph
}
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::QuantizedConvolutionRelu)
{
if (node->get_input_element_type(0) == element::u8 &&
node->get_input_element_type(1) == element::i8)
{
auto quantized_conv_relu = static_cast<op::QuantizedConvolutionRelu*>(node);
auto op_annotations =
std::make_shared<ngraph::runtime::cpu::CPUOpAnnotations>();
op_annotations->set_mkldnn_op(true);
quantized_conv_relu->set_op_annotations(op_annotations);
}
}
template <>
void CPUAssignment::ASSIGN_DECL(ngraph::op::Quantize)
{
@@ -838,6 +853,8 @@ static const runtime::cpu::pass::AssignOpMap s_dispatcher{
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::ConvolutionAdd>},
{TI(ngraph::op::Dequantize),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::Dequantize>},
{TI(ngraph::op::QuantizedConvolutionRelu),
&runtime::cpu::pass::CPUAssignment::assign<ngraph::op::QuantizedConvolutionRelu>},
};
bool runtime::cpu::pass::CPUAssignment::run_on_call_graph(
......
@@ -58,6 +58,7 @@
#include "ngraph/runtime/cpu/op/quantize.hpp"
#include "ngraph/runtime/cpu/op/quantized_avg_pool.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_relu.hpp"
#include "ngraph/runtime/cpu/op/quantized_max_pool.hpp"
#include "ngraph/runtime/cpu/op/rnn.hpp"
@@ -499,6 +500,51 @@ namespace ngraph
}
}
template <>
void CPULayout::LAYOUT_DECL(ngraph::op::QuantizedConvolutionRelu)
{
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node.get()))
{
vector<memory::desc> i_mds;
vector<memory::desc> o_mds;
ConvolutionLayout<ngraph::op::QuantizedConvolutionRelu, false, false>(
node, i_mds, o_mds);
auto min_input_md = mkldnn_utils::create_default_mkldnn_md(
node.get(), 2, false, memory::format::x);
auto max_input_md = mkldnn_utils::create_default_mkldnn_md(
node.get(), 3, false, memory::format::x);
auto min_filter_md = mkldnn_utils::create_default_mkldnn_md(
node.get(), 4, false, memory::format::x);
auto max_filter_md = mkldnn_utils::create_default_mkldnn_md(
node.get(), 5, false, memory::format::x);
auto min_freezed_output_md = mkldnn_utils::create_default_mkldnn_md(
node.get(), 6, false, memory::format::x);
auto max_freezed_output_md = mkldnn_utils::create_default_mkldnn_md(
node.get(), 7, false, memory::format::x);
auto min_output_md = mkldnn_utils::create_default_mkldnn_md(
node.get(), 1, true, memory::format::x);
auto max_output_md = mkldnn_utils::create_default_mkldnn_md(
node.get(), 2, true, memory::format::x);
i_mds.push_back(min_input_md);
i_mds.push_back(max_input_md);
i_mds.push_back(min_filter_md);
i_mds.push_back(max_filter_md);
i_mds.push_back(min_freezed_output_md);
i_mds.push_back(max_freezed_output_md);
o_mds.push_back(min_output_md);
o_mds.push_back(max_output_md);
node = insert_input_conversions(external_function, node, i_mds);
set_output_layouts(node, o_mds);
}
else
{
set_native_layouts(external_function, node);
}
}
template <>
void CPULayout::LAYOUT_DECL(ngraph::op::ConvolutionBiasAdd)
{
@@ -1842,6 +1888,8 @@ static const runtime::cpu::pass::LayoutOpMap s_dispatcher{
{TI(ngraph::op::Dequantize), &runtime::cpu::pass::CPULayout::layout<ngraph::op::Dequantize>},
{TI(ngraph::op::Slice), &runtime::cpu::pass::CPULayout::layout<ngraph::op::Slice>},
{TI(ngraph::op::Quantize), &runtime::cpu::pass::CPULayout::layout<ngraph::op::Quantize>},
{TI(ngraph::op::QuantizedConvolutionRelu),
&runtime::cpu::pass::CPULayout::layout<ngraph::op::QuantizedConvolutionRelu>},
};
bool runtime::cpu::pass::CPULayout::run_on_call_graph(const std::list<std::shared_ptr<Node>>& nodes)
......
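
For orientation, the index arguments passed to create_default_mkldnn_md in the layout handler above follow the op's port order, which can be read off the constructor and set_output_type calls earlier in this commit:

// QuantizedConvolutionRelu port map (derived from the op definition above):
//   inputs : 0 data (u8), 1 filters (i8), 2 min_input, 3 max_input,
//            4 min_filter, 5 max_filter, 6 min_freezed_output,
//            7 max_freezed_output   -- the six scalars use memory::format::x
//   outputs: 0 data (u8), 1 min (f32, Shape{1}), 2 max (f32, Shape{1})
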
/*******************************************************************************
* Copyright 2018 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include "quantization_util.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv.hpp"
namespace ngraph
{
namespace runtime
{
namespace cpu
{
namespace quantization_util
{
float get_scale(const ngraph::Node* node)
{
auto qconvolution = static_cast<const ngraph::op::QuantizedConvolution*>(node);
float min_out_value;
float max_out_value;
quantization_range_for_multiplication<uint8_t, int8_t, int32_t>(
qconvolution->get_input_min(),
qconvolution->get_input_max(),
qconvolution->get_filter_min(),
qconvolution->get_filter_max(),
&min_out_value,
&max_out_value);
const float max_abs32 =
std::max(std::abs(min_out_value), std::abs(max_out_value));
const float max_abs8 =
std::max(std::abs(qconvolution->get_freezed_output_min()),
std::abs(qconvolution->get_freezed_output_max()));
// Output is signed int.
// s32 = f32 * std::pow(2, 31)/ max_abs32;
// s8 = f32 * std::pow(2, 7)/ max_abs8;
// s8 = s32 * std::pow(2, -24) * max_abs32 / max_abs8;
const float scale = static_cast<float>(
(std::pow(2, -24) * static_cast<double>(max_abs32 / max_abs8)));
return scale;
}
}
}
}
}
@@ -89,7 +89,32 @@ namespace ngraph
quant_util.push_back(scale);
}
float get_scale(const ngraph::Node* node);
template <typename OP>
float get_scale(const ngraph::Node* node)
{
auto qconvolution = static_cast<const OP*>(node);
float min_out_value;
float max_out_value;
quantization_range_for_multiplication<uint8_t, int8_t, int32_t>(
qconvolution->get_input_min(),
qconvolution->get_input_max(),
qconvolution->get_filter_min(),
qconvolution->get_filter_max(),
&min_out_value,
&max_out_value);
const float max_abs32 =
std::max(std::abs(min_out_value), std::abs(max_out_value));
const float max_abs8 =
std::max(std::abs(qconvolution->get_freezed_output_min()),
std::abs(qconvolution->get_freezed_output_max()));
// Output is signed int.
// s32 = f32 * std::pow(2, 31)/ max_abs32;
// s8 = f32 * std::pow(2, 7)/ max_abs8;
// s8 = s32 * std::pow(2, -24) * max_abs32 / max_abs8;
const float scale = static_cast<float>(
(std::pow(2, -24) * static_cast<double>(max_abs32 / max_abs8)));
return scale;
}
}
}
}
......
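
For readers tracing the 2^-24 constant in get_scale: it falls out of relating the two symmetric fixed-point encodings named in the comment. Spelling out the algebra (a restatement of the in-code comment, nothing beyond it):

// Let r be the real value carried by the int32 accumulator:
//   s32 = r * 2^31 / max_abs32   =>   r = s32 * max_abs32 / 2^31
//   s8  = r * 2^7  / max_abs8
// Substituting r into the second line:
//   s8 = s32 * (max_abs32 / 2^31) * (2^7 / max_abs8)
//      = s32 * 2^(7-31) * max_abs32 / max_abs8
//      = s32 * 2^-24    * max_abs32 / max_abs8
// which is exactly the requantization scale handed to MKL-DNN:
//   scale = 2^-24 * max_abs32 / max_abs8
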
@@ -27,6 +27,7 @@
#include "ngraph/runtime/cpu/op/quantize.hpp"
#include "ngraph/runtime/cpu/op/quantized_avg_pool.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv.hpp"
#include "ngraph/runtime/cpu/op/quantized_conv_relu.hpp"
#include "ngraph/runtime/cpu/op/quantized_max_pool.hpp"
#include "util/all_close.hpp"
#include "util/all_close_f.hpp"
@@ -331,3 +332,100 @@ TEST(quantize_cpu, quantize_to_int8)
EXPECT_EQ((vector<float>{-127}), read_vector<float>(result_min));
EXPECT_EQ((vector<float>{127}), read_vector<float>(result_max));
}
TEST(quantize_cpu, quantizedConv2D_with_relu)
{
Shape shape_a{1, 1, 3, 4}; // input shape
Shape shape_b{1, 1, 3, 3}; // filter shape
Shape shape_r{1, 1, 3, 4}; // output shape
vector<uint8_t> a_data = {1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4};
vector<int8_t> b_data = {1, 2, 3, 4, 5, 0, 0, 1, 2};
auto A = make_shared<op::Parameter>(element::u8, shape_a);
auto B = make_shared<op::Parameter>(element::i8, shape_b);
auto C = op::Constant::create(element::f32, Shape{1}, {0.0f});
auto D = op::Constant::create(element::f32, Shape{1}, {255.0f});
auto E = op::Constant::create(element::f32, Shape{1}, {-127.0f});
auto F = op::Constant::create(element::f32, Shape{1}, {127.0f});
auto G = op::Constant::create(element::f32, Shape{1}, {22.0f});
auto H = op::Constant::create(element::f32, Shape{1}, {90.0f});
auto CV = make_shared<op::QuantizedConvolutionRelu>(A,
B,
Strides{1, 1}, // move_strides
Strides{1, 1}, // filter_dilation
CoordinateDiff{1, 1}, // below_pads
CoordinateDiff{1, 1}, // above_pads
Strides{1, 1}, // data_dilation
C,
D,
E,
F,
G,
H);
auto output_data = std::make_shared<op::GetOutputElement>(CV, 0);
auto output_min = std::make_shared<op::GetOutputElement>(CV, 1);
auto output_max = std::make_shared<op::GetOutputElement>(CV, 2);
auto f = make_shared<Function>(NodeVector{output_data, output_min, output_max},
op::ParameterVector{A, B});
auto backend = runtime::Backend::create("CPU");
// Create some tensors for input/output
auto a = backend->create_tensor(element::u8, shape_a);
copy_data(a, a_data);
auto b = backend->create_tensor(element::i8, shape_b);
copy_data(b, b_data);
auto result = backend->create_tensor(element::u8, shape_r);
auto result_min = backend->create_tensor(element::f32, Shape{1});
auto result_max = backend->create_tensor(element::f32, Shape{1});
backend->call_with_validate(f, {result, result_min, result_max}, {a, b});
EXPECT_EQ((vector<uint8_t>{31, 48, 42, 45, 54, 102, 127, 61, 47, 74, 61, 55}),
read_vector<uint8_t>(result));
EXPECT_EQ((vector<float>{22.0}), read_vector<float>(result_min));
EXPECT_EQ((vector<float>{90.0}), read_vector<float>(result_max));
}
TEST(quantize_cpu, quantizedConv2D_fused_relu)
{
Shape shape_a{1, 1, 3, 3}; // input shape
Shape shape_b{1, 1, 3, 3}; // filter shape
Shape shape_r{1, 1, 3, 3}; // output shape
vector<uint8_t> a_data = {1, 2, 3, 4, 5, 6, 7, 8, 9};
vector<int8_t> b_data = {1, 2, 1, 0, 0, 0, -1, -2, -1};
auto A = make_shared<op::Parameter>(element::u8, shape_a);
auto B = make_shared<op::Parameter>(element::i8, shape_b);
auto C = op::Constant::create(element::f32, Shape{1}, {0.0f});
auto D = op::Constant::create(element::f32, Shape{1}, {255.0f});
auto E = op::Constant::create(element::f32, Shape{1}, {-127.0f});
auto F = op::Constant::create(element::f32, Shape{1}, {127.0f});
auto G = op::Constant::create(element::f32, Shape{1}, {20.0f});
auto H = op::Constant::create(element::f32, Shape{1}, {-24.0f});
auto CV = make_shared<op::QuantizedConvolutionRelu>(A,
B,
Strides{1, 1}, // move_strides
Strides{1, 1}, // filter_dilation
CoordinateDiff{1, 1}, // below_pads
CoordinateDiff{1, 1}, // above_pads
Strides{1, 1}, // data_dilation
C,
D,
E,
F,
G,
H);
auto output_data = std::make_shared<op::GetOutputElement>(CV, 0);
auto output_min = std::make_shared<op::GetOutputElement>(CV, 1);
auto output_max = std::make_shared<op::GetOutputElement>(CV, 2);
auto f = make_shared<Function>(NodeVector{output_data, output_min, output_max},
op::ParameterVector{A, B});
auto backend = runtime::Backend::create("CPU");
// Create some tensors for input/output
auto a = backend->create_tensor(element::u8, shape_a);
copy_data(a, a_data);
auto b = backend->create_tensor(element::i8, shape_b);
copy_data(b, b_data);
auto result = backend->create_tensor(element::u8, shape_r);
auto result_min = backend->create_tensor(element::f32, Shape{1});
auto result_max = backend->create_tensor(element::f32, Shape{1});
backend->call_with_validate(f, {result, result_min, result_max}, {a, b});
EXPECT_EQ((vector<uint8_t>{0, 0, 0, 0, 0, 0, 69, 106, 90}), read_vector<uint8_t>(result));
EXPECT_EQ((vector<float>{20.0}), read_vector<float>(result_min));
EXPECT_EQ((vector<float>{-24.0}), read_vector<float>(result_max));
}