Commit 7e310e20 authored by Nishant Patel, committed by Robert Kimball

Support dynamic scales for Qconv's and Dequantize (#2171)

* Support dynamic scales for Qconv's and Dequantize

* Remove constant folding

* add additional dynamic_quantize unittest

* add another mxnet quantize unittest

* add additional dynamic_dequantize tests

* fix shape error

* add dynamic signed_quantize unittest

* Pass correct scale

* Refactoring

* Added dynamic scale support for QCBA and QCBSA

* Refactor to create MKLDNN primitives on the first iteration

* remove stray code

* unused variables

* remove extraneous line
parent c153ea8a
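
The recurring pattern in this change, distilled into a self-contained sketch (the types and helper names below are illustrative stand-ins, not the real nGraph/MKLDNN API): when a scale arrives as a runtime tensor instead of a Constant, the builder emits a functor that defers primitive construction to the first invocation, reads the scale then, and reuses the built primitive on every later call.

#include <functional>
#include <iostream>
#include <vector>

// Minimal sketch of the deferred-build pattern (illustrative names only).
struct RuntimeContext
{
    bool first_iteration = true;
    std::function<float(float)> primitive; // stands in for ctx->mkldnn_primitives[i]
};

int main()
{
    std::vector<float> scale_tensor = {0.5f}; // written by the framework at run time

    auto functor = [&](RuntimeContext& ctx, float x) {
        if (ctx.first_iteration)
        {
            // Read the dynamic scale and build the "primitive" once; the real
            // code calls attr.set_output_scales() and constructs the MKLDNN
            // convolution/reorder here.
            float scale = scale_tensor[0];
            ctx.primitive = [scale](float v) { return v * scale; };
            ctx.first_iteration = false;
        }
        return ctx.primitive(x); // later iterations just execute
    };

    RuntimeContext ctx;
    std::cout << functor(ctx, 8.0f) << std::endl; // prints 4
}
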
@@ -45,6 +45,7 @@ namespace ngraph
const Strides& get_data_dilation_strides() const { return m_data_dilation_strides; }
std::shared_ptr<Node> get_filters() { return get_argument(1); }
std::shared_ptr<Node> get_data_batch() { return get_argument(0); }
bool with_relu() const { return true; }
virtual std::shared_ptr<Node>
    copy_with_new_args(const NodeVector& new_args) const override;
...
@@ -52,6 +52,46 @@ namespace ngraph
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto input_desc = mkldnn_utils::get_input_mkldnn_md(node, 0);
auto result_desc = mkldnn_utils::get_output_mkldnn_md(node, 0);
auto scale_const_op = std::dynamic_pointer_cast<ngraph::op::Constant>(
dequantize->get_argument(1));
std::vector<float> scales;
if (scale_const_op == nullptr)
{
auto& arg1_tensor = external_function->get_tensor_data(args[1].get_name());
auto scales_size = shape_size(args[1].get_shape());
size_t dequantize_index =
mkldnn_emitter->build_dequantization(node, input_desc, result_desc);
auto& deps = mkldnn_emitter->get_primitive_deps(dequantize_index);
functor = [&, input_desc, result_desc, scales_size, dequantize_index](
CPURuntimeContext* ctx, CPUExecutionContext* ectx) {
// Create MKLDNN reorder primitive during the first iteration.
// Assumes the scales don't change for the duration of the graph
if (ctx->first_iteration)
{
mkldnn::primitive_attr attr;
vector<float> dyn_scales;
dyn_scales.assign(static_cast<float*>(arg1_tensor),
static_cast<float*>(arg1_tensor) + scales_size);
attr.set_output_scales(0, dyn_scales);
attr.set_int_output_round_mode(mkldnn::round_mode::round_nearest);
auto reorder_desc = mkldnn::reorder::primitive_desc(
{input_desc, executor::global_cpu_engine},
{result_desc, executor::global_cpu_engine},
attr);
*ctx->mkldnn_primitives[dequantize_index] =
mkldnn::reorder(reorder_desc,
*ctx->mkldnn_primitives[deps[0]],
*ctx->mkldnn_primitives[deps[1]]);
}
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], out_tensor);
cpu::mkldnn_utils::mkldnn_invoke_primitive(ctx, dequantize_index);
};
functors.emplace_back(functor);
}
else
{
size_t dequantize_index =
    mkldnn_emitter->build_dequantization(node, input_desc, result_desc);
auto& deps = mkldnn_emitter->get_primitive_deps(dequantize_index);
@@ -63,6 +103,7 @@ namespace ngraph
};
functors.emplace_back(functor);
}
}
else
{
    auto& arg0_tensor = tensor_data[args[0].get_name()];
@@ -223,6 +264,7 @@ namespace ngraph
vector<float> dyn_scales;
dyn_scales.assign(static_cast<float*>(arg1_tensor),
                  static_cast<float*>(arg1_tensor) + scales_size);
dyn_scales[0] = 1.0 / dyn_scales[0];
attr.set_output_scales(0, dyn_scales);
attr.set_int_output_round_mode(mkldnn::round_mode::round_nearest);
auto reorder_desc = mkldnn::reorder::primitive_desc(
...
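
A note on the `dyn_scales[0] = 1.0 / dyn_scales[0];` line in the hunk above: MKLDNN's `set_output_scales` multiplies the result by the given factor, so the quantize path must pass the reciprocal. Quantization computes q = round(x / scale), i.e. a multiplication by 1/scale, whereas dequantization (the first hunk) multiplies by the scale directly and passes it unchanged.
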
@@ -19,6 +19,7 @@
#include "ngraph/op/experimental/quantized_conv_bias.hpp" #include "ngraph/op/experimental/quantized_conv_bias.hpp"
#include "ngraph/op/experimental/quantized_conv_relu.hpp" #include "ngraph/op/experimental/quantized_conv_relu.hpp"
#include "ngraph/runtime/cpu/cpu_builder.hpp" #include "ngraph/runtime/cpu/cpu_builder.hpp"
#include "ngraph/runtime/cpu/cpu_executor.hpp"
#include "ngraph/runtime/cpu/mkldnn_invoke.hpp" #include "ngraph/runtime/cpu/mkldnn_invoke.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp" #include "ngraph/runtime/cpu/mkldnn_utils.hpp"
@@ -39,17 +40,35 @@ namespace ngraph
auto& functors = external_function->get_functors();
auto& arg0_tensor = external_function->get_tensor_data(args[0].get_name());
auto& arg1_tensor = external_function->get_tensor_data(args[1].get_name());
auto& arg2_tensor = external_function->get_tensor_data(args[2].get_name());
auto& out0_tensor = external_function->get_tensor_data(out[0].get_name());
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto scales_size = shape_size(args[2].get_shape());
auto conv_desc =
    mkldnn_emitter
        ->get_convolution_forward_desc<ngraph::op::QuantizedConvolution>(
            node, args, out);
auto conv_attr =
mkldnn_emitter
->get_convolution_forward_attr<ngraph::op::QuantizedConvolution>(node);
size_t conv_index = mkldnn_emitter->convolution_forward_init();
auto& deps = mkldnn_emitter->get_primitive_deps(conv_index);
auto functor = [&, scales_size, conv_desc, conv_attr, deps, conv_index](
    CPURuntimeContext* ctx, CPUExecutionContext* ectx) mutable {
// Create MKLDNN convolution primitive during the first iteration.
// Assumes the scales don't change for the duration of the graph
if (ctx->first_iteration)
{
vector<float> dyn_scales;
dyn_scales.assign(static_cast<float*>(arg2_tensor),
static_cast<float*>(arg2_tensor) + scales_size);
conv_attr.set_output_scales(0, dyn_scales);
mkldnn_emitter->convolution_forward<false>(
conv_desc, conv_attr, executor::global_cpu_engine, conv_index);
}
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], arg1_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[2], out0_tensor);
@@ -71,17 +90,34 @@ namespace ngraph
auto& functors = external_function->get_functors();
auto& arg0_tensor = external_function->get_tensor_data(args[0].get_name());
auto& arg1_tensor = external_function->get_tensor_data(args[1].get_name());
auto& arg2_tensor = external_function->get_tensor_data(args[2].get_name());
auto& out0_tensor = external_function->get_tensor_data(out[0].get_name());
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto scales_size = shape_size(args[2].get_shape());
auto conv_desc =
    mkldnn_emitter
        ->get_convolution_forward_desc<ngraph::op::QuantizedConvolutionRelu>(
            node, args, out);
auto conv_attr =
mkldnn_emitter
->get_convolution_forward_attr<ngraph::op::QuantizedConvolutionRelu>(
node);
size_t conv_index = mkldnn_emitter->convolution_forward_init();
auto& deps = mkldnn_emitter->get_primitive_deps(conv_index);
auto functor = [&, scales_size, conv_desc, conv_attr, deps, conv_index](
    CPURuntimeContext* ctx, CPUExecutionContext* ectx) mutable {
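// Create MKLDNN convolution primitive during the first iteration.
// Assumes the scales don't change for the duration of the graph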
if (ctx->first_iteration)
{
vector<float> dyn_scales;
dyn_scales.assign(static_cast<float*>(arg2_tensor),
static_cast<float*>(arg2_tensor) + scales_size);
conv_attr.set_output_scales(0, dyn_scales);
mkldnn_emitter->convolution_forward<false>(
conv_desc, conv_attr, executor::global_cpu_engine, conv_index);
}
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], arg1_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[2], out0_tensor);
@@ -105,16 +141,34 @@ namespace ngraph
auto& arg0_tensor = external_function->get_tensor_data(args[0].get_name());
auto& arg1_tensor = external_function->get_tensor_data(args[1].get_name());
auto& arg2_tensor = external_function->get_tensor_data(args[2].get_name());
auto& arg3_tensor = external_function->get_tensor_data(args[3].get_name());
auto& out0_tensor = external_function->get_tensor_data(out[0].get_name());
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto scales_size = shape_size(args[3].get_shape());

auto conv_desc =
    mkldnn_emitter
        ->get_convolution_forward_desc<ngraph::op::QuantizedConvolutionBias>(
            node, args, out);
auto conv_attr =
mkldnn_emitter
->get_convolution_forward_attr<ngraph::op::QuantizedConvolutionBias>(
node);
size_t conv_index = mkldnn_emitter->convolution_forward_init(true);
auto& deps = mkldnn_emitter->get_primitive_deps(conv_index);
auto functor = [&, scales_size, conv_desc, conv_attr, deps, conv_index](
    CPURuntimeContext* ctx, CPUExecutionContext* ectx) mutable {
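// Create MKLDNN convolution primitive during the first iteration.
// Assumes the scales don't change for the duration of the graph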
if (ctx->first_iteration)
{
vector<float> dyn_scales;
dyn_scales.assign(static_cast<float*>(arg3_tensor),
static_cast<float*>(arg3_tensor) + scales_size);
conv_attr.set_output_scales(0, dyn_scales);
mkldnn_emitter->convolution_forward<true>(
conv_desc, conv_attr, executor::global_cpu_engine, conv_index);
}
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], arg1_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[2], arg2_tensor);
@@ -139,16 +193,58 @@ namespace ngraph
auto& arg0_tensor = external_function->get_tensor_data(args[0].get_name());
auto& arg1_tensor = external_function->get_tensor_data(args[1].get_name());
auto& arg2_tensor = external_function->get_tensor_data(args[2].get_name());
auto& arg4_tensor = external_function->get_tensor_data(args[4].get_name());
auto& arg5_tensor = external_function->get_tensor_data(args[5].get_name());
auto& out0_tensor = external_function->get_tensor_data(out[0].get_name());
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto scales_size = shape_size(args[4].get_shape());
auto sum_scales_size = shape_size(args[5].get_shape());

auto conv_desc =
    mkldnn_emitter
        ->get_convolution_forward_desc<ngraph::op::QuantizedConvolutionBiasAdd>(
            node, args, out);
auto conv_attr =
mkldnn_emitter
->get_convolution_forward_attr<ngraph::op::QuantizedConvolutionBiasAdd>(
node);
size_t conv_index = mkldnn_emitter->convolution_forward_init(true);
auto& deps = mkldnn_emitter->get_primitive_deps(conv_index);
auto functor =
    [&, scales_size, sum_scales_size, conv_desc, conv_attr, deps, conv_index](
        CPURuntimeContext* ctx, CPUExecutionContext* ectx) mutable {
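// Create MKLDNN convolution primitive during the first iteration.
// Assumes the scales don't change for the duration of the graph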
if (ctx->first_iteration)
{
vector<float> dyn_scales;
vector<float> dyn_post_op_scales;
dyn_scales.assign(static_cast<float*>(arg4_tensor),
static_cast<float*>(arg4_tensor) + scales_size);
dyn_post_op_scales.assign(static_cast<float*>(arg5_tensor),
static_cast<float*>(arg5_tensor) +
sum_scales_size);
auto old_pops = conv_attr.get_post_ops();
mkldnn::post_ops new_pops;
for (int i = 0; i < old_pops.len(); i++)
{
if (old_pops.kind(i) == mkldnn::primitive::kind::eltwise)
{
mkldnn::algorithm alg;
float scale, alpha, beta;
old_pops.get_params_eltwise(i, scale, alg, alpha, beta);
new_pops.append_eltwise(scale, alg, alpha, beta);
}
if (old_pops.kind(i) == mkldnn::primitive::kind::sum)
{
new_pops.append_sum(dyn_post_op_scales[0]);
}
}
conv_attr.set_output_scales(0, dyn_scales);
conv_attr.set_post_ops(new_pops);
mkldnn_emitter->convolution_forward<true>(
conv_desc, conv_attr, executor::global_cpu_engine, conv_index);
}
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], arg1_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[2], arg2_tensor);
@@ -173,17 +269,54 @@ namespace ngraph
auto& arg0_tensor = external_function->get_tensor_data(args[0].get_name());
auto& arg1_tensor = external_function->get_tensor_data(args[1].get_name());
auto& arg2_tensor = external_function->get_tensor_data(args[2].get_name());
auto& arg4_tensor = external_function->get_tensor_data(args[4].get_name());
auto& arg5_tensor = external_function->get_tensor_data(args[5].get_name());
auto& out0_tensor = external_function->get_tensor_data(out[0].get_name());
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto scales_size = shape_size(args[4].get_shape());
auto sum_scales_size = shape_size(args[5].get_shape());

auto conv_desc = mkldnn_emitter->get_convolution_forward_desc<
    ngraph::op::QuantizedConvolutionBiasSignedAdd>(node, args, out);
auto conv_attr = mkldnn_emitter->get_convolution_forward_attr<
    ngraph::op::QuantizedConvolutionBiasSignedAdd>(node);
size_t conv_index = mkldnn_emitter->convolution_forward_init(true);
auto& deps = mkldnn_emitter->get_primitive_deps(conv_index);
auto functor =
    [&, scales_size, sum_scales_size, conv_desc, conv_attr, deps, conv_index](
        CPURuntimeContext* ctx, CPUExecutionContext* ectx) mutable {
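// Create MKLDNN convolution primitive during the first iteration.
// Assumes the scales don't change for the duration of the graph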
if (ctx->first_iteration)
{
vector<float> dyn_scales;
vector<float> dyn_post_op_scales;
dyn_scales.assign(static_cast<float*>(arg4_tensor),
static_cast<float*>(arg4_tensor) + scales_size);
dyn_post_op_scales.assign(static_cast<float*>(arg5_tensor),
static_cast<float*>(arg5_tensor) +
sum_scales_size);
auto old_pops = conv_attr.get_post_ops();
mkldnn::post_ops new_pops;
for (int i = 0; i < old_pops.len(); i++)
{
if (old_pops.kind(i) == mkldnn::primitive::kind::eltwise)
{
mkldnn::algorithm alg;
float scale, alpha, beta;
old_pops.get_params_eltwise(i, scale, alg, alpha, beta);
new_pops.append_eltwise(scale, alg, alpha, beta);
}
if (old_pops.kind(i) == mkldnn::primitive::kind::sum)
{
new_pops.append_sum(2 * dyn_post_op_scales[0]);
}
}
conv_attr.set_post_ops(new_pops);
conv_attr.set_output_scales(0, dyn_scales);
mkldnn_emitter->convolution_forward<true>(
conv_desc, conv_attr, executor::global_cpu_engine, conv_index);
}
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], arg1_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[2], arg2_tensor);
...
@@ -148,10 +148,14 @@ size_t MKLDNNEmitter::build_dequantization(const ngraph::Node* node,
{
    auto dequantize = static_cast<const ngraph::op::Dequantize*>(node);
    auto scale_const_op =
        std::dynamic_pointer_cast<ngraph::op::Constant>(dequantize->get_argument(1));
    std::vector<float> scale = {1.0f};
    if (scale_const_op != nullptr)
    {
        scale = scale_const_op->get_vector<float>();
    }
    std::vector<float> scales;
    scales.push_back(scale[0]);
    size_t dequantize_index = 0;
    dequantize_index = this->build_quantize_reorder(input_desc, result_desc, scales);
    return dequantize_index;
@@ -1203,3 +1207,21 @@ size_t MKLDNNEmitter::build_bounded_relu(const mkldnn::memory::desc& input_desc,
    m_primitive_deps[primitive_index] = {input_index, result_index};
    return primitive_index;
}
size_t MKLDNNEmitter::convolution_forward_init(bool with_bias)
{
size_t size = m_mkldnn_primitives.size();
if (with_bias)
{
// Inputs, Weights, Bias, Results, Conv
m_mkldnn_primitives.resize(size + 5, nullptr);
m_primitive_deps[m_mkldnn_primitives.size() - 1] = {size, size + 1, size + 2, size + 3};
}
else
{
// Inputs, Weights, Results, Conv
m_mkldnn_primitives.resize(size + 4, nullptr);
m_primitive_deps[m_mkldnn_primitives.size() - 1] = {size, size + 1, size + 2};
}
return m_mkldnn_primitives.size() - 1;
}
@@ -30,6 +30,7 @@
#include "ngraph/op/experimental/quantized_conv.hpp" #include "ngraph/op/experimental/quantized_conv.hpp"
#include "ngraph/op/experimental/quantized_conv_bias.hpp" #include "ngraph/op/experimental/quantized_conv_bias.hpp"
#include "ngraph/op/experimental/quantized_conv_relu.hpp" #include "ngraph/op/experimental/quantized_conv_relu.hpp"
#include "ngraph/runtime/cpu/cpu_executor.hpp"
#include "ngraph/runtime/cpu/cpu_tensor_view_wrapper.hpp" #include "ngraph/runtime/cpu/cpu_tensor_view_wrapper.hpp"
#include "ngraph/runtime/cpu/mkldnn_utils.hpp" #include "ngraph/runtime/cpu/mkldnn_utils.hpp"
#include "ngraph/runtime/cpu/op/bounded_relu.hpp" #include "ngraph/runtime/cpu/op/bounded_relu.hpp"
@@ -40,6 +41,8 @@
#include "ngraph/strides.hpp" #include "ngraph/strides.hpp"
#include "ngraph/type/element_type.hpp" #include "ngraph/type/element_type.hpp"
#define MKLDNN_DIMS(X) mkldnn::memory::dims(X.begin(), X.end())
namespace ngraph
{
    namespace runtime
@@ -135,15 +138,15 @@ namespace ngraph
template <typename OP>
std::vector<float> extract_scale_value(const ngraph::Node* node, int index)
{
    auto qc = static_cast<const OP*>(node);
    std::vector<float> scale_val = {1.0f};
    auto scale_const_op =
        std::dynamic_pointer_cast<ngraph::op::Constant>(qc->get_arguments()[index]);
    if (scale_const_op != nullptr)
    {
        scale_val = scale_const_op->template get_vector<float>();
    }
    return scale_val;
}
@@ -197,53 +200,7 @@ namespace ngraph
    ops.append_sum(2.0 * sum_scale_val[0]);
}
if (has_relu<OP>(node))
{
    const float ops_scale = 1.f;
    const float ops_alpha = -0.f; // relu negative slope
@@ -626,6 +583,244 @@ namespace ngraph
size_t build_quantize_reorder(const mkldnn::memory::desc& input_desc,
                              const mkldnn::memory::desc& result_desc,
                              const std::vector<float>& scales);
template <typename OP>
size_t get_scale_index()
{
if (std::is_same<OP, ngraph::op::QuantizedConvolution>() ||
std::is_same<OP, ngraph::op::QuantizedConvolutionRelu>())
{
return 2;
}
if (std::is_same<OP, ngraph::op::QuantizedConvolutionBias>())
{
return 3;
}
if (std::is_same<OP, ngraph::op::QuantizedConvolutionBiasAdd>() ||
std::is_same<OP, ngraph::op::QuantizedConvolutionBiasSignedAdd>())
{
return 4;
    }
    throw ngraph_error("get_scale_index: unsupported op type");
}
template <typename OP, typename T>
std::vector<T> get_output_scale(const ngraph::Node* node)
{
auto index = get_scale_index<OP>();
std::vector<T> scale_val = {0};
auto scale_const_op = std::dynamic_pointer_cast<ngraph::op::Constant>(
node->get_arguments()[index]);
if (scale_const_op != nullptr)
{
scale_val = scale_const_op->template get_vector<T>();
}
return scale_val;
}
template <typename OP,
typename std::enable_if<
(std::is_same<OP, ngraph::op::Convolution>::value ||
std::is_same<OP, ngraph::op::QuantizedConvolution>::value),
std::nullptr_t>::type = nullptr>
bool has_relu(const ngraph::Node* node)
{
return false;
}
template <typename OP,
typename std::enable_if<
(!std::is_same<OP, ngraph::op::Convolution>::value &&
!std::is_same<OP, ngraph::op::QuantizedConvolution>::value),
std::nullptr_t>::type = nullptr>
bool has_relu(const ngraph::Node* node)
{
return static_cast<const OP*>(node)->with_relu();
}
template <typename OP>
bool has_bias()
{
if (std::is_same<OP, ngraph::op::ConvolutionBias>() ||
std::is_same<OP, ngraph::op::ConvolutionBiasAdd>() ||
std::is_same<OP, ngraph::op::QuantizedConvolutionBias>() ||
std::is_same<OP, ngraph::op::QuantizedConvolutionBiasAdd>() ||
std::is_same<OP, ngraph::op::QuantizedConvolutionBiasSignedAdd>())
{
return true;
}
else
{
return false;
}
}
template <typename OP>
bool is_quantized_conv()
{
if (std::is_same<OP, ngraph::op::QuantizedConvolution>() ||
std::is_same<OP, ngraph::op::QuantizedConvolutionRelu>() ||
std::is_same<OP, ngraph::op::QuantizedConvolutionBias>() ||
std::is_same<OP, ngraph::op::QuantizedConvolutionBiasAdd>() ||
std::is_same<OP, ngraph::op::QuantizedConvolutionBiasSignedAdd>())
{
return true;
}
else
{
return false;
}
}
template <typename OP>
mkldnn::convolution_forward::desc
get_convolution_forward_desc(const ngraph::Node* node,
const std::vector<TensorViewWrapper>& args,
const std::vector<TensorViewWrapper>& out)
{
auto convolution = static_cast<const OP*>(node);
// For dilation, MKLDNN wants to know how many elements to insert between, not how far
// apart to space the elements like nGraph. So we have to subtract 1 from each pos.
Strides window_dilation_strides_adjusted;
for (size_t s : convolution->get_window_dilation_strides())
{
window_dilation_strides_adjusted.push_back(s - 1);
}
auto data_desc = mkldnn_utils::get_input_mkldnn_md(node, 0);
auto weights_desc = mkldnn_utils::get_input_mkldnn_md(node, 1);
// MKLDNN relies on named formats for kernel selection
if (weights_desc.data.format == mkldnn_nchw)
weights_desc.data.format = mkldnn_oihw;
if (weights_desc.data.format == mkldnn_ncdhw)
weights_desc.data.format = mkldnn_oidhw;
auto result_desc = mkldnn_utils::get_output_mkldnn_md(node, 0);
if (has_bias<OP>())
{
auto bias_desc = mkldnn_utils::get_input_mkldnn_md(node, 2);
return mkldnn::convolution_forward::desc(
mkldnn::prop_kind::forward,
mkldnn::algorithm::convolution_direct,
data_desc,
weights_desc,
bias_desc,
result_desc,
MKLDNN_DIMS(convolution->get_window_movement_strides()),
MKLDNN_DIMS(window_dilation_strides_adjusted),
MKLDNN_DIMS(convolution->get_padding_below()),
MKLDNN_DIMS(convolution->get_padding_above()),
mkldnn::padding_kind::zero);
}
else
{
return mkldnn::convolution_forward::desc(
mkldnn::prop_kind::forward,
mkldnn::algorithm::convolution_direct,
data_desc,
weights_desc,
result_desc,
MKLDNN_DIMS(convolution->get_window_movement_strides()),
MKLDNN_DIMS(window_dilation_strides_adjusted),
MKLDNN_DIMS(convolution->get_padding_below()),
MKLDNN_DIMS(convolution->get_padding_above()),
mkldnn::padding_kind::zero);
}
}
template <typename OP>
mkldnn::primitive_attr get_convolution_forward_attr(const ngraph::Node* node)
{
mkldnn::post_ops ops;
if (std::is_same<OP, ngraph::op::ConvolutionBiasAdd>() ||
std::is_same<OP, ngraph::op::ConvolutionAdd>())
{
ops.append_sum(1.f);
}
if (std::is_same<OP, ngraph::op::QuantizedConvolutionBiasAdd>())
{
auto sum_scale_val =
extract_scale_value<ngraph::op::QuantizedConvolutionBiasAdd>(node, 5);
ops.append_sum(sum_scale_val[0]);
}
if (std::is_same<OP, ngraph::op::QuantizedConvolutionBiasSignedAdd>())
{
auto sum_scale_val =
extract_scale_value<ngraph::op::QuantizedConvolutionBiasSignedAdd>(node,
5);
ops.append_sum(2.0 * sum_scale_val[0]);
}
if (has_relu<OP>(node))
{
const float ops_scale = 1.f;
const float ops_alpha = -0.f; // relu negative slope
const float ops_beta = 0.f;
ops.append_eltwise(
ops_scale, mkldnn::algorithm::eltwise_relu, ops_alpha, ops_beta);
}
mkldnn::primitive_attr conv_attr;
conv_attr.set_post_ops(ops);
if (is_quantized_conv<OP>())
{
conv_attr.set_int_output_round_mode(mkldnn::round_mode::round_nearest);
conv_attr.set_output_scales(0, get_output_scale<OP, float>(node));
}
return conv_attr;
}
size_t convolution_forward_init(bool with_bias = false);
template <bool with_bias>
void convolution_forward(const mkldnn::convolution_forward::desc& desc,
const mkldnn::primitive_attr& attr,
const mkldnn::engine& engine,
size_t& conv_idx)
{
size_t input_idx, weights_idx, results_idx, bias_idx;
input_idx = m_primitive_deps[conv_idx][0];
weights_idx = m_primitive_deps[conv_idx][1];
m_mkldnn_primitives[input_idx] =
new mkldnn::memory({{desc.data.src_desc}, engine}, nullptr);
m_mkldnn_primitives[weights_idx] =
new mkldnn::memory({{desc.data.weights_desc}, engine}, nullptr);
if (with_bias)
{
bias_idx = m_primitive_deps[conv_idx][2];
results_idx = m_primitive_deps[conv_idx][3];
m_mkldnn_primitives[bias_idx] =
new mkldnn::memory({{desc.data.bias_desc}, engine}, nullptr);
}
else
{
results_idx = m_primitive_deps[conv_idx][2];
}
m_mkldnn_primitives[results_idx] =
new mkldnn::memory({{desc.data.dst_desc}, engine}, nullptr);
mkldnn::primitive* prim;
if (with_bias)
{
prim = new mkldnn::convolution_forward({desc, attr, engine},
*m_mkldnn_primitives[input_idx],
*m_mkldnn_primitives[weights_idx],
*m_mkldnn_primitives[bias_idx],
*m_mkldnn_primitives[results_idx]);
}
else
{
prim = new mkldnn::convolution_forward({desc, attr, engine},
*m_mkldnn_primitives[input_idx],
*m_mkldnn_primitives[weights_idx],
*m_mkldnn_primitives[results_idx]);
}
m_mkldnn_primitives[conv_idx] = prim;
}
private:
    std::vector<mkldnn::primitive*> m_mkldnn_primitives;
...
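
Taken together, the new emitter pieces split primitive creation into three steps. A condensed view of how the builders earlier in this diff use them (the QuantizedConvolutionBias case; the surrounding lambda and error handling omitted):

// 1. Reserve primitive/memory slots and obtain the primitive index up front.
size_t conv_index = mkldnn_emitter->convolution_forward_init(/*with_bias=*/true);
// 2. Compute the scale-independent descriptor and default attributes at build time.
auto conv_desc = mkldnn_emitter
    ->get_convolution_forward_desc<ngraph::op::QuantizedConvolutionBias>(node, args, out);
auto conv_attr = mkldnn_emitter
    ->get_convolution_forward_attr<ngraph::op::QuantizedConvolutionBias>(node);
// 3. On the first iteration, after patching conv_attr with the scales read
//    from the input tensors, materialize the primitive into the reserved slot.
mkldnn_emitter->convolution_forward<true>(
    conv_desc, conv_attr, executor::global_cpu_engine, conv_index);
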
@@ -45,6 +45,7 @@ namespace ngraph
const Strides& get_data_dilation_strides() const { return m_data_dilation_strides; }
std::shared_ptr<Node> get_filters() { return get_argument(1); }
std::shared_ptr<Node> get_data_batch() { return get_argument(0); }
bool with_relu() const { return true; }
virtual std::shared_ptr<Node>
    copy_with_new_args(const NodeVector& new_args) const override;
...
@@ -187,6 +187,59 @@ TEST(builder, scaled_QC)
              read_vector<int8_t>(result));
}
TEST(builder, dynamic_scaled_QC)
{
Shape shape_a{1, 1, 3, 4}; // input shape
Shape shape_b{1, 1, 3, 3}; // filter shape
Shape shape_r{1, 1, 3, 4}; // output shape
vector<uint8_t> a_data = {1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4};
vector<int8_t> b_data = {1, 2, 3, 4, 5, 0, 0, 1, 2};
auto A = make_shared<op::Parameter>(element::u8, shape_a);
auto B = make_shared<op::Parameter>(element::i8, shape_b);
auto C = make_shared<op::Parameter>(element::f32, Shape{1});
auto D = make_shared<op::Parameter>(element::f32, Shape{1});
auto E = make_shared<op::Parameter>(element::f32, Shape{1});
auto F = make_shared<op::Parameter>(element::f32, Shape{1});
auto G = make_shared<op::Parameter>(element::f32, Shape{1});
auto H = make_shared<op::Parameter>(element::f32, Shape{1});
auto CV = ngraph::builder::ScaledQuantizedConvolution(A,
B,
Strides{1, 1}, // move_strides
Strides{1, 1}, // filter_dilation
CoordinateDiff{1, 1}, // below_pads
CoordinateDiff{1, 1}, // above_pads
Strides{1, 1}, // data_dilation
C,
D,
E,
F,
G,
H);
auto f = make_shared<Function>(NodeVector{CV}, ParameterVector{A, B, C, D, E, F, G, H});
auto backend = runtime::Backend::create("CPU");
// Create some tensors for input/output
auto a = backend->create_tensor(element::u8, shape_a);
copy_data(a, a_data);
auto b = backend->create_tensor(element::i8, shape_b);
copy_data(b, b_data);
auto d = backend->create_tensor(element::f32, Shape{1});
copy_data(d, vector<float>{0.0f});
auto e = backend->create_tensor(element::f32, Shape{1});
copy_data(e, vector<float>{255.0f});
auto e_a = backend->create_tensor(element::f32, Shape{1});
copy_data(e_a, vector<float>{-127.0f});
auto g = backend->create_tensor(element::f32, Shape{1});
copy_data(g, vector<float>{127.0f});
auto h = backend->create_tensor(element::f32, Shape{1});
copy_data(h, vector<float>{22.0f});
auto i = backend->create_tensor(element::f32, Shape{1});
copy_data(i, vector<float>{90.0f});
auto result = backend->create_tensor(element::i8, shape_r);
backend->call_with_validate(backend->compile(f), {result}, {a, b, d, e, e_a, g, h, i});
EXPECT_EQ((vector<int8_t>{31, 48, 42, 45, 54, 102, 127, 61, 47, 74, 61, 55}),
read_vector<int8_t>(result));
}
TEST(builder, scaled_QC_with_relu)
{
    Shape shape_a{1, 1, 3, 3}; // input shape
@@ -229,6 +282,58 @@ TEST(builder, scaled_QC_with_relu)
    EXPECT_EQ((vector<uint8_t>{0, 0, 0, 0, 0, 0, 138, 212, 181}), read_vector<uint8_t>(result));
}
TEST(builder, dynamic_scaled_QC_with_relu)
{
Shape shape_a{1, 1, 3, 3}; // input shape
Shape shape_b{1, 1, 3, 3}; // filter shape
Shape shape_r{1, 1, 3, 3}; // output shape
vector<uint8_t> a_data = {1, 2, 3, 4, 5, 6, 7, 8, 9};
vector<int8_t> b_data = {1, 2, 1, 0, 0, 0, -1, -2, -1};
auto A = make_shared<op::Parameter>(element::u8, shape_a);
auto B = make_shared<op::Parameter>(element::i8, shape_b);
auto C = make_shared<op::Parameter>(element::f32, Shape{1});
auto D = make_shared<op::Parameter>(element::f32, Shape{1});
auto E = make_shared<op::Parameter>(element::f32, Shape{1});
auto F = make_shared<op::Parameter>(element::f32, Shape{1});
auto G = make_shared<op::Parameter>(element::f32, Shape{1});
auto H = make_shared<op::Parameter>(element::f32, Shape{1});
auto CV = ngraph::builder::ScaledQuantizedConvolutionRelu(A,
B,
Strides{1, 1}, // move_strides
Strides{1, 1}, // filter_dilation
CoordinateDiff{1, 1}, // below_pads
CoordinateDiff{1, 1}, // above_pads
Strides{1, 1}, // data_dilation
C,
D,
E,
F,
G,
H);
auto f = make_shared<Function>(NodeVector{CV}, ParameterVector{A, B, C, D, E, F, G, H});
auto backend = runtime::Backend::create("CPU");
// Create some tensors for input/output
auto a = backend->create_tensor(element::u8, shape_a);
copy_data(a, a_data);
auto b = backend->create_tensor(element::i8, shape_b);
copy_data(b, b_data);
auto d = backend->create_tensor(element::f32, Shape{1});
copy_data(d, vector<float>{0.0f});
auto e = backend->create_tensor(element::f32, Shape{1});
copy_data(e, vector<float>{255.0f});
auto e_a = backend->create_tensor(element::f32, Shape{1});
copy_data(e_a, vector<float>{-127.0f});
auto g = backend->create_tensor(element::f32, Shape{1});
copy_data(g, vector<float>{127.0f});
auto h = backend->create_tensor(element::f32, Shape{1});
copy_data(h, vector<float>{20.0f});
auto i = backend->create_tensor(element::f32, Shape{1});
copy_data(i, vector<float>{-24.0f});
auto result = backend->create_tensor(element::u8, shape_r);
backend->call_with_validate(backend->compile(f), {result}, {a, b, d, e, e_a, g, h, i});
EXPECT_EQ((vector<uint8_t>{0, 0, 0, 0, 0, 0, 138, 212, 181}), read_vector<uint8_t>(result));
}
TEST(builder, scaled_QC_with_bias)
{
    Shape shape_a{1, 1, 3, 4}; // input shape
@@ -277,6 +382,64 @@ TEST(builder, scaled_QC_with_bias)
              read_vector<int8_t>(result));
}
TEST(builder, dynamic_scaled_QC_with_bias)
{
Shape shape_a{1, 1, 3, 4}; // input shape
Shape shape_b{1, 1, 3, 3}; // filter shape
Shape shape_r{1, 1, 3, 4}; // output shape
vector<uint8_t> a_data = {1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4};
vector<int8_t> b_data = {1, 2, 3, 4, 5, 0, 0, 1, 2};
vector<int32_t> c_data = {5};
auto A = make_shared<op::Parameter>(element::u8, shape_a);
auto B = make_shared<op::Parameter>(element::i8, shape_b);
auto Bias = make_shared<op::Parameter>(element::i32, Shape{1});
auto C = make_shared<op::Parameter>(element::f32, Shape{1});
auto D = make_shared<op::Parameter>(element::f32, Shape{1});
auto E = make_shared<op::Parameter>(element::f32, Shape{1});
auto F = make_shared<op::Parameter>(element::f32, Shape{1});
auto G = make_shared<op::Parameter>(element::f32, Shape{1});
auto H = make_shared<op::Parameter>(element::f32, Shape{1});
auto CV = ngraph::builder::ScaledQuantizedConvolutionBias(A,
B,
Bias,
Strides{1, 1}, // move_strides
Strides{1, 1}, // filter_dilation
CoordinateDiff{1, 1}, // below_pads
CoordinateDiff{1, 1}, // above_pads
Strides{1, 1}, // data_dilation
C,
D,
E,
F,
G,
H);
auto f = make_shared<Function>(NodeVector{CV}, ParameterVector{A, B, Bias, C, D, E, F, G, H});
auto backend = runtime::Backend::create("CPU");
// Create some tensors for input/output
auto a = backend->create_tensor(element::u8, shape_a);
copy_data(a, a_data);
auto b = backend->create_tensor(element::i8, shape_b);
copy_data(b, b_data);
auto c = backend->create_tensor(element::i32, Shape{1});
copy_data(c, c_data);
auto d = backend->create_tensor(element::f32, Shape{1});
copy_data(d, vector<float>{0.0f});
auto e = backend->create_tensor(element::f32, Shape{1});
copy_data(e, vector<float>{255.0f});
auto e_a = backend->create_tensor(element::f32, Shape{1});
copy_data(e_a, vector<float>{-127.0f});
auto g = backend->create_tensor(element::f32, Shape{1});
copy_data(g, vector<float>{127.0f});
auto h = backend->create_tensor(element::f32, Shape{1});
copy_data(h, vector<float>{22.0f});
auto i = backend->create_tensor(element::f32, Shape{1});
copy_data(i, vector<float>{90.0f});
auto result = backend->create_tensor(element::i8, shape_r);
backend->call_with_validate(backend->compile(f), {result}, {a, b, c, d, e, e_a, g, h, i});
EXPECT_EQ((vector<int8_t>{38, 55, 50, 52, 61, 109, 127, 68, 54, 81, 68, 62}),
read_vector<int8_t>(result));
}
TEST(builder, scaled_QC_with_bias_and_relu)
{
    Shape shape_a{1, 1, 3, 3}; // input shape
@@ -383,6 +546,80 @@ TEST(builder, scaled_QC_with_bias_add_and_relu)
              read_vector<uint8_t>(result));
}
TEST(builder, dynamic_scaled_QC_with_bias_add_and_relu)
{
Shape shape_a{1, 1, 3, 4}; // input shape
Shape shape_b{1, 1, 3, 3}; // filter shape
Shape shape_r{1, 1, 3, 4}; // output shape
vector<uint8_t> a_data = {1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4};
vector<int8_t> b_data = {1, 2, 3, 4, 5, 0, 0, 1, 2};
vector<int32_t> c_data = {5};
vector<uint8_t> conv_2_data = {1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4};
auto A = make_shared<op::Parameter>(element::u8, shape_a);
auto B = make_shared<op::Parameter>(element::i8, shape_b);
auto Add = make_shared<op::Parameter>(element::u8, shape_a);
auto Bias = make_shared<op::Parameter>(element::i32, Shape{1});
auto C = make_shared<op::Parameter>(element::f32, Shape{1});
auto D = make_shared<op::Parameter>(element::f32, Shape{1});
auto E = make_shared<op::Parameter>(element::f32, Shape{1});
auto F = make_shared<op::Parameter>(element::f32, Shape{1});
auto G = make_shared<op::Parameter>(element::f32, Shape{1});
auto H = make_shared<op::Parameter>(element::f32, Shape{1});
auto I = make_shared<op::Parameter>(element::f32, Shape{1});
auto J = make_shared<op::Parameter>(element::f32, Shape{1});
auto CV = ngraph::builder::ScaledQuantizedConvolutionBiasAdd(A,
B,
Bias,
Add,
Strides{1, 1}, // move_strides
Strides{1, 1}, // filter_dilation
CoordinateDiff{1, 1}, // below_pads
CoordinateDiff{1, 1}, // above_pads
Strides{1, 1}, // data_dilation
C,
D,
E,
F,
G,
H,
I,
J,
true);
auto f = make_shared<Function>(NodeVector{CV},
ParameterVector{A, B, Bias, Add, C, D, E, F, G, H, I, J});
auto backend = runtime::Backend::create("CPU");
// Create some tensors for input/output
auto a = backend->create_tensor(element::u8, shape_a);
copy_data(a, a_data);
auto b = backend->create_tensor(element::i8, shape_b);
copy_data(b, b_data);
auto c = backend->create_tensor(element::i32, Shape{1});
copy_data(c, c_data);
auto d = backend->create_tensor(element::u8, shape_a);
copy_data(d, conv_2_data);
auto e = backend->create_tensor(element::f32, Shape{1});
copy_data(e, vector<float>{0.0f});
auto e_a = backend->create_tensor(element::f32, Shape{1});
copy_data(e_a, vector<float>{255.0f});
auto g = backend->create_tensor(element::f32, Shape{1});
copy_data(g, vector<float>{-127.0f});
auto h = backend->create_tensor(element::f32, Shape{1});
copy_data(h, vector<float>{127.0f});
auto i = backend->create_tensor(element::f32, Shape{1});
copy_data(i, vector<float>{22.0f});
auto j = backend->create_tensor(element::f32, Shape{1});
copy_data(j, vector<float>{90.0f});
auto k = backend->create_tensor(element::f32, Shape{1});
copy_data(k, vector<float>{22.0f});
auto l = backend->create_tensor(element::f32, Shape{1});
copy_data(l, vector<float>{180.0f});
auto result = backend->create_tensor(element::u8, shape_r);
auto handle = backend->compile(f);
backend->call_with_validate(handle, {result}, {a, b, c, d, e, e_a, g, h, i, j, k, l});
EXPECT_EQ((vector<uint8_t>{78, 114, 105, 113, 132, 230, 255, 136, 110, 165, 142, 133}),
read_vector<uint8_t>(result));
}
TEST(builder, scaled_QC_with_bias_signed_add_and_relu)
{
    Shape shape_a{1, 1, 3, 4}; // input shape
@@ -442,6 +679,81 @@ TEST(builder, scaled_QC_with_bias_signed_add_and_relu)
              read_vector<uint8_t>(result));
}
TEST(builder, dynamic_scaled_QC_with_bias_signed_add_and_relu)
{
Shape shape_a{1, 1, 3, 4}; // input shape
Shape shape_b{1, 1, 3, 3}; // filter shape
Shape shape_r{1, 1, 3, 4}; // output shape
vector<uint8_t> a_data = {1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4};
vector<int8_t> b_data = {1, 2, 3, 4, 5, 0, 0, 1, 2};
vector<int32_t> c_data = {5};
vector<int8_t> conv_2_data = {-1, -2, -3, -4, -5, -6, -10, 0, 1, 2, 3, 4};
auto A = make_shared<op::Parameter>(element::u8, shape_a);
auto B = make_shared<op::Parameter>(element::i8, shape_b);
auto Add = make_shared<op::Parameter>(element::i8, shape_a);
auto Bias = make_shared<op::Parameter>(element::i32, Shape{1});
auto C = make_shared<op::Parameter>(element::f32, Shape{1});
auto D = make_shared<op::Parameter>(element::f32, Shape{1});
auto E = make_shared<op::Parameter>(element::f32, Shape{1});
auto F = make_shared<op::Parameter>(element::f32, Shape{1});
auto G = make_shared<op::Parameter>(element::f32, Shape{1});
auto H = make_shared<op::Parameter>(element::f32, Shape{1});
auto I = make_shared<op::Parameter>(element::f32, Shape{1});
auto J = make_shared<op::Parameter>(element::f32, Shape{1});
auto CV =
ngraph::builder::ScaledQuantizedConvolutionBiasSignedAdd(A,
B,
Bias,
Add,
Strides{1, 1}, // move_strides
Strides{1, 1}, // filter_dilation
CoordinateDiff{1, 1}, // below_pads
CoordinateDiff{1, 1}, // above_pads
Strides{1, 1}, // data_dilation
C,
D,
E,
F,
G,
H,
I,
J,
true);
auto f = make_shared<Function>(NodeVector{CV},
ParameterVector{A, B, Bias, Add, C, D, E, F, G, H, I, J});
auto backend = runtime::Backend::create("CPU");
// Create some tensors for input/output
auto a = backend->create_tensor(element::u8, shape_a);
copy_data(a, a_data);
auto b = backend->create_tensor(element::i8, shape_b);
copy_data(b, b_data);
auto c = backend->create_tensor(element::i32, Shape{1});
copy_data(c, c_data);
auto d = backend->create_tensor(element::i8, shape_a);
copy_data(d, conv_2_data);
auto e = backend->create_tensor(element::f32, Shape{1});
copy_data(e, vector<float>{0.0f});
auto e_a = backend->create_tensor(element::f32, Shape{1});
copy_data(e_a, vector<float>{255.0f});
auto g = backend->create_tensor(element::f32, Shape{1});
copy_data(g, vector<float>{-127.0f});
auto h = backend->create_tensor(element::f32, Shape{1});
copy_data(h, vector<float>{127.0f});
auto i = backend->create_tensor(element::f32, Shape{1});
copy_data(i, vector<float>{22.0f});
auto j = backend->create_tensor(element::f32, Shape{1});
copy_data(j, vector<float>{90.0f});
auto k = backend->create_tensor(element::f32, Shape{1});
copy_data(k, vector<float>{22.0f});
auto l = backend->create_tensor(element::f32, Shape{1});
copy_data(l, vector<float>{90.0f});
auto result = backend->create_tensor(element::u8, shape_r);
auto handle = backend->compile(f);
backend->call_with_validate(handle, {result}, {a, b, c, d, e, e_a, g, h, i, j, k, l});
EXPECT_EQ((vector<uint8_t>{76, 110, 99, 105, 122, 218, 255, 136, 110, 165, 142, 133}),
read_vector<uint8_t>(result));
}
TEST(builder, scaled_QC_with_f32_bias_and_relu)
{
    Shape shape_a{1, 1, 3, 3}; // input shape
@@ -512,29 +824,72 @@ TEST(builder, scaled_Q_unsigned)
    EXPECT_EQ((vector<uint8_t>{0, 0, 1, 1, 2, 64, 127, 255}), read_vector<uint8_t>(result));
}
TEST(builder, dynamic_scaled_Q)
{
    auto call_SQ = [](unique_ptr<runtime::Backend>& backend,
                      element::Type type,
                      op::Quantize::RoundMode mode,
                      Shape in_shape,
                      vector<float> in,
                      float min,
                      float max) {
        auto A = make_shared<op::Parameter>(element::f32, in_shape);
        auto B = make_shared<op::Parameter>(element::f32, Shape{});
        auto C = make_shared<op::Parameter>(element::f32, Shape{});
        auto QT = ngraph::builder::ScaledQuantize(A, B, C, type, AxisSet{}, mode);
        auto f = make_shared<Function>(NodeVector{QT}, ParameterVector{A, B, C});
        // Create some tensors for input/output
        auto a = backend->create_tensor(element::f32, in_shape);
        auto b = backend->create_tensor(element::f32, Shape{});
        auto c = backend->create_tensor(element::f32, Shape{});
        copy_data(a, in);
        copy_data(b, vector<float>{min});
        copy_data(c, vector<float>{max});
        auto result = backend->create_tensor(type, in_shape);
        backend->call_with_validate(backend->compile(f), {result}, {a, b, c});
        return result;
    };
auto backend = runtime::Backend::create("CPU");
auto result = call_SQ(backend,
element::u8,
op::Quantize::RoundMode::ROUND_NEAREST_TOWARD_EVEN,
Shape{8},
vector<float>{-255.0, 0.0, 1.0, 1.25, 1.75, 64.0, 127.0, 500.0},
-255.0f,
127.0f);
    EXPECT_EQ((vector<uint8_t>{0, 0, 1, 1, 2, 64, 127, 255}), read_vector<uint8_t>(result));
auto result2 = call_SQ(backend,
element::u8,
op::Quantize::RoundMode::ROUND_NEAREST_TOWARD_EVEN,
Shape{8},
vector<float>{-85.0, 0.0, 2.0, 10.0, 15.0, 127.0, 64.0, 500.0},
-85.0f,
15.0f);
EXPECT_EQ((vector<uint8_t>{0, 0, 6, 30, 45, 255, 192, 255}), read_vector<uint8_t>(result2));
auto result3 = call_SQ(backend,
element::u8,
op::Quantize::RoundMode::ROUND_NEAREST_TOWARD_EVEN,
Shape{2, 2},
vector<float>{0.1392, 0.5928, 0.6027, 0.8579},
0.0f,
1.0f);
EXPECT_EQ((vector<uint8_t>{35, 151, 154, 219}), read_vector<uint8_t>(result3));
auto result4 = call_SQ(backend,
element::i8,
op::Quantize::RoundMode::ROUND_NEAREST_TOWARD_EVEN,
Shape{2, 4},
vector<float>{-1.3990955,
-1.468798,
-2.0760186,
0.17088544,
-0.0829789,
-0.3173087,
-0.5645172,
-0.3188769},
-2.0760186f,
0.17088544f);
EXPECT_EQ((vector<int8_t>{-86, -90, -127, 10, -5, -19, -35, -20}),
read_vector<int8_t>(result4));
} }
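
As a sanity check on the expected vectors above (assuming the usual ScaledQuantize convention of deriving the scale from the larger of |min| and |max| over the quantized type's range): for the second u8 case, min = -85 and max = 15 give a scale of 85/255 = 1/3, so 10 / (1/3) = 30 and 127 / (1/3) = 381, which clamps to 255 — consistent with {0, 0, 6, 30, 45, 255, 192, 255}.
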
TEST(builder, scaled_Q_signed)
@@ -578,3 +933,39 @@ TEST(builder, scaled_DQ_signed)
    backend->call_with_validate(handle, {result}, {a});
    EXPECT_EQ((vector<float>{99.212601}), read_vector<float>(result));
}
template <typename T>
shared_ptr<runtime::Tensor> call_SDQ(unique_ptr<runtime::Backend>& backend,
element::Type type,
Shape in_shape,
vector<T> in,
float min,
float max)
{
auto A = make_shared<op::Parameter>(type, in_shape);
auto B = make_shared<op::Parameter>(element::f32, Shape{});
auto C = make_shared<op::Parameter>(element::f32, Shape{});
auto DQT = ngraph::builder::ScaledDequantize(A, B, C, element::f32, AxisSet{});
auto f = make_shared<Function>(NodeVector{DQT}, ParameterVector{A, B, C});
// Create some tensors for input/output
auto a = backend->create_tensor(type, in_shape);
auto b = backend->create_tensor(element::f32, Shape{});
auto c = backend->create_tensor(element::f32, Shape{});
copy_data(a, in);
copy_data(b, vector<float>{min});
copy_data(c, vector<float>{max});
auto result = backend->create_tensor(element::f32, in_shape);
backend->call_with_validate(backend->compile(f), {result}, {a, b, c});
return result;
}
TEST(builder, dynamic_scaled_DQ)
{
auto backend = runtime::Backend::create("CPU");
auto result =
call_SDQ<int8_t>(backend, element::i8, Shape{1}, vector<int8_t>{42}, -1.0f, 300.0f);
EXPECT_EQ((vector<float>{99.212601}), read_vector<float>(result));
auto result2 = call_SDQ<uint8_t>(
backend, element::u8, Shape{2, 2}, vector<uint8_t>{35, 151, 154, 219}, 0.0f, 1.0f);
EXPECT_EQ((vector<float>{0.13725491, 0.59215689, 0.60392159, 0.8588236}),
read_vector<float>(result2));
}
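
The expected values here follow the same scale convention: for the i8 case the dequantization scale is max(|-1|, |300|) / 127 = 300/127 ≈ 2.3622, and 42 × 300/127 ≈ 99.2126, matching the expected output; for the u8 case the scale is 1/255, so 35 × 1/255 ≈ 0.137255.
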