Commit 5713b34d authored by Nishant Patel, committed by Robert Kimball

Fix for Qconv sum inplace (#2349)

* Fix in place

* Style

* Adding non in-place support for all fused convadds

* fixed quantized test case on GPU build
parent ce7e168a
...@@ -186,7 +186,9 @@ namespace ngraph ...@@ -186,7 +186,9 @@ namespace ngraph
auto& arg0_tensor = external_function->get_tensor_data(args[0].get_name()); auto& arg0_tensor = external_function->get_tensor_data(args[0].get_name());
auto& arg1_tensor = external_function->get_tensor_data(args[1].get_name()); auto& arg1_tensor = external_function->get_tensor_data(args[1].get_name());
auto& arg2_tensor = external_function->get_tensor_data(args[2].get_name()); auto& arg2_tensor = external_function->get_tensor_data(args[2].get_name());
auto& arg3_tensor = external_function->get_tensor_data(args[3].get_name());
auto& out_tensor = external_function->get_tensor_data(out[0].get_name()); auto& out_tensor = external_function->get_tensor_data(out[0].get_name());
size_t arg3_size = args[3].get_size();
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node)) if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{ {
...@@ -196,8 +198,14 @@ namespace ngraph ...@@ -196,8 +198,14 @@ namespace ngraph
node, args, out); node, args, out);
auto& deps = mkldnn_emitter->get_primitive_deps(conv_index); auto& deps = mkldnn_emitter->get_primitive_deps(conv_index);
auto functor = [&, conv_index](CPURuntimeContext* ctx, auto functor = [&, conv_index, arg3_size](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) { CPUExecutionContext* ectx) {
if (out_tensor != arg3_tensor)
{
memcpy(static_cast<char*>(out_tensor),
static_cast<char*>(arg3_tensor),
arg3_size);
}
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor); cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], arg1_tensor); cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], arg1_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[2], arg2_tensor); cpu::mkldnn_utils::set_memory_ptr(ctx, deps[2], arg2_tensor);
...@@ -219,7 +227,9 @@ namespace ngraph ...@@ -219,7 +227,9 @@ namespace ngraph
auto& arg0_tensor = external_function->get_tensor_data(args[0].get_name()); auto& arg0_tensor = external_function->get_tensor_data(args[0].get_name());
auto& arg1_tensor = external_function->get_tensor_data(args[1].get_name()); auto& arg1_tensor = external_function->get_tensor_data(args[1].get_name());
auto& arg2_tensor = external_function->get_tensor_data(args[2].get_name());
auto& out_tensor = external_function->get_tensor_data(out[0].get_name()); auto& out_tensor = external_function->get_tensor_data(out[0].get_name());
size_t arg2_size = args[2].get_size();
if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node)) if (runtime::cpu::mkldnn_utils::use_mkldnn_kernel(node))
{ {
...@@ -228,8 +238,14 @@ namespace ngraph ...@@ -228,8 +238,14 @@ namespace ngraph
node, args, out); node, args, out);
auto& deps = mkldnn_emitter->get_primitive_deps(conv_index); auto& deps = mkldnn_emitter->get_primitive_deps(conv_index);
auto functor = [&, conv_index](CPURuntimeContext* ctx, auto functor = [&, conv_index, arg2_size](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) { CPUExecutionContext* ectx) {
if (out_tensor != arg2_tensor)
{
memcpy(static_cast<char*>(out_tensor),
static_cast<char*>(arg2_tensor),
arg2_size);
}
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor); cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], arg1_tensor); cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], arg1_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[2], out_tensor); cpu::mkldnn_utils::set_memory_ptr(ctx, deps[2], out_tensor);
......
...@@ -193,14 +193,17 @@ namespace ngraph ...@@ -193,14 +193,17 @@ namespace ngraph
auto& arg0_tensor = external_function->get_tensor_data(args[0].get_name()); auto& arg0_tensor = external_function->get_tensor_data(args[0].get_name());
auto& arg1_tensor = external_function->get_tensor_data(args[1].get_name()); auto& arg1_tensor = external_function->get_tensor_data(args[1].get_name());
auto& arg2_tensor = external_function->get_tensor_data(args[2].get_name()); auto& arg2_tensor = external_function->get_tensor_data(args[2].get_name());
auto& arg3_tensor = external_function->get_tensor_data(args[3].get_name());
auto& arg4_tensor = external_function->get_tensor_data(args[4].get_name()); auto& arg4_tensor = external_function->get_tensor_data(args[4].get_name());
auto& arg5_tensor = external_function->get_tensor_data(args[5].get_name()); auto& arg5_tensor = external_function->get_tensor_data(args[5].get_name());
auto& out0_tensor = external_function->get_tensor_data(out[0].get_name()); auto& out0_tensor = external_function->get_tensor_data(out[0].get_name());
auto& mkldnn_emitter = external_function->get_mkldnn_emitter(); size_t arg3_size = args[3].get_size();
auto scales_size = shape_size(args[4].get_shape()); auto scales_size = shape_size(args[4].get_shape());
auto sum_scales_size = shape_size(args[5].get_shape()); auto sum_scales_size = shape_size(args[5].get_shape());
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto conv_desc = auto conv_desc =
mkldnn_emitter mkldnn_emitter
->get_convolution_forward_desc<ngraph::op::QuantizedConvolutionBiasAdd>( ->get_convolution_forward_desc<ngraph::op::QuantizedConvolutionBiasAdd>(
...@@ -212,9 +215,15 @@ namespace ngraph ...@@ -212,9 +215,15 @@ namespace ngraph
size_t conv_index = mkldnn_emitter->convolution_forward_init(true); size_t conv_index = mkldnn_emitter->convolution_forward_init(true);
auto& deps = mkldnn_emitter->get_primitive_deps(conv_index); auto& deps = mkldnn_emitter->get_primitive_deps(conv_index);
auto functor = auto functor = [&,
[&, scales_size, sum_scales_size, conv_desc, conv_attr, deps, conv_index]( scales_size,
CPURuntimeContext* ctx, CPUExecutionContext* ectx) mutable { sum_scales_size,
conv_desc,
conv_attr,
deps,
conv_index,
arg3_size](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) mutable {
if (ctx->first_iteration) if (ctx->first_iteration)
{ {
vector<float> dyn_scales; vector<float> dyn_scales;
...@@ -230,10 +239,8 @@ namespace ngraph ...@@ -230,10 +239,8 @@ namespace ngraph
{ {
if (old_pops.kind(i) == mkldnn::primitive::kind::eltwise) if (old_pops.kind(i) == mkldnn::primitive::kind::eltwise)
{ {
mkldnn::algorithm alg = mkldnn::algorithm_undef; mkldnn::algorithm alg;
float scale = 0; float scale, alpha, beta;
float alpha = 0;
float beta = 0;
old_pops.get_params_eltwise(i, scale, alg, alpha, beta); old_pops.get_params_eltwise(i, scale, alg, alpha, beta);
new_pops.append_eltwise(scale, alg, alpha, beta); new_pops.append_eltwise(scale, alg, alpha, beta);
} }
...@@ -247,6 +254,13 @@ namespace ngraph ...@@ -247,6 +254,13 @@ namespace ngraph
mkldnn_emitter->convolution_forward<true>( mkldnn_emitter->convolution_forward<true>(
conv_desc, conv_attr, executor::global_cpu_engine, conv_index); conv_desc, conv_attr, executor::global_cpu_engine, conv_index);
} }
if (out0_tensor != arg3_tensor)
{
memcpy(static_cast<char*>(out0_tensor),
static_cast<char*>(arg3_tensor),
arg3_size);
}
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor); cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], arg1_tensor); cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], arg1_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[2], arg2_tensor); cpu::mkldnn_utils::set_memory_ptr(ctx, deps[2], arg2_tensor);
...@@ -271,14 +285,17 @@ namespace ngraph ...@@ -271,14 +285,17 @@ namespace ngraph
auto& arg0_tensor = external_function->get_tensor_data(args[0].get_name()); auto& arg0_tensor = external_function->get_tensor_data(args[0].get_name());
auto& arg1_tensor = external_function->get_tensor_data(args[1].get_name()); auto& arg1_tensor = external_function->get_tensor_data(args[1].get_name());
auto& arg2_tensor = external_function->get_tensor_data(args[2].get_name()); auto& arg2_tensor = external_function->get_tensor_data(args[2].get_name());
auto& arg3_tensor = external_function->get_tensor_data(args[3].get_name());
auto& arg4_tensor = external_function->get_tensor_data(args[4].get_name()); auto& arg4_tensor = external_function->get_tensor_data(args[4].get_name());
auto& arg5_tensor = external_function->get_tensor_data(args[5].get_name()); auto& arg5_tensor = external_function->get_tensor_data(args[5].get_name());
auto& out0_tensor = external_function->get_tensor_data(out[0].get_name()); auto& out0_tensor = external_function->get_tensor_data(out[0].get_name());
auto& mkldnn_emitter = external_function->get_mkldnn_emitter(); size_t arg3_size = args[3].get_size();
auto scales_size = shape_size(args[4].get_shape()); auto scales_size = shape_size(args[4].get_shape());
auto sum_scales_size = shape_size(args[5].get_shape()); auto sum_scales_size = shape_size(args[5].get_shape());
auto& mkldnn_emitter = external_function->get_mkldnn_emitter();
auto conv_desc = mkldnn_emitter->get_convolution_forward_desc< auto conv_desc = mkldnn_emitter->get_convolution_forward_desc<
ngraph::op::QuantizedConvolutionBiasSignedAdd>(node, args, out); ngraph::op::QuantizedConvolutionBiasSignedAdd>(node, args, out);
auto conv_attr = mkldnn_emitter->get_convolution_forward_attr< auto conv_attr = mkldnn_emitter->get_convolution_forward_attr<
...@@ -286,9 +303,15 @@ namespace ngraph ...@@ -286,9 +303,15 @@ namespace ngraph
size_t conv_index = mkldnn_emitter->convolution_forward_init(true); size_t conv_index = mkldnn_emitter->convolution_forward_init(true);
auto& deps = mkldnn_emitter->get_primitive_deps(conv_index); auto& deps = mkldnn_emitter->get_primitive_deps(conv_index);
auto functor = auto functor = [&,
[&, scales_size, sum_scales_size, conv_desc, conv_attr, deps, conv_index]( scales_size,
CPURuntimeContext* ctx, CPUExecutionContext* ectx) mutable { sum_scales_size,
conv_desc,
conv_attr,
deps,
conv_index,
arg3_size](CPURuntimeContext* ctx,
CPUExecutionContext* ectx) mutable {
if (ctx->first_iteration) if (ctx->first_iteration)
{ {
vector<float> dyn_scales; vector<float> dyn_scales;
...@@ -304,10 +327,8 @@ namespace ngraph ...@@ -304,10 +327,8 @@ namespace ngraph
{ {
if (old_pops.kind(i) == mkldnn::primitive::kind::eltwise) if (old_pops.kind(i) == mkldnn::primitive::kind::eltwise)
{ {
mkldnn::algorithm alg = mkldnn::algorithm_undef; mkldnn::algorithm alg;
float scale = 0; float scale, alpha, beta;
float alpha = 0;
float beta = 0;
old_pops.get_params_eltwise(i, scale, alg, alpha, beta); old_pops.get_params_eltwise(i, scale, alg, alpha, beta);
new_pops.append_eltwise(scale, alg, alpha, beta); new_pops.append_eltwise(scale, alg, alpha, beta);
} }
...@@ -321,6 +342,13 @@ namespace ngraph ...@@ -321,6 +342,13 @@ namespace ngraph
mkldnn_emitter->convolution_forward<true>( mkldnn_emitter->convolution_forward<true>(
conv_desc, conv_attr, executor::global_cpu_engine, conv_index); conv_desc, conv_attr, executor::global_cpu_engine, conv_index);
} }
if (out0_tensor != arg3_tensor)
{
memcpy(static_cast<char*>(out0_tensor),
static_cast<char*>(arg3_tensor),
arg3_size);
}
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor); cpu::mkldnn_utils::set_memory_ptr(ctx, deps[0], arg0_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], arg1_tensor); cpu::mkldnn_utils::set_memory_ptr(ctx, deps[1], arg1_tensor);
cpu::mkldnn_utils::set_memory_ptr(ctx, deps[2], arg2_tensor); cpu::mkldnn_utils::set_memory_ptr(ctx, deps[2], arg2_tensor);
......
...@@ -2410,6 +2410,11 @@ namespace ngraph ...@@ -2410,6 +2410,11 @@ namespace ngraph
node, args, out); node, args, out);
auto& deps = mkldnn_emitter->get_primitive_deps(qconv_index); auto& deps = mkldnn_emitter->get_primitive_deps(qconv_index);
writer << "if (" << out[0].get_name() << " != " << args[3].get_name() << ")\n";
writer.block_begin();
writer << "memcpy(" << out[0].get_name() << ", " << args[3].get_name() << ", "
<< args[3].get_size() * args[3].get_element_type().size() << ");\n";
writer.block_end();
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[0]) writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[0])
<< ", " << args[0].get_name() << ");\n"; << ", " << args[0].get_name() << ");\n";
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[1]) writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[1])
...@@ -2441,6 +2446,11 @@ namespace ngraph ...@@ -2441,6 +2446,11 @@ namespace ngraph
node, args, out); node, args, out);
auto& deps = mkldnn_emitter->get_primitive_deps(qconv_index); auto& deps = mkldnn_emitter->get_primitive_deps(qconv_index);
writer << "if (" << out[0].get_name() << " != " << args[3].get_name() << ")\n";
writer.block_begin();
writer << "memcpy(" << out[0].get_name() << ", " << args[3].get_name() << ", "
<< args[3].get_size() * args[3].get_element_type().size() << ");\n";
writer.block_end();
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[0]) writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[0])
<< ", " << args[0].get_name() << ");\n"; << ", " << args[0].get_name() << ");\n";
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[1]) writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[1])
...@@ -2500,6 +2510,11 @@ namespace ngraph ...@@ -2500,6 +2510,11 @@ namespace ngraph
node, args, out); node, args, out);
auto& deps = mkldnn_emitter->get_primitive_deps(conv_index); auto& deps = mkldnn_emitter->get_primitive_deps(conv_index);
writer << "if (" << out[0].get_name() << " != " << args[3].get_name() << ")\n";
writer.block_begin();
writer << "memcpy(" << out[0].get_name() << ", " << args[3].get_name() << ", "
<< args[3].get_size() * args[3].get_element_type().size() << ");\n";
writer.block_end();
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[0]) writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[0])
<< ", " << args[0].get_name() << ");\n"; << ", " << args[0].get_name() << ");\n";
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[1]) writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[1])
...@@ -2527,6 +2542,11 @@ namespace ngraph ...@@ -2527,6 +2542,11 @@ namespace ngraph
node, args, out); node, args, out);
auto& deps = mkldnn_emitter->get_primitive_deps(conv_index); auto& deps = mkldnn_emitter->get_primitive_deps(conv_index);
writer << "if (" << out[0].get_name() << " != " << args[2].get_name() << ")\n";
writer.block_begin();
writer << "memcpy(" << out[0].get_name() << ", " << args[2].get_name() << ", "
<< args[2].get_size() * args[2].get_element_type().size() << ");\n";
writer.block_end();
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[0]) writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[0])
<< ", " << args[0].get_name() << ");\n"; << ", " << args[0].get_name() << ");\n";
writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[1]) writer << "cpu::mkldnn_utils::set_memory_ptr(ctx, " << to_string(deps[1])
......
...@@ -772,10 +772,18 @@ size_t MKLDNNEmitter::build_reorder(const mkldnn::memory::desc& input_desc, ...@@ -772,10 +772,18 @@ size_t MKLDNNEmitter::build_reorder(const mkldnn::memory::desc& input_desc,
size_t input_index = build_memory_primitive(input_desc); size_t input_index = build_memory_primitive(input_desc);
size_t result_index = build_memory_primitive(result_desc); size_t result_index = build_memory_primitive(result_desc);
size_t primitive_index = insert_primitive( size_t primitive_index = 0;
new mkldnn::reorder(*m_mkldnn_primitives[input_index], *m_mkldnn_primitives[result_index])); try
{
primitive_index = insert_primitive(new mkldnn::reorder(*m_mkldnn_primitives[input_index],
*m_mkldnn_primitives[result_index]));
m_primitive_deps[primitive_index] = {input_index, result_index}; m_primitive_deps[primitive_index] = {input_index, result_index};
}
catch (const mkldnn::error& e)
{
throw ngraph_error("Could not create mkldnn primitive " + e.message);
}
return primitive_index; return primitive_index;
} }
......
...@@ -679,6 +679,68 @@ TEST(builder, scaled_QC_with_bias_signed_add_and_relu) ...@@ -679,6 +679,68 @@ TEST(builder, scaled_QC_with_bias_signed_add_and_relu)
read_vector<uint8_t>(result)); read_vector<uint8_t>(result));
} }
// Exercises ScaledQuantizedConvolutionBiasSignedAdd on the CPU backend when the data,
// filter and summand parameters are declared with a trailing unit axis and are permuted
// through op::Reshape (AxisVector{0, 3, 1, 2}) before reaching the fused op. The u8 result
// is compared against fixed expected values.
// NOTE(review): the "_nhwc" suffix suggests the parameters model channels-last tensors --
// confirm against the intent of the original change.
TEST(builder, scaled_QC_with_bias_signed_add_and_relu_nhwc)
{
    const Shape data_shape{1, 3, 4, 1};   // input shape
    const Shape filter_shape{1, 3, 3, 1}; // filter shape
    const Shape result_shape{1, 1, 3, 4}; // output shape
    const Shape bias_shape{1};

    // Host-side values fed to the four function parameters.
    const vector<uint8_t> data_values = {1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4};
    const vector<int8_t> filter_values = {1, 2, 3, 4, 5, 0, 0, 1, 2};
    const vector<int32_t> bias_values = {5};
    const vector<int8_t> summand_values = {-1, -2, -3, -4, -5, -6, -10, 0, 1, 2, 3, 4};

    // Axis order applied by every Reshape below: the last axis moves to position 1.
    const AxisVector permute_order{0, 3, 1, 2};

    // Graph inputs, each followed by its permuting Reshape.
    auto data_param = make_shared<op::Parameter>(element::u8, data_shape);
    auto data_permuted = make_shared<op::Reshape>(data_param, permute_order, Shape{1, 1, 3, 4});
    auto filter_param = make_shared<op::Parameter>(element::i8, filter_shape);
    auto filter_permuted =
        make_shared<op::Reshape>(filter_param, permute_order, Shape{1, 1, 3, 3});
    auto summand_param = make_shared<op::Parameter>(element::i8, data_shape);
    auto summand_permuted =
        make_shared<op::Reshape>(summand_param, permute_order, Shape{1, 1, 3, 4});
    auto bias_param = make_shared<op::Parameter>(element::i32, bias_shape);

    // Scalar f32 constants handed positionally to the builder after data_dilation.
    // NOTE(review): presumably these are min/max quantization ranges -- confirm against
    // the ScaledQuantizedConvolutionBiasSignedAdd signature.
    auto scalar_f32 = [](float value) {
        return op::Constant::create(element::f32, Shape{}, {value});
    };
    auto q0 = scalar_f32(0.0f);
    auto q1 = scalar_f32(255.0f);
    auto q2 = scalar_f32(-127.0f);
    auto q3 = scalar_f32(127.0f);
    auto q4 = scalar_f32(22.0f);
    auto q5 = scalar_f32(90.0f);
    auto q6 = scalar_f32(22.0f);
    auto q7 = scalar_f32(90.0f);

    auto fused_conv =
        ngraph::builder::ScaledQuantizedConvolutionBiasSignedAdd(data_permuted,
                                                                 filter_permuted,
                                                                 bias_param,
                                                                 summand_permuted,
                                                                 Strides{1, 1}, // move_strides
                                                                 Strides{1, 1}, // filter_dilation
                                                                 CoordinateDiff{1, 1}, // below_pads
                                                                 CoordinateDiff{1, 1}, // above_pads
                                                                 Strides{1, 1}, // data_dilation
                                                                 q0,
                                                                 q1,
                                                                 q2,
                                                                 q3,
                                                                 q4,
                                                                 q5,
                                                                 q6,
                                                                 q7,
                                                                 true);

    auto func = make_shared<Function>(
        NodeVector{fused_conv},
        ParameterVector{data_param, filter_param, bias_param, summand_param});
    constant_fold(func);

    auto backend = runtime::Backend::create("CPU");

    // Create some tensors for input/output
    auto data_tensor = backend->create_tensor(element::u8, data_shape);
    copy_data(data_tensor, data_values);
    auto filter_tensor = backend->create_tensor(element::i8, filter_shape);
    copy_data(filter_tensor, filter_values);
    auto bias_tensor = backend->create_tensor(element::i32, Shape{1});
    copy_data(bias_tensor, bias_values);
    auto summand_tensor = backend->create_tensor(element::i8, data_shape);
    copy_data(summand_tensor, summand_values);
    auto result = backend->create_tensor(element::u8, result_shape);

    auto handle = backend->compile(func);
    backend->call_with_validate(
        handle, {result}, {data_tensor, filter_tensor, bias_tensor, summand_tensor});

    EXPECT_EQ((vector<uint8_t>{74, 106, 93, 97, 112, 127, 127, 127, 110, 127, 127, 127}),
              read_vector<uint8_t>(result));
}
TEST(builder, dynamic_scaled_QC_with_bias_signed_add_and_relu) TEST(builder, dynamic_scaled_QC_with_bias_signed_add_and_relu)
{ {
Shape shape_a{1, 1, 3, 4}; // input shape Shape shape_a{1, 1, 3, 4}; // input shape
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment