Commit 3ea2586a authored by amatyuko's avatar amatyuko

Fix for SSE2 intrinsics problem in the part of saturation arithmetic processing…

Fix an SSE2 intrinsics problem in the saturation arithmetic of the 32s->16u packed conversion:
for large negative values less than -INT_MAX+32767, the sign of the number is lost due to overflow, which leads to
incorrect saturation to the MAX value instead of zero.
The issue does not reproduce with CV_ENABLE_INTRINSICS=OFF.
parent 9433784e
......@@ -494,7 +494,12 @@ void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
// Packs the signed 32-bit lanes of `a` and `b` into one vector of unsigned 16-bit lanes,
// saturating each value to the unsigned 16-bit range [0, 65535].
// NOTE(review): this listing appears to be a diff rendered without +/- markers, so the
// superseded computation of `r` and its replacement both show up in the body below.
inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
{
// Bias of 32768: shifts [0, 65535] onto [-32768, 32767] so the signed saturating pack can be used.
__m128i delta32 = _mm_set1_epi32(32768);
// Superseded form (presumably the line removed by this change): subtracting the bias from the raw
// inputs wraps around for values below -INT_MAX+32767, flipping their sign, so they end up
// saturated to the maximum instead of zero.
__m128i r = _mm_packs_epi32(_mm_sub_epi32(a.val, delta32), _mm_sub_epi32(b.val, delta32));
// preliminary saturate negative values to zero
// (the compare yields an all-ones mask for lanes > 0 and zero otherwise; AND-ing keeps
// positive lanes unchanged and clears every other lane to 0)
__m128i a1 = _mm_and_si128(a.val, _mm_cmpgt_epi32(a.val, _mm_set1_epi32(0)));
__m128i b1 = _mm_and_si128(b.val, _mm_cmpgt_epi32(b.val, _mm_set1_epi32(0)));
// With the inputs clamped to >= 0 the biased subtraction can no longer overflow; the signed pack
// then saturates each lane to [-32768, 32767].
__m128i r = _mm_packs_epi32(_mm_sub_epi32(a1, delta32), _mm_sub_epi32(b1, delta32));
// Subtracting -32768 with wrapping 16-bit arithmetic removes the bias again.
return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768)));
}
......
......@@ -453,9 +453,9 @@ struct Cvt_SIMD<int, uchar>
{
v_int32x4 v_src1 = v_load(src + x), v_src2 = v_load(src + x + cWidth);
v_int32x4 v_src3 = v_load(src + x + cWidth * 2), v_src4 = v_load(src + x + cWidth * 3);
v_uint16x8 v_dst1 = v_pack_u(v_src1, v_src2);
v_uint16x8 v_dst2 = v_pack_u(v_src3, v_src4);
v_store(dst + x, v_pack(v_dst1, v_dst2));
v_int16x8 v_dst1 = v_pack(v_src1, v_src2);
v_int16x8 v_dst2 = v_pack(v_src3, v_src4);
v_store(dst + x, v_pack_u(v_dst1, v_dst2));
}
}
return x;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment