Commit 30a60d39 authored by Chip Kerchner, committed by Alexander Alekhin

Merge pull request #15274 from ChipKerchner:lkpyramidToHal

* Convert lkpyramid from SSE SIMD to HAL - 90% faster on Power (VSX).

* Replace stores with reduce_sum (see the sketch below). Rework to handle endianness correctly.

* Fix compiler warnings by casting values explicitly to shorts.

* Switch to CV_SIMD128 compiler definition. Unroll loop to 8 elements since we've already loaded the data.
parent ca7640e1
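To illustrate the reduce_sum point from the commit message: the old SSE path spilled each vector accumulator to an aligned buffer and summed its four lanes in scalar code, while the HAL version calls v_reduce_sum() and lets each backend (SSE2, NEON, VSX) use its own reduction, which also sidesteps the lane-indexing/endianness issue of the stored buffer. The standalone sketch below is not part of the patch; it assumes an OpenCV build where opencv2/core/hal/intrin.hpp defines CV_SIMD128.

// Minimal sketch (not from the patch) of the horizontal-sum change:
// accumulate per lane with v_muladd, then reduce with one portable call
// instead of _mm_store_ps plus four scalar adds.
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>
#include <cstdio>

int main()
{
#if CV_SIMD128
    const float data[8] = { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f };

    // Accumulate x*x four lanes at a time, as the qA11/qA12/qA22 accumulators do.
    cv::v_float32x4 acc = cv::v_setzero_f32();
    for (int i = 0; i < 8; i += 4)
    {
        cv::v_float32x4 x = cv::v_load(data + i);
        acc = cv::v_muladd(x, x, acc);       // acc += x*x per lane
    }

    // One call replaces the old store-to-buffer + scalar lane sum.
    float sum = cv::v_reduce_sum(acc);
    std::printf("sum of squares = %f\n", sum); // 204.0
#endif
    return 0;
}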
@@ -239,13 +239,12 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
         acctype iA11 = 0, iA12 = 0, iA22 = 0;
         float A11, A12, A22;
-#if CV_SSE2
-        __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16));
-        __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16));
-        __m128i z = _mm_setzero_si128();
-        __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1));
-        __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1));
-        __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps();
+#if CV_SIMD128 && !CV_NEON
+        v_int16x8 qw0((short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01));
+        v_int16x8 qw1((short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11));
+        v_int32x4 qdelta_d = v_setall_s32(1 << (W_BITS1-1));
+        v_int32x4 qdelta = v_setall_s32(1 << (W_BITS1-5-1));
+        v_float32x4 qA11 = v_setzero_f32(), qA12 = v_setzero_f32(), qA22 = v_setzero_f32();
 #endif
 #if CV_NEON
@@ -275,44 +274,75 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
             x = 0;
-#if CV_SSE2
-            for( ; x <= winSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 )
+#if CV_SIMD128 && !CV_NEON
+            for( ; x <= winSize.width*cn - 8; x += 8, dsrc += 8*2, dIptr += 8*2 )
             {
-                __m128i v00, v01, v10, v11, t0, t1;
-                v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z);
-                v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z);
-                v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + stepI)), z);
-                v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + stepI + cn)), z);
-                t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
-                                   _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
-                t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5);
-                _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0));
-                v00 = _mm_loadu_si128((const __m128i*)(dsrc));
-                v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2));
-                v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep));
-                v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2));
-                t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
-                                   _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
-                t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0),
-                                   _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1));
-                t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1);
-                t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1);
-                v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
-                _mm_storeu_si128((__m128i*)dIptr, v00);
-                t0 = _mm_srai_epi32(v00, 16); // Iy0 Iy1 Iy2 Iy3
-                t1 = _mm_srai_epi32(_mm_slli_epi32(v00, 16), 16); // Ix0 Ix1 Ix2 Ix3
-                __m128 fy = _mm_cvtepi32_ps(t0);
-                __m128 fx = _mm_cvtepi32_ps(t1);
-                qA22 = _mm_add_ps(qA22, _mm_mul_ps(fy, fy));
-                qA12 = _mm_add_ps(qA12, _mm_mul_ps(fx, fy));
-                qA11 = _mm_add_ps(qA11, _mm_mul_ps(fx, fx));
+                v_int32x4 t0, t1;
+                v_int16x8 v00, v01, v10, v11, t00, t01, t10, t11;
+                v00 = v_reinterpret_as_s16(v_load_expand(src + x));
+                v01 = v_reinterpret_as_s16(v_load_expand(src + x + cn));
+                v10 = v_reinterpret_as_s16(v_load_expand(src + x + stepI));
+                v11 = v_reinterpret_as_s16(v_load_expand(src + x + stepI + cn));
+                v_zip(v00, v01, t00, t01);
+                v_zip(v10, v11, t10, t11);
+                t0 = v_dotprod(t00, qw0, qdelta) + v_dotprod(t10, qw1);
+                t1 = v_dotprod(t01, qw0, qdelta) + v_dotprod(t11, qw1);
+                t0 = t0 >> (W_BITS1-5);
+                t1 = t1 >> (W_BITS1-5);
+                v_store(Iptr + x, v_pack(t0, t1));
+                v00 = v_reinterpret_as_s16(v_load(dsrc));
+                v01 = v_reinterpret_as_s16(v_load(dsrc + cn2));
+                v10 = v_reinterpret_as_s16(v_load(dsrc + dstep));
+                v11 = v_reinterpret_as_s16(v_load(dsrc + dstep + cn2));
+                v_zip(v00, v01, t00, t01);
+                v_zip(v10, v11, t10, t11);
+                t0 = v_dotprod(t00, qw0, qdelta_d) + v_dotprod(t10, qw1);
+                t1 = v_dotprod(t01, qw0, qdelta_d) + v_dotprod(t11, qw1);
+                t0 = t0 >> W_BITS1;
+                t1 = t1 >> W_BITS1;
+                v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
+                v_store(dIptr, v00);
+                v00 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(v00))));
+                v_expand(v00, t1, t0);
+                v_float32x4 fy = v_cvt_f32(t0);
+                v_float32x4 fx = v_cvt_f32(t1);
+                qA22 = v_muladd(fy, fy, qA22);
+                qA12 = v_muladd(fx, fy, qA12);
+                qA11 = v_muladd(fx, fx, qA11);
+                v00 = v_reinterpret_as_s16(v_load(dsrc + 4*2));
+                v01 = v_reinterpret_as_s16(v_load(dsrc + 4*2 + cn2));
+                v10 = v_reinterpret_as_s16(v_load(dsrc + 4*2 + dstep));
+                v11 = v_reinterpret_as_s16(v_load(dsrc + 4*2 + dstep + cn2));
+                v_zip(v00, v01, t00, t01);
+                v_zip(v10, v11, t10, t11);
+                t0 = v_dotprod(t00, qw0, qdelta_d) + v_dotprod(t10, qw1);
+                t1 = v_dotprod(t01, qw0, qdelta_d) + v_dotprod(t11, qw1);
+                t0 = t0 >> W_BITS1;
+                t1 = t1 >> W_BITS1;
+                v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...
+                v_store(dIptr + 4*2, v00);
+                v00 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(v00))));
+                v_expand(v00, t1, t0);
+                fy = v_cvt_f32(t0);
+                fx = v_cvt_f32(t1);
+                qA22 = v_muladd(fy, fy, qA22);
+                qA12 = v_muladd(fx, fy, qA12);
+                qA11 = v_muladd(fx, fx, qA11);
             }
 #endif
@@ -419,14 +449,10 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
             }
         }
-#if CV_SSE2
-        float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];
-        _mm_store_ps(A11buf, qA11);
-        _mm_store_ps(A12buf, qA12);
-        _mm_store_ps(A22buf, qA22);
-        iA11 += A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3];
-        iA12 += A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3];
-        iA22 += A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3];
+#if CV_SIMD128 && !CV_NEON
+        iA11 += v_reduce_sum(qA11);
+        iA12 += v_reduce_sum(qA12);
+        iA22 += v_reduce_sum(qA22);
 #endif
 #if CV_NEON
@@ -479,10 +505,10 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
             iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
             acctype ib1 = 0, ib2 = 0;
             float b1, b2;
-#if CV_SSE2
-            qw0 = _mm_set1_epi32(iw00 + (iw01 << 16));
-            qw1 = _mm_set1_epi32(iw10 + (iw11 << 16));
-            __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps();
+#if CV_SIMD128 && !CV_NEON
+            qw0 = v_int16x8((short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01), (short)(iw00), (short)(iw01));
+            qw1 = v_int16x8((short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11), (short)(iw10), (short)(iw11));
+            v_float32x4 qb0 = v_setzero_f32(), qb1 = v_setzero_f32();
 #endif
 #if CV_NEON
@@ -503,34 +529,32 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
                 x = 0;
-#if CV_SSE2
+#if CV_SIMD128 && !CV_NEON
                 for( ; x <= winSize.width*cn - 8; x += 8, dIptr += 8*2 )
                 {
-                    __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)), diff1;
-                    __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z);
-                    __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z);
-                    __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + stepJ)), z);
-                    __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + stepJ + cn)), z);
-                    __m128i t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
-                                               _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
-                    __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0),
-                                               _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1));
-                    t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5);
-                    t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5);
-                    diff0 = _mm_subs_epi16(_mm_packs_epi32(t0, t1), diff0);
-                    diff1 = _mm_unpackhi_epi16(diff0, diff0);
-                    diff0 = _mm_unpacklo_epi16(diff0, diff0); // It0 It0 It1 It1 ...
-                    v00 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
-                    v01 = _mm_loadu_si128((const __m128i*)(dIptr + 8));
-                    v10 = _mm_unpacklo_epi16(v00, v01);
-                    v11 = _mm_unpackhi_epi16(v00, v01);
-                    v00 = _mm_unpacklo_epi16(diff0, diff1);
-                    v01 = _mm_unpackhi_epi16(diff0, diff1);
-                    v00 = _mm_madd_epi16(v00, v10);
-                    v11 = _mm_madd_epi16(v01, v11);
-                    qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00));
-                    qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v11));
+                    v_int16x8 diff0 = v_reinterpret_as_s16(v_load(Iptr + x)), diff1, diff2;
+                    v_int16x8 v00 = v_reinterpret_as_s16(v_load_expand(Jptr + x));
+                    v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn));
+                    v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr + x + stepJ));
+                    v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr + x + stepJ + cn));
+                    v_int32x4 t0, t1;
+                    v_int16x8 t00, t01, t10, t11;
+                    v_zip(v00, v01, t00, t01);
+                    v_zip(v10, v11, t10, t11);
+                    t0 = v_dotprod(t00, qw0, qdelta) + v_dotprod(t10, qw1);
+                    t1 = v_dotprod(t01, qw0, qdelta) + v_dotprod(t11, qw1);
+                    t0 = t0 >> (W_BITS1-5);
+                    t1 = t1 >> (W_BITS1-5);
+                    diff0 = v_pack(t0, t1) - diff0;
+                    v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ...
+                    v00 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
+                    v01 = v_reinterpret_as_s16(v_load(dIptr + 8));
+                    v_zip(v00, v01, v10, v11);
+                    v_zip(diff2, diff1, v00, v01);
+                    qb0 += v_cvt_f32(v_dotprod(v00, v10));
+                    qb1 += v_cvt_f32(v_dotprod(v01, v11));
                 }
 #endif
@@ -616,11 +640,11 @@ void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
                 }
             }
-#if CV_SSE2
-            float CV_DECL_ALIGNED(16) bbuf[4];
-            _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1));
-            ib1 += bbuf[0] + bbuf[2];
-            ib2 += bbuf[1] + bbuf[3];
+#if CV_SIMD128 && !CV_NEON
+            v_float32x4 qf0, qf1;
+            v_recombine(v_interleave_pairs(qb0 + qb1), v_setzero_f32(), qf0, qf1);
+            ib1 += v_reduce_sum(qf0);
+            ib2 += v_reduce_sum(qf1);
 #endif
 #if CV_NEON
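For reference, the v_dotprod calls that replace _mm_madd_epi16 throughout the new code multiply adjacent int16 pairs and accumulate them into int32 lanes; the three-argument overload additionally adds an initial vector, which is how the rounding constants qdelta and qdelta_d are folded in above. The toy example below is illustrative only, not taken from the patch, and assumes an OpenCV build with CV_SIMD128 enabled.

// Illustrative sketch of the v_dotprod pattern used in the new loops:
// pairwise int16 products summed into int32 lanes, optionally plus a bias vector.
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>
#include <cstdio>

int main()
{
#if CV_SIMD128
    cv::v_int16x8 a(1, 2, 3, 4, 5, 6, 7, 8);
    cv::v_int16x8 w(10, 1, 10, 1, 10, 1, 10, 1);   // interleaved weights, like qw0
    cv::v_int32x4 bias = cv::v_setall_s32(100);     // stand-in for the qdelta rounding term

    cv::v_int32x4 d  = cv::v_dotprod(a, w);         // {1*10+2*1, 3*10+4*1, ...}
    cv::v_int32x4 db = cv::v_dotprod(a, w, bias);   // same sums plus 100 per lane

    int p[4], q[4];
    cv::v_store(p, d);
    cv::v_store(q, db);
    std::printf("dot:      %d %d %d %d\n", p[0], p[1], p[2], p[3]); // 12 34 56 78
    std::printf("dot+bias: %d %d %d %d\n", q[0], q[1], q[2], q[3]); // 112 134 156 178
#endif
    return 0;
}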