Commit 18d10d6b authored by Vitaly Tuzov

Fixed v_reduce_sad intrinsics implementation and added tests

parent 5c0a98cf
...@@ -1141,12 +1141,16 @@ inline v_float32x8 v_reduce_sum4(const v_float32x8& a, const v_float32x8& b, ...@@ -1141,12 +1141,16 @@ inline v_float32x8 v_reduce_sum4(const v_float32x8& a, const v_float32x8& b,
inline unsigned v_reduce_sad(const v_uint8x32& a, const v_uint8x32& b) inline unsigned v_reduce_sad(const v_uint8x32& a, const v_uint8x32& b)
{ {
return (unsigned)_v_cvtsi256_si32(_mm256_sad_epu8(a.val, b.val)); __m256i half = _mm256_sad_epu8(a.val, b.val);
__m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
} }
inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b) inline unsigned v_reduce_sad(const v_int8x32& a, const v_int8x32& b)
{ {
__m256i half = _mm256_set1_epi8(0x7f); __m256i half = _mm256_set1_epi8(0x7f);
return (unsigned)_v_cvtsi256_si32(_mm256_sad_epu8(_mm256_add_epi8(a.val, half), _mm256_add_epi8(b.val, half))); half = _mm256_sad_epu8(_mm256_add_epi8(a.val, half), _mm256_add_epi8(b.val, half));
__m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
} }
inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b) inline unsigned v_reduce_sad(const v_uint16x16& a, const v_uint16x16& b)
{ {
......
...@@ -1486,13 +1486,14 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min) ...@@ -1486,13 +1486,14 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min)
inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b) inline unsigned v_reduce_sad(const v_uint8x16& a, const v_uint8x16& b)
{ {
return (unsigned)_mm_cvtsi128_si32(_mm_sad_epu8(a.val, b.val)); __m128i half = _mm_sad_epu8(a.val, b.val);
return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half)));
} }
inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b) inline unsigned v_reduce_sad(const v_int8x16& a, const v_int8x16& b)
{ {
__m128i half = _mm_set1_epi8(0x7f); __m128i half = _mm_set1_epi8(0x7f);
return (unsigned)_mm_cvtsi128_si32(_mm_sad_epu8(_mm_add_epi8(a.val, half), half = _mm_sad_epu8(_mm_add_epi8(a.val, half), _mm_add_epi8(b.val, half));
_mm_add_epi8(b.val, half))); return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half)));
} }
inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b) inline unsigned v_reduce_sad(const v_uint16x8& a, const v_uint16x8& b)
{ {
......
...@@ -770,6 +770,15 @@ template<typename R> struct TheTest ...@@ -770,6 +770,15 @@ template<typename R> struct TheTest
return *this; return *this;
} }
// Checks v_reduce_sad(a, b) for the tested type R against a closed-form expected value.
// NOTE(review): presumably Data<R>() fills the lanes with 1..nlanes and Data<R>(v) fills
// every lane with v — confirm against the Data<> definition. Under that assumption the
// sum of |i - nlanes/2| for i = 1..nlanes equals nlanes*nlanes/4 (for even nlanes),
// which is the value asserted below.
// Returns *this so the call can be chained with the other test_* methods.
TheTest & test_reduce_sad()
{
// dataA is default-constructed; dataB is constructed from the single value R::nlanes/2.
Data<R> dataA, dataB(R::nlanes/2);
R a = dataA;
R b = dataB;
// The expected value is cast to unsigned before being compared with the reduction result.
EXPECT_EQ((unsigned)(R::nlanes*R::nlanes/4), v_reduce_sad(a, b));
return *this;
}
TheTest & test_mask() TheTest & test_mask()
{ {
typedef typename V_RegTraits<R>::int_reg int_reg; typedef typename V_RegTraits<R>::int_reg int_reg;
...@@ -1320,6 +1329,7 @@ void test_hal_intrin_uint8() ...@@ -1320,6 +1329,7 @@ void test_hal_intrin_uint8()
.test_logic() .test_logic()
.test_min_max() .test_min_max()
.test_absdiff() .test_absdiff()
.test_reduce_sad()
.test_mask() .test_mask()
.test_popcount() .test_popcount()
.test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>() .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
...@@ -1358,6 +1368,7 @@ void test_hal_intrin_int8() ...@@ -1358,6 +1368,7 @@ void test_hal_intrin_int8()
.test_absdiff() .test_absdiff()
.test_absdiffs() .test_absdiffs()
.test_abs() .test_abs()
.test_reduce_sad()
.test_mask() .test_mask()
.test_popcount() .test_popcount()
.test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>() .test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
...@@ -1387,6 +1398,7 @@ void test_hal_intrin_uint16() ...@@ -1387,6 +1398,7 @@ void test_hal_intrin_uint16()
.test_min_max() .test_min_max()
.test_absdiff() .test_absdiff()
.test_reduce() .test_reduce()
.test_reduce_sad()
.test_mask() .test_mask()
.test_popcount() .test_popcount()
.test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>() .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
...@@ -1418,6 +1430,7 @@ void test_hal_intrin_int16() ...@@ -1418,6 +1430,7 @@ void test_hal_intrin_int16()
.test_absdiffs() .test_absdiffs()
.test_abs() .test_abs()
.test_reduce() .test_reduce()
.test_reduce_sad()
.test_mask() .test_mask()
.test_popcount() .test_popcount()
.test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>() .test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
...@@ -1446,6 +1459,7 @@ void test_hal_intrin_uint32() ...@@ -1446,6 +1459,7 @@ void test_hal_intrin_uint32()
.test_min_max() .test_min_max()
.test_absdiff() .test_absdiff()
.test_reduce() .test_reduce()
.test_reduce_sad()
.test_mask() .test_mask()
.test_popcount() .test_popcount()
.test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>() .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
...@@ -1473,6 +1487,7 @@ void test_hal_intrin_int32() ...@@ -1473,6 +1487,7 @@ void test_hal_intrin_int32()
.test_min_max() .test_min_max()
.test_absdiff() .test_absdiff()
.test_reduce() .test_reduce()
.test_reduce_sad()
.test_mask() .test_mask()
.test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>() .test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
.test_unpack() .test_unpack()
...@@ -1528,6 +1543,7 @@ void test_hal_intrin_float32() ...@@ -1528,6 +1543,7 @@ void test_hal_intrin_float32()
.test_min_max() .test_min_max()
.test_float_absdiff() .test_float_absdiff()
.test_reduce() .test_reduce()
.test_reduce_sad()
.test_mask() .test_mask()
.test_unpack() .test_unpack()
.test_float_math() .test_float_math()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment