Commit 84fd8190 authored by Alexander Alekhin

Merge pull request #14232 from terfendail:popcount_rework

parents 88996323 7a55f2af
@@ -1015,6 +1015,34 @@ OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float64x4, _mm256_castsi256_pd
////////// Reduce and mask /////////
/** Reduce **/
inline unsigned v_reduce_sum(const v_uint8x32& a)
{
__m256i half = _mm256_sad_epu8(a.val, _mm256_setzero_si256());
__m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
}
inline int v_reduce_sum(const v_int8x32& a)
{
__m256i half = _mm256_sad_epu8(_mm256_xor_si256(a.val, _mm256_set1_epi8((schar)-128)), _mm256_setzero_si256());
__m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter))) - 4096;
}
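The -4096 constant above compensates for biasing all 32 int8 lanes by 128 before the unsigned SAD. A minimal scalar sketch of the same identity (the helper below is illustrative only, not part of this patch):

#include <cstdint>

// sum of 32 signed bytes == sum of the XOR-0x80-biased unsigned bytes minus 32*128
static inline int reduce_sum_s8_sketch(const int8_t v[32])
{
    int sum = 0;
    for (int i = 0; i < 32; i++)
        sum += (uint8_t)(v[i] ^ 0x80);   // the bytes _mm256_sad_epu8 accumulates
    return sum - 32 * 128;               // undo the per-lane bias (4096)
}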
#define OPENCV_HAL_IMPL_AVX_REDUCE_32(_Tpvec, sctype, func, intrin) \
inline sctype v_reduce_##func(const _Tpvec& a) \
{ \
__m128i val = intrin(_v256_extract_low(a.val), _v256_extract_high(a.val)); \
val = intrin(val, _mm_srli_si128(val,8)); \
val = intrin(val, _mm_srli_si128(val,4)); \
val = intrin(val, _mm_srli_si128(val,2)); \
val = intrin(val, _mm_srli_si128(val,1)); \
return (sctype)_mm_cvtsi128_si32(val); \
}
OPENCV_HAL_IMPL_AVX_REDUCE_32(v_uint8x32, uchar, min, _mm_min_epu8)
OPENCV_HAL_IMPL_AVX_REDUCE_32(v_int8x32, schar, min, _mm_min_epi8)
OPENCV_HAL_IMPL_AVX_REDUCE_32(v_uint8x32, uchar, max, _mm_max_epu8)
OPENCV_HAL_IMPL_AVX_REDUCE_32(v_int8x32, schar, max, _mm_max_epi8)
#define OPENCV_HAL_IMPL_AVX_REDUCE_16(_Tpvec, sctype, func, intrin) \
inline sctype v_reduce_##func(const _Tpvec& a) \
{ \
@@ -1062,31 +1090,6 @@ OPENCV_HAL_IMPL_AVX_REDUCE_8(v_int32x8, int, max, _mm_max_epi32)
OPENCV_HAL_IMPL_AVX_REDUCE_FLT(min, _mm_min_ps)
OPENCV_HAL_IMPL_AVX_REDUCE_FLT(max, _mm_max_ps)
inline ushort v_reduce_sum(const v_uint16x16& a)
{
__m128i a0 = _v256_extract_low(a.val);
__m128i a1 = _v256_extract_high(a.val);
__m128i s0 = _mm_adds_epu16(a0, a1);
s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 2));
return (ushort)_mm_cvtsi128_si32(s0);
}
inline short v_reduce_sum(const v_int16x16& a)
{
__m256i s0 = _mm256_hadds_epi16(a.val, a.val);
s0 = _mm256_hadds_epi16(s0, s0);
s0 = _mm256_hadds_epi16(s0, s0);
__m128i s1 = _v256_extract_high(s0);
s1 = _mm_adds_epi16(_v256_extract_low(s0), s1);
return (short)_mm_cvtsi128_si32(s1);
}
inline int v_reduce_sum(const v_int32x8& a)
{
__m256i s0 = _mm256_hadd_epi32(a.val, a.val);
@@ -1101,6 +1104,11 @@ inline int v_reduce_sum(const v_int32x8& a)
inline unsigned v_reduce_sum(const v_uint32x8& a)
{ return v_reduce_sum(v_reinterpret_as_s32(a)); }
inline int v_reduce_sum(const v_int16x16& a)
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
inline unsigned v_reduce_sum(const v_uint16x16& a)
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
inline float v_reduce_sum(const v_float32x8& a)
{
__m256 s0 = _mm256_hadd_ps(a.val, a.val);
@@ -1112,6 +1120,18 @@ inline float v_reduce_sum(const v_float32x8& a)
return _mm_cvtss_f32(s1);
}
inline uint64 v_reduce_sum(const v_uint64x4& a)
{
uint64 CV_DECL_ALIGNED(32) idx[2];
_mm_store_si128((__m128i*)idx, _mm_add_epi64(_v256_extract_low(a.val), _v256_extract_high(a.val)));
return idx[0] + idx[1];
}
inline int64 v_reduce_sum(const v_int64x4& a)
{
int64 CV_DECL_ALIGNED(32) idx[2];
_mm_store_si128((__m128i*)idx, _mm_add_epi64(_v256_extract_low(a.val), _v256_extract_high(a.val)));
return idx[0] + idx[1];
}
inline double v_reduce_sum(const v_float64x4& a)
{
__m256d s0 = _mm256_hadd_pd(a.val, a.val);
@@ -1166,26 +1186,39 @@ inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
}
/** Popcount **/
#define OPENCV_HAL_IMPL_AVX_POPCOUNT(_Tpvec) \
inline v_uint32x8 v_popcount(const _Tpvec& a) \
{ \
const v_uint32x8 m1 = v256_setall_u32(0x55555555); \
const v_uint32x8 m2 = v256_setall_u32(0x33333333); \
const v_uint32x8 m4 = v256_setall_u32(0x0f0f0f0f); \
v_uint32x8 p = v_reinterpret_as_u32(a); \
p = ((p >> 1) & m1) + (p & m1); \
p = ((p >> 2) & m2) + (p & m2); \
p = ((p >> 4) & m4) + (p & m4); \
p.val = _mm256_sad_epu8(p.val, _mm256_setzero_si256()); \
return p; \
}
OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint8x32)
OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int8x32)
OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint16x16)
OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int16x16)
OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint32x8)
OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int32x8)
inline v_uint8x32 v_popcount(const v_uint8x32& a)
{
__m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
__m256i _popcnt_mask = _mm256_set1_epi8(0x0F);
return v_uint8x32(_mm256_add_epi8(_mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(a.val, _popcnt_mask)),
_mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_mm256_srli_epi16(a.val, 4), _popcnt_mask))));
}
inline v_uint16x16 v_popcount(const v_uint16x16& a)
{
v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a));
p += v_rotate_right<1>(p);
return v_reinterpret_as_u16(p) & v256_setall_u16(0x00ff);
}
inline v_uint32x8 v_popcount(const v_uint32x8& a)
{
v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a));
p += v_rotate_right<1>(p);
p += v_rotate_right<2>(p);
return v_reinterpret_as_u32(p) & v256_setall_u32(0x000000ff);
}
inline v_uint64x4 v_popcount(const v_uint64x4& a)
{
return v_uint64x4(_mm256_sad_epu8(v_popcount(v_reinterpret_as_u8(a)).val, _mm256_setzero_si256()));
}
inline v_uint8x32 v_popcount(const v_int8x32& a)
{ return v_popcount(v_reinterpret_as_u8(a)); }
inline v_uint16x16 v_popcount(const v_int16x16& a)
{ return v_popcount(v_reinterpret_as_u16(a)); }
inline v_uint32x8 v_popcount(const v_int32x8& a)
{ return v_popcount(v_reinterpret_as_u32(a)); }
inline v_uint64x4 v_popcount(const v_int64x4& a)
{ return v_popcount(v_reinterpret_as_u64(a)); }
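For reference, the shuffle-based byte popcount above is the classic nibble lookup: each byte is split into its low and high 4-bit halves and both are looked up in a 16-entry table, which _mm256_shuffle_epi8 does for all 32 bytes at once. A scalar sketch (illustrative only, not part of this patch):

#include <cstdint>

static inline uint8_t popcount8_nibble_lut(uint8_t x)
{
    static const uint8_t lut[16] = { 0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4 };
    return lut[x & 0x0F] + lut[x >> 4];   // low nibble + high nibble
}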
/** Mask **/
inline int v_signmask(const v_int8x32& a)
...
@@ -603,27 +603,20 @@ static const unsigned char popCountTable[] =
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
};
/** @brief Count the 1 bits in the vector and return 4 values
Scheme:
@code
{A1 A2 A3 ...} => popcount(A1)
@endcode
Any types but result will be in v_uint32x4*/
template<typename _Tp, int n> inline v_uint32x4 v_popcount(const v_reg<_Tp, n>& a)
{
v_uint8x16 b;
b = v_reinterpret_as_u8(a);
for( int i = 0; i < v_uint8x16::nlanes; i++ )
{
b.s[i] = popCountTable[b.s[i]];
}
v_uint32x4 c;
for( int i = 0; i < v_uint32x4::nlanes; i++ )
{
c.s[i] = b.s[i*4] + b.s[i*4+1] + b.s[i*4+2] + b.s[i*4+3];
}
return c;
}
/** @brief Count the 1 bits in the vector lanes and return result as corresponding unsigned type
Scheme:
@code
{A1 A2 A3 ...} => {popcount(A1), popcount(A2), popcount(A3), ...}
@endcode
For all integer types. */
template<typename _Tp, int n>
inline v_reg<typename V_TypeTraits<_Tp>::abs_type, n> v_popcount(const v_reg<_Tp, n>& a)
{
v_reg<typename V_TypeTraits<_Tp>::abs_type, n> b = v_reg<typename V_TypeTraits<_Tp>::abs_type, n>::zero();
for( int i = 0; i < n*sizeof(_Tp); i++ )
b.s[i/sizeof(_Tp)] += popCountTable[v_reinterpret_as_u8(a).s[i]];
return b;
}
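A minimal usage sketch of the reworked interface described in the comment above (assumes OpenCV's universal intrinsics header and a CV_SIMD128-capable build; buffer values are made up for illustration):

#include <opencv2/core/hal/intrin.hpp>

static inline unsigned popcount_demo()
{
    unsigned char buf[16] = { 0xFF, 0x0F, 0x01 };           // remaining lanes stay zero
    cv::v_uint8x16 cnt = cv::v_popcount(cv::v_load(buf));   // per-lane counts: 8, 4, 1, 0, ...
    return cv::v_reduce_sum(cnt);                           // sums the lanes -> 13
}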
...
@@ -910,6 +910,31 @@ OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
#endif
inline unsigned v_reduce_sum(const v_uint8x16& a)
{
uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(a.val));
uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
return vget_lane_u32(vpadd_u32(t1, t1), 0);
}
inline int v_reduce_sum(const v_int8x16& a)
{
int32x4_t t0 = vpaddlq_s16(vpaddlq_s8(a.val));
int32x2_t t1 = vpadd_s32(vget_low_s32(t0), vget_high_s32(t0));
return vget_lane_s32(vpadd_s32(t1, t1), 0);
}
inline unsigned v_reduce_sum(const v_uint16x8& a)
{
uint32x4_t t0 = vpaddlq_u16(a.val);
uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
return vget_lane_u32(vpadd_u32(t1, t1), 0);
}
inline int v_reduce_sum(const v_int16x8& a)
{
int32x4_t t0 = vpaddlq_s16(a.val);
int32x2_t t1 = vpadd_s32(vget_low_s32(t0), vget_high_s32(t0));
return vget_lane_s32(vpadd_s32(t1, t1), 0);
}
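The NEON reductions above rely on vpaddl, which adds neighbouring lanes into lanes of twice the width; two such steps shrink sixteen u8 lanes to four u32 partial sums before the final pairwise adds. A scalar model of that chain (illustrative only, not part of this patch):

#include <cstdint>

static inline unsigned reduce_sum_u8_pairwise(const uint8_t v[16])
{
    uint16_t w16[8];                    // vpaddlq_u8 : 16 x u8 -> 8 x u16
    for (int i = 0; i < 8; i++) w16[i] = (uint16_t)(v[2*i] + v[2*i+1]);
    uint32_t w32[4];                    // vpaddlq_u16: 8 x u16 -> 4 x u32
    for (int i = 0; i < 4; i++) w32[i] = (uint32_t)w16[2*i] + w16[2*i+1];
    return (w32[0] + w32[1]) + (w32[2] + w32[3]);   // vpadd_u32, twice
}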
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
@@ -918,12 +943,10 @@ inline scalartype v_reduce_##func(const _Tpvec& a) \
return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \
}
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, sum, add, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, max, max, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, min, min, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, sum, add, s16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, max, max, s16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, min, min, s16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned int, max, max, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned int, min, min, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, int, max, max, s16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, int, min, min, s16)
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
@@ -942,6 +965,10 @@ OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, sum, add, f32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, max, max, f32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, min, min, f32)
inline uint64 v_reduce_sum(const v_uint64x2& a)
{ return vget_lane_u64(vadd_u64(vget_low_u64(a.val), vget_high_u64(a.val)),0); }
inline int64 v_reduce_sum(const v_int64x2& a)
{ return vget_lane_s64(vadd_s64(vget_low_s64(a.val), vget_high_s64(a.val)),0); }
#if CV_SIMD128_64F
inline double v_reduce_sum(const v_float64x2& a)
{
@@ -1007,21 +1034,22 @@ inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
return vget_lane_f32(vpadd_f32(t1, t1), 0);
}
#define OPENCV_HAL_IMPL_NEON_POPCOUNT(_Tpvec, cast) \
inline v_uint32x4 v_popcount(const _Tpvec& a) \
{ \
uint8x16_t t = vcntq_u8(cast(a.val)); \
uint16x8_t t0 = vpaddlq_u8(t);  /* 16 -> 8 */ \
uint32x4_t t1 = vpaddlq_u16(t0); /* 8 -> 4 */ \
return v_uint32x4(t1); \
}
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint8x16, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint16x8, vreinterpretq_u8_u16)
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint32x4, vreinterpretq_u8_u32)
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int8x16, vreinterpretq_u8_s8)
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int16x8, vreinterpretq_u8_s16)
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int32x4, vreinterpretq_u8_s32)
inline v_uint8x16 v_popcount(const v_uint8x16& a)
{ return v_uint8x16(vcntq_u8(a.val)); }
inline v_uint8x16 v_popcount(const v_int8x16& a)
{ return v_uint8x16(vcntq_u8(vreinterpretq_u8_s8(a.val))); }
inline v_uint16x8 v_popcount(const v_uint16x8& a)
{ return v_uint16x8(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u16(a.val)))); }
inline v_uint16x8 v_popcount(const v_int16x8& a)
{ return v_uint16x8(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s16(a.val)))); }
inline v_uint32x4 v_popcount(const v_uint32x4& a)
{ return v_uint32x4(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u32(a.val))))); }
inline v_uint32x4 v_popcount(const v_int32x4& a)
{ return v_uint32x4(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s32(a.val))))); }
inline v_uint64x2 v_popcount(const v_uint64x2& a)
{ return v_uint64x2(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(a.val)))))); }
inline v_uint64x2 v_popcount(const v_int64x2& a)
{ return v_uint64x2(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s64(a.val)))))); }
inline int v_signmask(const v_uint8x16& a)
{
...
@@ -302,8 +302,8 @@ inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps
template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
{ return _Tpvec(cast(a.val)); }
OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16, uchar, u8, si128, epi8, char, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_int8x16, schar, s8, si128, epi8, char, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16, uchar, u8, si128, epi8, schar, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_int8x16, schar, s8, si128, epi8, schar, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_uint16x8, ushort, u16, si128, epi16, short, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_int16x8, short, s16, si128, epi16, short, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_uint32x4, unsigned, u32, si128, epi32, int, OPENCV_HAL_NOP)
@@ -1393,6 +1393,41 @@ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps)
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd)
inline unsigned v_reduce_sum(const v_uint8x16& a)
{
__m128i half = _mm_sad_epu8(a.val, _mm_setzero_si128());
return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half)));
}
inline int v_reduce_sum(const v_int8x16& a)
{
__m128i half = _mm_set1_epi8((schar)-128);
half = _mm_sad_epu8(_mm_xor_si128(a.val, half), _mm_setzero_si128());
return _mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half))) - 2048;
}
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(func) \
inline schar v_reduce_##func(const v_int8x16& a) \
{ \
__m128i val = a.val; \
__m128i smask = _mm_set1_epi8((schar)-128); \
val = _mm_xor_si128(val, smask); \
val = _mm_##func##_epu8(val, _mm_srli_si128(val,8)); \
val = _mm_##func##_epu8(val, _mm_srli_si128(val,4)); \
val = _mm_##func##_epu8(val, _mm_srli_si128(val,2)); \
val = _mm_##func##_epu8(val, _mm_srli_si128(val,1)); \
return (schar)_mm_cvtsi128_si32(val) ^ (schar)-128; \
} \
inline uchar v_reduce_##func(const v_uint8x16& a) \
{ \
__m128i val = a.val; \
val = _mm_##func##_epu8(val, _mm_srli_si128(val,8)); \
val = _mm_##func##_epu8(val, _mm_srli_si128(val,4)); \
val = _mm_##func##_epu8(val, _mm_srli_si128(val,2)); \
val = _mm_##func##_epu8(val, _mm_srli_si128(val,1)); \
return (uchar)_mm_cvtsi128_si32(val); \
}
OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(max)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(min)
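SSE2 has no 8-bit signed min/max, so the macro above flips the sign bit, uses the unsigned instruction, and flips back; XOR with 0x80 is an order-preserving map between int8 and uint8. A scalar sketch of the signed-min case (illustrative only, not part of this patch):

#include <cstdint>

static inline int8_t min_s8_via_u8(int8_t x, int8_t y)
{
    uint8_t ux = (uint8_t)(x ^ 0x80);             // map [-128,127] onto [0,255], order kept
    uint8_t uy = (uint8_t)(y ^ 0x80);
    return (int8_t)((ux < uy ? ux : uy) ^ 0x80);  // flip the winner back
}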
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(_Tpvec, scalartype, func, suffix, sbit) \
inline scalartype v_reduce_##func(const v_##_Tpvec& a) \
{ \
@@ -1412,26 +1447,8 @@ inline unsigned scalartype v_reduce_##func(const v_u##_Tpvec& a) \
val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
return (unsigned scalartype)(_mm_cvtsi128_si32(val) ^ sbit); \
}
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM(_Tpvec, scalartype, suffix) \
inline scalartype v_reduce_sum(const v_##_Tpvec& a) \
{ \
__m128i val = a.val; \
val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 8)); \
val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 4)); \
val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 2)); \
return (scalartype)_mm_cvtsi128_si32(val); \
} \
inline unsigned scalartype v_reduce_sum(const v_u##_Tpvec& a) \
{ \
__m128i val = a.val; \
val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 8)); \
val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 4)); \
val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 2)); \
return (unsigned scalartype)_mm_cvtsi128_si32(val); \
}
OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, max, epi16, (short)-32768)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, min, epi16, (short)-32768)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM(int16x8, short, 16)
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(_Tpvec, scalartype, regtype, suffix, cast_from, cast_to, extract) \
inline scalartype v_reduce_sum(const _Tpvec& a) \
@@ -1456,6 +1473,23 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_uint32x4, unsigned, __m128i, epi32, OPENCV
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32)
inline int v_reduce_sum(const v_int16x8& a)
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
inline unsigned v_reduce_sum(const v_uint16x8& a)
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
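The expand-based sums above avoid the saturating 16-bit adds of the removed OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM macro: the halves are widened to 32 bits first, so the result cannot clip. A scalar view (illustrative only, not part of this patch):

#include <cstdint>

static inline int reduce_sum_s16_sketch(const int16_t v[8])
{
    int sum = 0;                 // accumulate in 32 bits, as v_expand_low/high + int32 adds do
    for (int i = 0; i < 8; i++)
        sum += v[i];
    return sum;                  // e.g. eight lanes of 30000 -> 240000, no clipping at 32767
}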
inline uint64 v_reduce_sum(const v_uint64x2& a)
{
uint64 CV_DECL_ALIGNED(32) idx[2];
v_store_aligned(idx, a);
return idx[0] + idx[1];
}
inline int64 v_reduce_sum(const v_int64x2& a)
{
int64 CV_DECL_ALIGNED(32) idx[2];
v_store_aligned(idx, a);
return idx[0] + idx[1];
}
inline double v_reduce_sum(const v_float64x2& a)
{
double CV_DECL_ALIGNED(32) idx[2];
@@ -1520,27 +1554,42 @@ inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
return v_reduce_sum(v_absdiff(a, b));
}
#define OPENCV_HAL_IMPL_SSE_POPCOUNT(_Tpvec) \
inline v_uint32x4 v_popcount(const _Tpvec& a) \
{ \
__m128i m1 = _mm_set1_epi32(0x55555555); \
__m128i m2 = _mm_set1_epi32(0x33333333); \
__m128i m4 = _mm_set1_epi32(0x0f0f0f0f); \
__m128i p = a.val; \
p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 1), m1), _mm_and_si128(p, m1)); \
p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 2), m2), _mm_and_si128(p, m2)); \
p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 4), m4), _mm_and_si128(p, m4)); \
p = _mm_adds_epi8(p, _mm_srli_si128(p, 1)); \
p = _mm_adds_epi8(p, _mm_srli_si128(p, 2)); \
return v_uint32x4(_mm_and_si128(p, _mm_set1_epi32(0x000000ff))); \
}
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint8x16)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint16x8)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint32x4)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int8x16)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int16x8)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int32x4)
inline v_uint8x16 v_popcount(const v_uint8x16& a)
{
__m128i m1 = _mm_set1_epi32(0x55555555);
__m128i m2 = _mm_set1_epi32(0x33333333);
__m128i m4 = _mm_set1_epi32(0x0f0f0f0f);
__m128i p = a.val;
p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 1), m1), _mm_and_si128(p, m1));
p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 2), m2), _mm_and_si128(p, m2));
p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 4), m4), _mm_and_si128(p, m4));
return v_uint8x16(p);
}
inline v_uint16x8 v_popcount(const v_uint16x8& a)
{
v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
p += v_rotate_right<1>(p);
return v_reinterpret_as_u16(p) & v_setall_u16(0x00ff);
}
inline v_uint32x4 v_popcount(const v_uint32x4& a)
{
v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
p += v_rotate_right<1>(p);
p += v_rotate_right<2>(p);
return v_reinterpret_as_u32(p) & v_setall_u32(0x000000ff);
}
inline v_uint64x2 v_popcount(const v_uint64x2& a)
{
return v_uint64x2(_mm_sad_epu8(v_popcount(v_reinterpret_as_u8(a)).val, _mm_setzero_si128()));
}
inline v_uint8x16 v_popcount(const v_int8x16& a)
{ return v_popcount(v_reinterpret_as_u8(a)); }
inline v_uint16x8 v_popcount(const v_int16x8& a)
{ return v_popcount(v_reinterpret_as_u16(a)); }
inline v_uint32x4 v_popcount(const v_int32x4& a)
{ return v_popcount(v_reinterpret_as_u32(a)); }
inline v_uint64x2 v_popcount(const v_int64x2& a)
{ return v_popcount(v_reinterpret_as_u64(a)); }
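The 64-bit overload above leans on _mm_sad_epu8: the absolute difference of a byte against zero is the byte itself, so the SAD sums the per-byte counts of each 8-byte half into one 64-bit lane. In scalar terms (illustrative only, not part of this patch):

#include <cstdint>

static inline uint64_t popcount64_sketch(uint64_t x)
{
    static const uint8_t lut[16] = { 0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4 };
    uint64_t total = 0;
    for (int i = 0; i < 8; i++)                 // per-byte counts, as v_popcount(u8) gives
    {
        uint8_t b = (uint8_t)(x >> (8 * i));
        total += lut[b & 0x0F] + lut[b >> 4];   // summed across the 8 bytes, as the SAD does
    }
    return total;
}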
#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, pack_op, and_op, signmask, allmask) \
inline int v_signmask(const _Tpvec& a) \
...
@@ -692,15 +692,27 @@ inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
////////// Reduce and mask /////////
/** Reduce **/
inline short v_reduce_sum(const v_int16x8& a)
{
const vec_int4 zero = vec_int4_z;
return saturate_cast<short>(vec_extract(vec_sums(vec_sum4s(a.val, zero), zero), 3));
}
inline ushort v_reduce_sum(const v_uint16x8& a)
{
const vec_int4 v4 = vec_int4_c(vec_unpackhu(vec_adds(a.val, vec_sld(a.val, a.val, 8))));
return saturate_cast<ushort>(vec_extract(vec_sums(v4, vec_int4_z), 3));
}
inline uint v_reduce_sum(const v_uint8x16& a)
{
const vec_uint4 zero4 = vec_uint4_z;
vec_uint4 sum4 = vec_sum4s(a.val, zero4);
return (uint)vec_extract(vec_sums(vec_int4_c(sum4), vec_int4_c(zero4)), 3);
}
inline int v_reduce_sum(const v_int8x16& a)
{
const vec_int4 zero4 = vec_int4_z;
vec_int4 sum4 = vec_sum4s(a.val, zero4);
return (int)vec_extract(vec_sums(sum4, zero4), 3);
}
inline int v_reduce_sum(const v_int16x8& a)
{
const vec_int4 zero = vec_int4_z;
return saturate_cast<int>(vec_extract(vec_sums(vec_sum4s(a.val, zero), zero), 3));
}
inline uint v_reduce_sum(const v_uint16x8& a)
{
const vec_int4 v4 = vec_int4_c(vec_unpackhu(vec_adds(a.val, vec_sld(a.val, a.val, 8))));
return saturate_cast<uint>(vec_extract(vec_sums(v4, vec_int4_z), 3));
}
#define OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(_Tpvec, _Tpvec2, scalartype, suffix, func) \
@@ -719,6 +731,14 @@ OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, sum, vec_add)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, max, vec_max)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, min, vec_min)
inline uint64 v_reduce_sum(const v_uint64x2& a)
{
return vec_extract(vec_add(a.val, vec_permi(a.val, a.val, 3)), 0);
}
inline int64 v_reduce_sum(const v_int64x2& a)
{
return vec_extract(vec_add(a.val, vec_permi(a.val, a.val, 3)), 0);
}
inline double v_reduce_sum(const v_float64x2& a)
{
return vec_extract(vec_add(a.val, vec_permi(a.val, a.val, 3)), 0);
@@ -736,6 +756,19 @@ OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_uint16x8, vec_ushort8, ushort, min, vec_min)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int16x8, vec_short8, short, max, vec_max)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int16x8, vec_short8, short, min, vec_min)
#define OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(_Tpvec, _Tpvec2, scalartype, suffix, func) \
inline scalartype v_reduce_##suffix(const _Tpvec& a) \
{ \
_Tpvec2 rs = func(a.val, vec_sld(a.val, a.val, 8)); \
rs = func(rs, vec_sld(rs, rs, 4)); \
rs = func(rs, vec_sld(rs, rs, 2)); \
return vec_extract(func(rs, vec_sld(rs, rs, 1)), 0); \
}
OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_uint8x16, vec_uchar16, uchar, max, vec_max)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_uint8x16, vec_uchar16, uchar, min, vec_min)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_int8x16, vec_char16, schar, max, vec_max)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_int8x16, vec_char16, schar, min, vec_min)
inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
const v_float32x4& c, const v_float32x4& d)
{
@@ -792,9 +825,22 @@ inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
}
/** Popcount **/
template<typename _Tpvec>
inline v_uint32x4 v_popcount(const _Tpvec& a)
{ return v_uint32x4(vec_popcntu(vec_uint4_c(a.val))); }
inline v_uint8x16 v_popcount(const v_uint8x16& a)
{ return v_uint8x16(vec_popcntu(a.val)); }
inline v_uint8x16 v_popcount(const v_int8x16& a)
{ return v_uint8x16(vec_popcntu(a.val)); }
inline v_uint16x8 v_popcount(const v_uint16x8& a)
{ return v_uint16x8(vec_popcntu(a.val)); }
inline v_uint16x8 v_popcount(const v_int16x8& a)
{ return v_uint16x8(vec_popcntu(a.val)); }
inline v_uint32x4 v_popcount(const v_uint32x4& a)
{ return v_uint32x4(vec_popcntu(a.val)); }
inline v_uint32x4 v_popcount(const v_int32x4& a)
{ return v_uint32x4(vec_popcntu(a.val)); }
inline v_uint64x2 v_popcount(const v_uint64x2& a)
{ return v_uint64x2(vec_popcntu(a.val)); }
inline v_uint64x2 v_popcount(const v_int64x2& a)
{ return v_uint64x2(vec_popcntu(a.val)); }
/** Mask **/
inline int v_signmask(const v_uint8x16& a)
...
@@ -32,28 +32,15 @@ int normHamming(const uchar* a, int n)
int i = 0;
int result = 0;
#if CV_AVX2
{
__m256i _r0 = _mm256_setzero_si256();
__m256i _0 = _mm256_setzero_si256();
__m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
__m256i _popcnt_mask = _mm256_set1_epi8(0x0F);
for(; i <= n - 32; i+= 32)
{
__m256i _a0 = _mm256_loadu_si256((const __m256i*)(a + i));
__m256i _popc0 = _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_a0, _popcnt_mask));
__m256i _popc1 = _mm256_shuffle_epi8(_popcnt_table,
_mm256_and_si256(_mm256_srli_epi16(_a0, 4), _popcnt_mask));
_r0 = _mm256_add_epi32(_r0, _mm256_sad_epu8(_0, _mm256_add_epi8(_popc0, _popc1)));
}
_r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2));
result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0);
}
#endif // CV_AVX2
#if CV_SIMD && CV_SIMD_WIDTH > 16
{
v_uint64 t = vx_setzero_u64();
for (; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
t += v_popcount(v_reinterpret_as_u64(vx_load(a + i)));
result = (int)v_reduce_sum(t);
}
#endif
#if CV_POPCNT
{
@@ -68,18 +55,14 @@ int normHamming(const uchar* a, int n)
result += CV_POPCNT_U32(*(uint*)(a + i));
}
}
#endif // CV_POPCNT
#if CV_SIMD128
{
v_uint32x4 t = v_setzero_u32();
for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
{
t += v_popcount(v_load(a + i));
}
result += v_reduce_sum(t);
}
#endif // CV_SIMD128
#elif CV_SIMD
{
v_uint64x2 t = v_setzero_u64();
for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
t += v_popcount(v_reinterpret_as_u64(v_load(a + i)));
result += (int)v_reduce_sum(t);
}
#endif
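The rewritten blocks express the Hamming norm entirely through universal intrinsics: per-byte popcounts are reinterpreted as u64 lanes, accumulated across the loop, and reduced once at the end. A portable scalar equivalent of what the vector loop computes (illustrative only, not the tail handling used by this function):

#include <cstdint>

static inline int norm_hamming_sketch(const unsigned char* a, int n)
{
    static const uint8_t lut[16] = { 0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4 };
    int result = 0;
    for (int i = 0; i < n; i++)
        result += lut[a[i] & 0x0F] + lut[a[i] >> 4];   // bits set in each byte
    return result;
}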
#if CV_ENABLE_UNROLLED
for(; i <= n - 4; i += 4)
{
@@ -100,31 +83,15 @@ int normHamming(const uchar* a, const uchar* b, int n)
int i = 0;
int result = 0;
#if CV_AVX2
{
__m256i _r0 = _mm256_setzero_si256();
__m256i _0 = _mm256_setzero_si256();
__m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
__m256i _popcnt_mask = _mm256_set1_epi8(0x0F);
for(; i <= n - 32; i+= 32)
{
__m256i _a0 = _mm256_loadu_si256((const __m256i*)(a + i));
__m256i _b0 = _mm256_loadu_si256((const __m256i*)(b + i));
__m256i _xor = _mm256_xor_si256(_a0, _b0);
__m256i _popc0 = _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_xor, _popcnt_mask));
__m256i _popc1 = _mm256_shuffle_epi8(_popcnt_table,
_mm256_and_si256(_mm256_srli_epi16(_xor, 4), _popcnt_mask));
_r0 = _mm256_add_epi32(_r0, _mm256_sad_epu8(_0, _mm256_add_epi8(_popc0, _popc1)));
}
_r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2));
result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0);
}
#endif // CV_AVX2
#if CV_SIMD && CV_SIMD_WIDTH > 16
{
v_uint64 t = vx_setzero_u64();
for (; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
t += v_popcount(v_reinterpret_as_u64(vx_load(a + i) ^ vx_load(b + i)));
result += (int)v_reduce_sum(t);
}
#endif
#if CV_POPCNT
{
@@ -139,18 +106,14 @@ int normHamming(const uchar* a, const uchar* b, int n)
result += CV_POPCNT_U32(*(uint*)(a + i) ^ *(uint*)(b + i));
}
}
#endif // CV_POPCNT
#if CV_SIMD128
{
v_uint32x4 t = v_setzero_u32();
for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
{
t += v_popcount(v_load(a + i) ^ v_load(b + i));
}
result += v_reduce_sum(t);
}
#endif // CV_SIMD128
#elif CV_SIMD
{
v_uint64x2 t = v_setzero_u64();
for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
t += v_popcount(v_reinterpret_as_u64(vx_load(a + i) ^ vx_load(b + i)));
result += (int)v_reduce_sum(t);
}
#endif
#if CV_ENABLE_UNROLLED
for(; i <= n - 4; i += 4)
{
...
@@ -686,18 +686,24 @@ template<typename R> struct TheTest
TheTest & test_popcount()
{
static unsigned popcountTable[] = {
0, 1, 2, 4, 5, 7, 9, 12, 13, 15, 17, 20, 22, 25, 28, 32, 33,
35, 37, 40, 42, 45, 48, 52, 54, 57, 60, 64, 67, 71, 75, 80, 81,
83, 85, 88, 90, 93, 96, 100, 102, 105, 108, 112, 115, 119, 123,
128, 130, 133, 136, 140, 143, 147, 151, 156, 159, 163, 167, 172,
176, 181, 186, 192, 193
};
Data<R> dataA;
R a = dataA;
unsigned resB = (unsigned)v_reduce_sum(v_popcount(a));
EXPECT_EQ(popcountTable[R::nlanes], resB);
typedef typename V_RegTraits<R>::u_reg Ru;
static unsigned popcountTable[] = {
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, //0x00-0x0f
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, //0x10-0x1f
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, //0x20-0x2f
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, //0x30-0x3f
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, //0x40-0x4f
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, //0x50-0x5f
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, //0x60-0x6f
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, //0x70-0x7f
1 //0x80
};
Data<R> dataA;
R a = dataA;
Data<Ru> resB = v_popcount(a);
for (int i = 0; i < Ru::nlanes; ++i)
EXPECT_EQ(popcountTable[i + 1], resB[i]);
return *this;
}
...