Commit 84fd8190 authored by Alexander Alekhin

Merge pull request #14232 from terfendail:popcount_rework

parents 88996323 7a55f2af
@@ -1015,6 +1015,34 @@ OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float64x4, _mm256_castsi256_pd
////////// Reduce and mask /////////
/** Reduce **/
inline unsigned v_reduce_sum(const v_uint8x32& a)
{
__m256i half = _mm256_sad_epu8(a.val, _mm256_setzero_si256());
__m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter)));
}
inline int v_reduce_sum(const v_int8x32& a)
{
__m256i half = _mm256_sad_epu8(_mm256_xor_si256(a.val, _mm256_set1_epi8((schar)-128)), _mm256_setzero_si256());
__m128i quarter = _mm_add_epi32(_v256_extract_low(half), _v256_extract_high(half));
return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(quarter, _mm_unpackhi_epi64(quarter, quarter))) - 4096;
}
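
The trick here: _mm256_sad_epu8 against a zero vector sums 32 unsigned bytes into four 64-bit lanes, which the two _mm_add_epi32 folds then combine. For the signed variant, XOR-ing every byte with 0x80 maps the range [-128, 127] onto [0, 255], adding a bias of 128 per lane, so the final subtraction removes 32 * 128 = 4096 (the 16-lane SSE version further down subtracts 2048 for the same reason). A minimal scalar sketch of the same idea (the helper name sum_s8_biased is hypothetical, not part of the patch):

#include <cstdint>

// Sum n signed bytes through an unsigned accumulator, as the SAD path does.
int sum_s8_biased(const int8_t* v, int n)
{
    unsigned biased = 0;
    for (int i = 0; i < n; i++)
        biased += (uint8_t)(v[i] ^ 0x80); // map [-128,127] -> [0,255]
    return (int)biased - 128 * n;         // for the 32-lane AVX2 case, 128*n == 4096
}
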
#define OPENCV_HAL_IMPL_AVX_REDUCE_32(_Tpvec, sctype, func, intrin) \
inline sctype v_reduce_##func(const _Tpvec& a) \
{ \
__m128i val = intrin(_v256_extract_low(a.val), _v256_extract_high(a.val)); \
val = intrin(val, _mm_srli_si128(val,8)); \
val = intrin(val, _mm_srli_si128(val,4)); \
val = intrin(val, _mm_srli_si128(val,2)); \
val = intrin(val, _mm_srli_si128(val,1)); \
return (sctype)_mm_cvtsi128_si32(val); \
}
OPENCV_HAL_IMPL_AVX_REDUCE_32(v_uint8x32, uchar, min, _mm_min_epu8)
OPENCV_HAL_IMPL_AVX_REDUCE_32(v_int8x32, schar, min, _mm_min_epi8)
OPENCV_HAL_IMPL_AVX_REDUCE_32(v_uint8x32, uchar, max, _mm_max_epu8)
OPENCV_HAL_IMPL_AVX_REDUCE_32(v_int8x32, schar, max, _mm_max_epi8)
#define OPENCV_HAL_IMPL_AVX_REDUCE_16(_Tpvec, sctype, func, intrin) \
inline sctype v_reduce_##func(const _Tpvec& a) \
{ \
@@ -1062,31 +1090,6 @@ OPENCV_HAL_IMPL_AVX_REDUCE_8(v_int32x8, int, max, _mm_max_epi32)
OPENCV_HAL_IMPL_AVX_REDUCE_FLT(min, _mm_min_ps)
OPENCV_HAL_IMPL_AVX_REDUCE_FLT(max, _mm_max_ps)
inline ushort v_reduce_sum(const v_uint16x16& a)
{
__m128i a0 = _v256_extract_low(a.val);
__m128i a1 = _v256_extract_high(a.val);
__m128i s0 = _mm_adds_epu16(a0, a1);
s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 2));
return (ushort)_mm_cvtsi128_si32(s0);
}
inline short v_reduce_sum(const v_int16x16& a)
{
__m256i s0 = _mm256_hadds_epi16(a.val, a.val);
s0 = _mm256_hadds_epi16(s0, s0);
s0 = _mm256_hadds_epi16(s0, s0);
__m128i s1 = _v256_extract_high(s0);
s1 = _mm_adds_epi16(_v256_extract_low(s0), s1);
return (short)_mm_cvtsi128_si32(s1);
}
inline int v_reduce_sum(const v_int32x8& a)
{
__m256i s0 = _mm256_hadd_epi32(a.val, a.val);
@@ -1101,6 +1104,11 @@ inline int v_reduce_sum(const v_int32x8& a)
inline unsigned v_reduce_sum(const v_uint32x8& a)
{ return v_reduce_sum(v_reinterpret_as_s32(a)); }
inline int v_reduce_sum(const v_int16x16& a)
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
inline unsigned v_reduce_sum(const v_uint16x16& a)
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
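
These replace the saturating reductions removed above: _mm_adds_epu16 and _mm256_hadds_epi16 clip any partial sum outside the 16-bit range, so the old versions could only return a saturated short/ushort. Widening each half with v_expand_low/v_expand_high first keeps an exact 32-bit sum. A scalar sketch of the new behaviour (hypothetical helper name, for illustration):

#include <cstdint>

// Widen before accumulating: 8 lanes of 60000 sum to 480000 exactly,
// where a saturating 16-bit fold would have clipped to 65535.
unsigned reduce_sum_u16_widening(const uint16_t* v, int n)
{
    unsigned s = 0;             // 32-bit accumulator, as v_expand_* provides
    for (int i = 0; i < n; i++)
        s += v[i];
    return s;
}
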
inline float v_reduce_sum(const v_float32x8& a)
{
__m256 s0 = _mm256_hadd_ps(a.val, a.val);
@@ -1112,6 +1120,18 @@ inline float v_reduce_sum(const v_float32x8& a)
return _mm_cvtss_f32(s1);
}
inline uint64 v_reduce_sum(const v_uint64x4& a)
{
uint64 CV_DECL_ALIGNED(32) idx[2];
_mm_store_si128((__m128i*)idx, _mm_add_epi64(_v256_extract_low(a.val), _v256_extract_high(a.val)));
return idx[0] + idx[1];
}
inline int64 v_reduce_sum(const v_int64x4& a)
{
int64 CV_DECL_ALIGNED(32) idx[2];
_mm_store_si128((__m128i*)idx, _mm_add_epi64(_v256_extract_low(a.val), _v256_extract_high(a.val)));
return idx[0] + idx[1];
}
inline double v_reduce_sum(const v_float64x4& a)
{
__m256d s0 = _mm256_hadd_pd(a.val, a.val);
@@ -1166,26 +1186,39 @@ inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
}
/** Popcount **/
#define OPENCV_HAL_IMPL_AVX_POPCOUNT(_Tpvec) \
inline v_uint32x8 v_popcount(const _Tpvec& a) \
{ \
const v_uint32x8 m1 = v256_setall_u32(0x55555555); \
const v_uint32x8 m2 = v256_setall_u32(0x33333333); \
const v_uint32x8 m4 = v256_setall_u32(0x0f0f0f0f); \
v_uint32x8 p = v_reinterpret_as_u32(a); \
p = ((p >> 1) & m1) + (p & m1); \
p = ((p >> 2) & m2) + (p & m2); \
p = ((p >> 4) & m4) + (p & m4); \
p.val = _mm256_sad_epu8(p.val, _mm256_setzero_si256()); \
return p; \
}
OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint8x32)
OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int8x32)
OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint16x16)
OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int16x16)
OPENCV_HAL_IMPL_AVX_POPCOUNT(v_uint32x8)
OPENCV_HAL_IMPL_AVX_POPCOUNT(v_int32x8)
inline v_uint8x32 v_popcount(const v_uint8x32& a)
{
__m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
__m256i _popcnt_mask = _mm256_set1_epi8(0x0F);
return v_uint8x32(_mm256_add_epi8(_mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256( a.val , _popcnt_mask)),
_mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_mm256_srli_epi16(a.val, 4), _popcnt_mask))));
}
inline v_uint16x16 v_popcount(const v_uint16x16& a)
{
v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a));
p += v_rotate_right<1>(p);
return v_reinterpret_as_u16(p) & v256_setall_u16(0x00ff);
}
inline v_uint32x8 v_popcount(const v_uint32x8& a)
{
v_uint8x32 p = v_popcount(v_reinterpret_as_u8(a));
p += v_rotate_right<1>(p);
p += v_rotate_right<2>(p);
return v_reinterpret_as_u32(p) & v256_setall_u32(0x000000ff);
}
inline v_uint64x4 v_popcount(const v_uint64x4& a)
{
return v_uint64x4(_mm256_sad_epu8(v_popcount(v_reinterpret_as_u8(a)).val, _mm256_setzero_si256()));
}
inline v_uint8x32 v_popcount(const v_int8x32& a)
{ return v_popcount(v_reinterpret_as_u8(a)); }
inline v_uint16x16 v_popcount(const v_int16x16& a)
{ return v_popcount(v_reinterpret_as_u16(a)); }
inline v_uint32x8 v_popcount(const v_int32x8& a)
{ return v_popcount(v_reinterpret_as_u32(a)); }
inline v_uint64x4 v_popcount(const v_int64x4& a)
{ return v_popcount(v_reinterpret_as_u64(a)); }
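
The byte-lane popcount above is the classic PSHUFB nibble lookup: _mm256_shuffle_epi8 indexes a 16-entry table with each low nibble, the input shifted right by 4 supplies the high nibbles, and the two lookups are added. Wider lanes are then built by adding rotated copies of the byte counts and masking (16/32-bit), or by _mm256_sad_epu8 against zero (64-bit). A scalar model (hypothetical names, for illustration):

#include <cstdint>

static const uint8_t nibble_popcnt[16] =
    { 0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4 }; // same values as _popcnt_table

uint8_t popcount_u8_model(uint8_t x)
{
    return nibble_popcnt[x & 0x0F] + nibble_popcnt[x >> 4];
}

// 16-bit lanes: add the byte counts of adjacent bytes, exactly what the
// v_rotate_right<1> + mask-with-0x00ff sequence does per lane.
uint16_t popcount_u16_model(uint16_t x)
{
    return popcount_u8_model((uint8_t)x) + popcount_u8_model((uint8_t)(x >> 8));
}
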
/** Mask **/
inline int v_signmask(const v_int8x32& a)
......
@@ -603,27 +603,20 @@ static const unsigned char popCountTable[] =
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
};
/** @brief Count the 1 bits in the vector and return 4 values
/** @brief Count the 1 bits in the vector lanes and return the result as the corresponding unsigned type
Scheme:
@code
{A1 A2 A3 ...} => popcount(A1)
{A1 A2 A3 ...} => {popcount(A1), popcount(A2), popcount(A3), ...}
@endcode
Any types but result will be in v_uint32x4*/
template<typename _Tp, int n> inline v_uint32x4 v_popcount(const v_reg<_Tp, n>& a)
For all integer types. */
template<typename _Tp, int n>
inline v_reg<typename V_TypeTraits<_Tp>::abs_type, n> v_popcount(const v_reg<_Tp, n>& a)
{
v_uint8x16 b;
b = v_reinterpret_as_u8(a);
for( int i = 0; i < v_uint8x16::nlanes; i++ )
{
b.s[i] = popCountTable[b.s[i]];
}
v_uint32x4 c;
for( int i = 0; i < v_uint32x4::nlanes; i++ )
{
c.s[i] = b.s[i*4] + b.s[i*4+1] + b.s[i*4+2] + b.s[i*4+3];
}
return c;
v_reg<typename V_TypeTraits<_Tp>::abs_type, n> b = v_reg<typename V_TypeTraits<_Tp>::abs_type, n>::zero();
for( int i = 0; i < n*sizeof(_Tp); i++ )
b.s[i/sizeof(_Tp)] += popCountTable[v_reinterpret_as_u8(a).s[i]];
return b;
}
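
A hedged usage sketch of the new semantics (assuming the usual opencv2/core/hal/intrin.hpp header; not part of the patch): the result now carries one count per lane, in the unsigned type of the same width, instead of four 32-bit sums.

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

void popcount_demo()
{
    v_int16x8  a = v_setall_s16(-1);   // every lane is 0xFFFF
    v_uint16x8 c = v_popcount(a);      // every lane becomes 16
    // Before this patch the same call returned a v_uint32x4 of partial sums.
}
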
......
@@ -910,6 +910,31 @@ OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
#endif
inline unsigned v_reduce_sum(const v_uint8x16& a)
{
uint32x4_t t0 = vpaddlq_u16(vpaddlq_u8(a.val));
uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
return vget_lane_u32(vpadd_u32(t1, t1), 0);
}
inline int v_reduce_sum(const v_int8x16& a)
{
int32x4_t t0 = vpaddlq_s16(vpaddlq_s8(a.val));
int32x2_t t1 = vpadd_s32(vget_low_s32(t0), vget_high_s32(t0));
return vget_lane_s32(vpadd_s32(t1, t1), 0);
}
inline unsigned v_reduce_sum(const v_uint16x8& a)
{
uint32x4_t t0 = vpaddlq_u16(a.val);
uint32x2_t t1 = vpadd_u32(vget_low_u32(t0), vget_high_u32(t0));
return vget_lane_u32(vpadd_u32(t1, t1), 0);
}
inline int v_reduce_sum(const v_int16x8& a)
{
int32x4_t t0 = vpaddlq_s16(a.val);
int32x2_t t1 = vpadd_s32(vget_low_s32(t0), vget_high_s32(t0));
return vget_lane_s32(vpadd_s32(t1, t1), 0);
}
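
These sums rely on NEON's pairwise add-long: vpaddlq_u8 adds adjacent u8 lanes into u16, vpaddlq_u16 widens to u32, and the final vpadd_u32 folds the remaining 32-bit lanes. A scalar model of one widening step (hypothetical name, for illustration):

#include <cstdint>

// Model of vpaddlq_u8: 16 x u8 in, 8 x u16 out, adjacent pairs widened.
void vpaddlq_u8_model(const uint8_t in[16], uint16_t out[8])
{
    for (int i = 0; i < 8; i++)
        out[i] = (uint16_t)(in[2*i] + in[2*i + 1]);
}
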
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
@@ -918,12 +943,10 @@ inline scalartype v_reduce_##func(const _Tpvec& a) \
return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \
}
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, sum, add, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, max, max, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned short, min, min, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, sum, add, s16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, max, max, s16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, short, min, min, s16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned int, max, max, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_uint16x8, uint16x4, unsigned int, min, min, u16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, int, max, max, s16)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(v_int16x8, int16x4, int, min, min, s16)
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
@@ -942,6 +965,10 @@ OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, sum, add, f32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, max, max, f32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, min, min, f32)
inline uint64 v_reduce_sum(const v_uint64x2& a)
{ return vget_lane_u64(vadd_u64(vget_low_u64(a.val), vget_high_u64(a.val)),0); }
inline int64 v_reduce_sum(const v_int64x2& a)
{ return vget_lane_s64(vadd_s64(vget_low_s64(a.val), vget_high_s64(a.val)),0); }
#if CV_SIMD128_64F
inline double v_reduce_sum(const v_float64x2& a)
{
@@ -1007,21 +1034,22 @@ inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
return vget_lane_f32(vpadd_f32(t1, t1), 0);
}
#define OPENCV_HAL_IMPL_NEON_POPCOUNT(_Tpvec, cast) \
inline v_uint32x4 v_popcount(const _Tpvec& a) \
{ \
uint8x16_t t = vcntq_u8(cast(a.val)); \
uint16x8_t t0 = vpaddlq_u8(t); /* 16 -> 8 */ \
uint32x4_t t1 = vpaddlq_u16(t0); /* 8 -> 4 */ \
return v_uint32x4(t1); \
}
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint8x16, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint16x8, vreinterpretq_u8_u16)
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_uint32x4, vreinterpretq_u8_u32)
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int8x16, vreinterpretq_u8_s8)
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int16x8, vreinterpretq_u8_s16)
OPENCV_HAL_IMPL_NEON_POPCOUNT(v_int32x4, vreinterpretq_u8_s32)
inline v_uint8x16 v_popcount(const v_uint8x16& a)
{ return v_uint8x16(vcntq_u8(a.val)); }
inline v_uint8x16 v_popcount(const v_int8x16& a)
{ return v_uint8x16(vcntq_u8(vreinterpretq_u8_s8(a.val))); }
inline v_uint16x8 v_popcount(const v_uint16x8& a)
{ return v_uint16x8(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u16(a.val)))); }
inline v_uint16x8 v_popcount(const v_int16x8& a)
{ return v_uint16x8(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s16(a.val)))); }
inline v_uint32x4 v_popcount(const v_uint32x4& a)
{ return v_uint32x4(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u32(a.val))))); }
inline v_uint32x4 v_popcount(const v_int32x4& a)
{ return v_uint32x4(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s32(a.val))))); }
inline v_uint64x2 v_popcount(const v_uint64x2& a)
{ return v_uint64x2(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(a.val)))))); }
inline v_uint64x2 v_popcount(const v_int64x2& a)
{ return v_uint64x2(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vcntq_u8(vreinterpretq_u8_s64(a.val)))))); }
inline int v_signmask(const v_uint8x16& a)
{
......
@@ -302,8 +302,8 @@ inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)
template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
{ return _Tpvec(cast(a.val)); }
OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16, uchar, u8, si128, epi8, char, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_int8x16, schar, s8, si128, epi8, char, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16, uchar, u8, si128, epi8, schar, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_int8x16, schar, s8, si128, epi8, schar, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_uint16x8, ushort, u16, si128, epi16, short, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_int16x8, short, s16, si128, epi16, short, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_INITVEC(v_uint32x4, unsigned, u32, si128, epi32, int, OPENCV_HAL_NOP)
@@ -1393,6 +1393,41 @@ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps)
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd)
inline unsigned v_reduce_sum(const v_uint8x16& a)
{
__m128i half = _mm_sad_epu8(a.val, _mm_setzero_si128());
return (unsigned)_mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half)));
}
inline int v_reduce_sum(const v_int8x16& a)
{
__m128i half = _mm_set1_epi8((schar)-128);
half = _mm_sad_epu8(_mm_xor_si128(a.val, half), _mm_setzero_si128());
return _mm_cvtsi128_si32(_mm_add_epi32(half, _mm_unpackhi_epi64(half, half))) - 2048;
}
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(func) \
inline schar v_reduce_##func(const v_int8x16& a) \
{ \
__m128i val = a.val; \
__m128i smask = _mm_set1_epi8((schar)-128); \
val = _mm_xor_si128(val, smask); \
val = _mm_##func##_epu8(val, _mm_srli_si128(val,8)); \
val = _mm_##func##_epu8(val, _mm_srli_si128(val,4)); \
val = _mm_##func##_epu8(val, _mm_srli_si128(val,2)); \
val = _mm_##func##_epu8(val, _mm_srli_si128(val,1)); \
return (schar)_mm_cvtsi128_si32(val) ^ (schar)-128; \
} \
inline uchar v_reduce_##func(const v_uint8x16& a) \
{ \
__m128i val = a.val; \
val = _mm_##func##_epu8(val, _mm_srli_si128(val,8)); \
val = _mm_##func##_epu8(val, _mm_srli_si128(val,4)); \
val = _mm_##func##_epu8(val, _mm_srli_si128(val,2)); \
val = _mm_##func##_epu8(val, _mm_srli_si128(val,1)); \
return (uchar)_mm_cvtsi128_si32(val); \
}
OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(max)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(min)
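
SSE2 provides only the unsigned byte min/max (_mm_min_epu8/_mm_max_epu8); the signed epi8 forms arrived with SSE4.1. The macro therefore XORs with 0x80 to map signed bytes onto unsigned order, reduces with the epu8 op, and XORs the result back. Scalar model of the bias (hypothetical name, for illustration):

#include <cstdint>

// Signed min through unsigned min: x -> x ^ 0x80 is an order-preserving
// map from int8_t order to uint8_t order.
int8_t min_s8_model(int8_t a, int8_t b)
{
    uint8_t ua = (uint8_t)(a ^ (int8_t)-128);
    uint8_t ub = (uint8_t)(b ^ (int8_t)-128);
    return (int8_t)((ua < ub ? ua : ub) ^ 0x80); // map back
}
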
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(_Tpvec, scalartype, func, suffix, sbit) \
inline scalartype v_reduce_##func(const v_##_Tpvec& a) \
{ \
@@ -1412,26 +1447,8 @@ inline unsigned scalartype v_reduce_##func(const v_u##_Tpvec& a) \
val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
return (unsigned scalartype)(_mm_cvtsi128_si32(val) ^ sbit); \
}
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM(_Tpvec, scalartype, suffix) \
inline scalartype v_reduce_sum(const v_##_Tpvec& a) \
{ \
__m128i val = a.val; \
val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 8)); \
val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 4)); \
val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 2)); \
return (scalartype)_mm_cvtsi128_si32(val); \
} \
inline unsigned scalartype v_reduce_sum(const v_u##_Tpvec& a) \
{ \
__m128i val = a.val; \
val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 8)); \
val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 4)); \
val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 2)); \
return (unsigned scalartype)_mm_cvtsi128_si32(val); \
}
OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, max, epi16, (short)-32768)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(int16x8, short, min, epi16, (short)-32768)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM(int16x8, short, 16)
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(_Tpvec, scalartype, regtype, suffix, cast_from, cast_to, extract) \
inline scalartype v_reduce_sum(const _Tpvec& a) \
@@ -1456,6 +1473,23 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_uint32x4, unsigned, __m128i, epi32, OPENCV
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_int32x4, int, __m128i, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP, si128_si32)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_si128, _mm_castsi128_ps, ss_f32)
inline int v_reduce_sum(const v_int16x8& a)
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
inline unsigned v_reduce_sum(const v_uint16x8& a)
{ return v_reduce_sum(v_expand_low(a) + v_expand_high(a)); }
inline uint64 v_reduce_sum(const v_uint64x2& a)
{
uint64 CV_DECL_ALIGNED(32) idx[2];
v_store_aligned(idx, a);
return idx[0] + idx[1];
}
inline int64 v_reduce_sum(const v_int64x2& a)
{
int64 CV_DECL_ALIGNED(32) idx[2];
v_store_aligned(idx, a);
return idx[0] + idx[1];
}
inline double v_reduce_sum(const v_float64x2& a)
{
double CV_DECL_ALIGNED(32) idx[2];
@@ -1520,27 +1554,42 @@ inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
return v_reduce_sum(v_absdiff(a, b));
}
#define OPENCV_HAL_IMPL_SSE_POPCOUNT(_Tpvec) \
inline v_uint32x4 v_popcount(const _Tpvec& a) \
{ \
__m128i m1 = _mm_set1_epi32(0x55555555); \
__m128i m2 = _mm_set1_epi32(0x33333333); \
__m128i m4 = _mm_set1_epi32(0x0f0f0f0f); \
__m128i p = a.val; \
p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 1), m1), _mm_and_si128(p, m1)); \
p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 2), m2), _mm_and_si128(p, m2)); \
p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 4), m4), _mm_and_si128(p, m4)); \
p = _mm_adds_epi8(p, _mm_srli_si128(p, 1)); \
p = _mm_adds_epi8(p, _mm_srli_si128(p, 2)); \
return v_uint32x4(_mm_and_si128(p, _mm_set1_epi32(0x000000ff))); \
}
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint8x16)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint16x8)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_uint32x4)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int8x16)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int16x8)
OPENCV_HAL_IMPL_SSE_POPCOUNT(v_int32x4)
inline v_uint8x16 v_popcount(const v_uint8x16& a)
{
__m128i m1 = _mm_set1_epi32(0x55555555);
__m128i m2 = _mm_set1_epi32(0x33333333);
__m128i m4 = _mm_set1_epi32(0x0f0f0f0f);
__m128i p = a.val;
p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 1), m1), _mm_and_si128(p, m1));
p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 2), m2), _mm_and_si128(p, m2));
p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 4), m4), _mm_and_si128(p, m4));
return v_uint8x16(p);
}
inline v_uint16x8 v_popcount(const v_uint16x8& a)
{
v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
p += v_rotate_right<1>(p);
return v_reinterpret_as_u16(p) & v_setall_u16(0x00ff);
}
inline v_uint32x4 v_popcount(const v_uint32x4& a)
{
v_uint8x16 p = v_popcount(v_reinterpret_as_u8(a));
p += v_rotate_right<1>(p);
p += v_rotate_right<2>(p);
return v_reinterpret_as_u32(p) & v_setall_u32(0x000000ff);
}
inline v_uint64x2 v_popcount(const v_uint64x2& a)
{
return v_uint64x2(_mm_sad_epu8(v_popcount(v_reinterpret_as_u8(a)).val, _mm_setzero_si128()));
}
inline v_uint8x16 v_popcount(const v_int8x16& a)
{ return v_popcount(v_reinterpret_as_u8(a)); }
inline v_uint16x8 v_popcount(const v_int16x8& a)
{ return v_popcount(v_reinterpret_as_u16(a)); }
inline v_uint32x4 v_popcount(const v_int32x4& a)
{ return v_popcount(v_reinterpret_as_u32(a)); }
inline v_uint64x2 v_popcount(const v_int64x2& a)
{ return v_popcount(v_reinterpret_as_u64(a)); }
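
The reworked SSE popcount keeps the classic SWAR halving steps but now stops after the 0x0f step, at which point every byte of p already holds the popcount of the corresponding input byte; the old macro kept folding byte counts into 32-bit lanes. Wider lanes reuse the rotate-and-mask pattern from the AVX version, and the 64-bit case sums bytes with _mm_sad_epu8. Scalar model of the three steps (hypothetical name, for illustration):

#include <cstdint>

// After these three masked halving steps each byte of p holds the
// popcount of the corresponding byte of the input.
uint32_t popcount_per_byte_model(uint32_t p)
{
    p = ((p >> 1) & 0x55555555u) + (p & 0x55555555u); // 2-bit field sums
    p = ((p >> 2) & 0x33333333u) + (p & 0x33333333u); // 4-bit field sums
    p = ((p >> 4) & 0x0f0f0f0fu) + (p & 0x0f0f0f0fu); // per-byte sums
    return p;
}
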
#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, pack_op, and_op, signmask, allmask) \
inline int v_signmask(const _Tpvec& a) \
......
@@ -692,15 +692,27 @@ inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
////////// Reduce and mask /////////
/** Reduce **/
inline short v_reduce_sum(const v_int16x8& a)
inline uint v_reduce_sum(const v_uint8x16& a)
{
const vec_uint4 zero4 = vec_uint4_z;
vec_uint4 sum4 = vec_sum4s(a.val, zero4);
return (uint)vec_extract(vec_sums(vec_int4_c(sum4), vec_int4_c(zero4)), 3);
}
inline int v_reduce_sum(const v_int8x16& a)
{
const vec_int4 zero4 = vec_int4_z;
vec_int4 sum4 = vec_sum4s(a.val, zero4);
return (int)vec_extract(vec_sums(sum4, zero4), 3);
}
inline int v_reduce_sum(const v_int16x8& a)
{
const vec_int4 zero = vec_int4_z;
return saturate_cast<short>(vec_extract(vec_sums(vec_sum4s(a.val, zero), zero), 3));
return saturate_cast<int>(vec_extract(vec_sums(vec_sum4s(a.val, zero), zero), 3));
}
inline ushort v_reduce_sum(const v_uint16x8& a)
inline uint v_reduce_sum(const v_uint16x8& a)
{
const vec_int4 v4 = vec_int4_c(vec_unpackhu(vec_adds(a.val, vec_sld(a.val, a.val, 8))));
return saturate_cast<ushort>(vec_extract(vec_sums(v4, vec_int4_z), 3));
return saturate_cast<uint>(vec_extract(vec_sums(v4, vec_int4_z), 3));
}
#define OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(_Tpvec, _Tpvec2, scalartype, suffix, func) \
@@ -719,6 +731,14 @@ OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, sum, vec_add)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, max, vec_max)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, min, vec_min)
inline uint64 v_reduce_sum(const v_uint64x2& a)
{
return vec_extract(vec_add(a.val, vec_permi(a.val, a.val, 3)), 0);
}
inline int64 v_reduce_sum(const v_int64x2& a)
{
return vec_extract(vec_add(a.val, vec_permi(a.val, a.val, 3)), 0);
}
inline double v_reduce_sum(const v_float64x2& a)
{
return vec_extract(vec_add(a.val, vec_permi(a.val, a.val, 3)), 0);
@@ -736,6 +756,19 @@ OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_uint16x8, vec_ushort8, ushort, min, vec_min)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int16x8, vec_short8, short, max, vec_max)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_int16x8, vec_short8, short, min, vec_min)
#define OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(_Tpvec, _Tpvec2, scalartype, suffix, func) \
inline scalartype v_reduce_##suffix(const _Tpvec& a) \
{ \
_Tpvec2 rs = func(a.val, vec_sld(a.val, a.val, 8)); \
rs = func(rs, vec_sld(rs, rs, 4)); \
rs = func(rs, vec_sld(rs, rs, 2)); \
return vec_extract(func(rs, vec_sld(rs, rs, 1)), 0); \
}
OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_uint8x16, vec_uchar16, uchar, max, vec_max)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_uint8x16, vec_uchar16, uchar, min, vec_min)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_int8x16, vec_char16, schar, max, vec_max)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(v_int8x16, vec_char16, schar, min, vec_min)
inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
const v_float32x4& c, const v_float32x4& d)
{
@@ -792,9 +825,22 @@ inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
}
/** Popcount **/
template<typename _Tpvec>
inline v_uint32x4 v_popcount(const _Tpvec& a)
{ return v_uint32x4(vec_popcntu(vec_uint4_c(a.val))); }
inline v_uint8x16 v_popcount(const v_uint8x16& a)
{ return v_uint8x16(vec_popcntu(a.val)); }
inline v_uint8x16 v_popcount(const v_int8x16& a)
{ return v_uint8x16(vec_popcntu(a.val)); }
inline v_uint16x8 v_popcount(const v_uint16x8& a)
{ return v_uint16x8(vec_popcntu(a.val)); }
inline v_uint16x8 v_popcount(const v_int16x8& a)
{ return v_uint16x8(vec_popcntu(a.val)); }
inline v_uint32x4 v_popcount(const v_uint32x4& a)
{ return v_uint32x4(vec_popcntu(a.val)); }
inline v_uint32x4 v_popcount(const v_int32x4& a)
{ return v_uint32x4(vec_popcntu(a.val)); }
inline v_uint64x2 v_popcount(const v_uint64x2& a)
{ return v_uint64x2(vec_popcntu(a.val)); }
inline v_uint64x2 v_popcount(const v_int64x2& a)
{ return v_uint64x2(vec_popcntu(a.val)); }
/** Mask **/
inline int v_signmask(const v_uint8x16& a)
......
@@ -32,28 +32,15 @@ int normHamming(const uchar* a, int n)
int i = 0;
int result = 0;
#if CV_AVX2
{
__m256i _r0 = _mm256_setzero_si256();
__m256i _0 = _mm256_setzero_si256();
__m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
__m256i _popcnt_mask = _mm256_set1_epi8(0x0F);
for(; i <= n - 32; i+= 32)
#if CV_SIMD && CV_SIMD_WIDTH > 16
{
__m256i _a0 = _mm256_loadu_si256((const __m256i*)(a + i));
__m256i _popc0 = _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_a0, _popcnt_mask));
__m256i _popc1 = _mm256_shuffle_epi8(_popcnt_table,
_mm256_and_si256(_mm256_srli_epi16(_a0, 4), _popcnt_mask));
_r0 = _mm256_add_epi32(_r0, _mm256_sad_epu8(_0, _mm256_add_epi8(_popc0, _popc1)));
}
_r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2));
result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0);
v_uint64 t = vx_setzero_u64();
for (; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
t += v_popcount(v_reinterpret_as_u64(vx_load(a + i)));
result = (int)v_reduce_sum(t);
}
#endif // CV_AVX2
#endif
#if CV_POPCNT
{
@@ -68,18 +55,14 @@ int normHamming(const uchar* a, int n)
result += CV_POPCNT_U32(*(uint*)(a + i));
}
}
#endif // CV_POPCNT
#if CV_SIMD128
#elif CV_SIMD
{
v_uint32x4 t = v_setzero_u32();
v_uint64x2 t = v_setzero_u64();
for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
{
t += v_popcount(v_load(a + i));
}
result += v_reduce_sum(t);
t += v_popcount(v_reinterpret_as_u64(v_load(a + i)));
result += (int)v_reduce_sum(t);
}
#endif // CV_SIMD128
#endif
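
Both vector paths now accumulate into 64-bit lanes: v_popcount on a v_uint64 adds at most 64 per lane per iteration, so the accumulator effectively cannot overflow, whereas the removed 32-bit accumulator was the limiting factor for very large buffers. A hedged scalar sketch of what the loop computes (hypothetical name, not the library's implementation):

#include <cstdint>

// Scalar equivalent of the vector loop: total popcount of n bytes.
int norm_hamming_model(const uint8_t* a, int n)
{
    int64_t result = 0;                    // mirrors the 64-bit lane accumulator
    for (int i = 0; i < n; i++)
    {
        uint8_t x = a[i];
        while (x) { x &= (uint8_t)(x - 1); ++result; } // clear lowest set bit
    }
    return (int)result;
}
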
#if CV_ENABLE_UNROLLED
for(; i <= n - 4; i += 4)
{
@@ -100,31 +83,15 @@ int normHamming(const uchar* a, const uchar* b, int n)
int i = 0;
int result = 0;
#if CV_AVX2
{
__m256i _r0 = _mm256_setzero_si256();
__m256i _0 = _mm256_setzero_si256();
__m256i _popcnt_table = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
__m256i _popcnt_mask = _mm256_set1_epi8(0x0F);
for(; i <= n - 32; i+= 32)
#if CV_SIMD && CV_SIMD_WIDTH > 16
{
__m256i _a0 = _mm256_loadu_si256((const __m256i*)(a + i));
__m256i _b0 = _mm256_loadu_si256((const __m256i*)(b + i));
__m256i _xor = _mm256_xor_si256(_a0, _b0);
__m256i _popc0 = _mm256_shuffle_epi8(_popcnt_table, _mm256_and_si256(_xor, _popcnt_mask));
__m256i _popc1 = _mm256_shuffle_epi8(_popcnt_table,
_mm256_and_si256(_mm256_srli_epi16(_xor, 4), _popcnt_mask));
_r0 = _mm256_add_epi32(_r0, _mm256_sad_epu8(_0, _mm256_add_epi8(_popc0, _popc1)));
}
_r0 = _mm256_add_epi32(_r0, _mm256_shuffle_epi32(_r0, 2));
result = _mm256_extract_epi32_(_mm256_add_epi32(_r0, _mm256_permute2x128_si256(_r0, _r0, 1)), 0);
v_uint64 t = vx_setzero_u64();
for (; i <= n - v_uint8::nlanes; i += v_uint8::nlanes)
t += v_popcount(v_reinterpret_as_u64(vx_load(a + i) ^ vx_load(b + i)));
result += (int)v_reduce_sum(t);
}
#endif // CV_AVX2
#endif
#if CV_POPCNT
{
@@ -139,18 +106,14 @@ int normHamming(const uchar* a, const uchar* b, int n)
result += CV_POPCNT_U32(*(uint*)(a + i) ^ *(uint*)(b + i));
}
}
#endif // CV_POPCNT
#if CV_SIMD128
#elif CV_SIMD
{
v_uint32x4 t = v_setzero_u32();
v_uint64x2 t = v_setzero_u64();
for(; i <= n - v_uint8x16::nlanes; i += v_uint8x16::nlanes)
{
t += v_popcount(v_load(a + i) ^ v_load(b + i));
t += v_popcount(v_reinterpret_as_u64(vx_load(a + i) ^ vx_load(b + i)));
result += (int)v_reduce_sum(t);
}
result += v_reduce_sum(t);
}
#endif // CV_SIMD128
#endif
#if CV_ENABLE_UNROLLED
for(; i <= n - 4; i += 4)
{
......
@@ -686,18 +686,24 @@ template<typename R> struct TheTest
TheTest & test_popcount()
{
typedef typename V_RegTraits<R>::u_reg Ru;
static unsigned popcountTable[] = {
0, 1, 2, 4, 5, 7, 9, 12, 13, 15, 17, 20, 22, 25, 28, 32, 33,
35, 37, 40, 42, 45, 48, 52, 54, 57, 60, 64, 67, 71, 75, 80, 81,
83, 85, 88, 90, 93, 96, 100, 102, 105, 108, 112, 115, 119, 123,
128, 130, 133, 136, 140, 143, 147, 151, 156, 159, 163, 167, 172,
176, 181, 186, 192, 193
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, //0x00-0x0f
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, //0x10-0x1f
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, //0x20-0x2f
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, //0x30-0x3f
1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, //0x40-0x4f
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, //0x50-0x5f
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, //0x60-0x6f
3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, //0x70-0x7f
1 //0x80
};
Data<R> dataA;
R a = dataA;
unsigned resB = (unsigned)v_reduce_sum(v_popcount(a));
EXPECT_EQ(popcountTable[R::nlanes], resB);
Data<Ru> resB = v_popcount(a);
for (int i = 0; i < Ru::nlanes; ++i)
EXPECT_EQ(popcountTable[i + 1], resB[i]);
return *this;
}
......