Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
84fd8190
Commit
84fd8190
authored
May 15, 2019
by
Alexander Alekhin
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #14232 from terfendail:popcount_rework
parents
88996323
7a55f2af
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
316 additions
and
198 deletions
+316
-198
intrin_avx.hpp
modules/core/include/opencv2/core/hal/intrin_avx.hpp
+78
-45
intrin_cpp.hpp
modules/core/include/opencv2/core/hal/intrin_cpp.hpp
+9
-16
intrin_neon.hpp
modules/core/include/opencv2/core/hal/intrin_neon.hpp
+49
-21
intrin_sse.hpp
modules/core/include/opencv2/core/hal/intrin_sse.hpp
+90
-41
intrin_vsx.hpp
modules/core/include/opencv2/core/hal/intrin_vsx.hpp
+53
-7
stat.simd.hpp
modules/core/src/stat.simd.hpp
+24
-61
test_intrin_utils.hpp
modules/core/test/test_intrin_utils.hpp
+13
-7
No files found.
modules/core/include/opencv2/core/hal/intrin_avx.hpp
View file @
84fd8190
...
...
@@ -1015,6 +1015,34 @@ OPENCV_HAL_IMPL_AVX_ROTATE_CAST(v_rotate_right, v_float64x4, _mm256_castsi256_pd
////////// Reduce and mask /////////
/** Reduce **/
inline
unsigned
v_reduce_sum
(
const
v_uint8x32
&
a
)
{
__m256i
half
=
_mm256_sad_epu8
(
a
.
val
,
_mm256_setzero_si256
());
__m128i
quarter
=
_mm_add_epi32
(
_v256_extract_low
(
half
),
_v256_extract_high
(
half
));
return
(
unsigned
)
_mm_cvtsi128_si32
(
_mm_add_epi32
(
quarter
,
_mm_unpackhi_epi64
(
quarter
,
quarter
)));
}
inline
int
v_reduce_sum
(
const
v_int8x32
&
a
)
{
__m256i
half
=
_mm256_sad_epu8
(
_mm256_xor_si256
(
a
.
val
,
_mm256_set1_epi8
((
schar
)
-
128
)),
_mm256_setzero_si256
());
__m128i
quarter
=
_mm_add_epi32
(
_v256_extract_low
(
half
),
_v256_extract_high
(
half
));
return
(
unsigned
)
_mm_cvtsi128_si32
(
_mm_add_epi32
(
quarter
,
_mm_unpackhi_epi64
(
quarter
,
quarter
)))
-
4096
;
}
#define OPENCV_HAL_IMPL_AVX_REDUCE_32(_Tpvec, sctype, func, intrin) \
inline sctype v_reduce_##func(const _Tpvec& a) \
{ \
__m128i val = intrin(_v256_extract_low(a.val), _v256_extract_high(a.val)); \
val = intrin(val, _mm_srli_si128(val,8)); \
val = intrin(val, _mm_srli_si128(val,4)); \
val = intrin(val, _mm_srli_si128(val,2)); \
val = intrin(val, _mm_srli_si128(val,1)); \
return (sctype)_mm_cvtsi128_si32(val); \
}
OPENCV_HAL_IMPL_AVX_REDUCE_32
(
v_uint8x32
,
uchar
,
min
,
_mm_min_epu8
)
OPENCV_HAL_IMPL_AVX_REDUCE_32
(
v_int8x32
,
schar
,
min
,
_mm_min_epi8
)
OPENCV_HAL_IMPL_AVX_REDUCE_32
(
v_uint8x32
,
uchar
,
max
,
_mm_max_epu8
)
OPENCV_HAL_IMPL_AVX_REDUCE_32
(
v_int8x32
,
schar
,
max
,
_mm_max_epi8
)
#define OPENCV_HAL_IMPL_AVX_REDUCE_16(_Tpvec, sctype, func, intrin) \
inline sctype v_reduce_##func(const _Tpvec& a) \
{ \
...
...
@@ -1062,31 +1090,6 @@ OPENCV_HAL_IMPL_AVX_REDUCE_8(v_int32x8, int, max, _mm_max_epi32)
OPENCV_HAL_IMPL_AVX_REDUCE_FLT
(
min
,
_mm_min_ps
)
OPENCV_HAL_IMPL_AVX_REDUCE_FLT
(
max
,
_mm_max_ps
)
inline
ushort
v_reduce_sum
(
const
v_uint16x16
&
a
)
{
__m128i
a0
=
_v256_extract_low
(
a
.
val
);
__m128i
a1
=
_v256_extract_high
(
a
.
val
);
__m128i
s0
=
_mm_adds_epu16
(
a0
,
a1
);
s0
=
_mm_adds_epu16
(
s0
,
_mm_srli_si128
(
s0
,
8
));
s0
=
_mm_adds_epu16
(
s0
,
_mm_srli_si128
(
s0
,
4
));
s0
=
_mm_adds_epu16
(
s0
,
_mm_srli_si128
(
s0
,
2
));
return
(
ushort
)
_mm_cvtsi128_si32
(
s0
);
}
inline
short
v_reduce_sum
(
const
v_int16x16
&
a
)
{
__m256i
s0
=
_mm256_hadds_epi16
(
a
.
val
,
a
.
val
);
s0
=
_mm256_hadds_epi16
(
s0
,
s0
);
s0
=
_mm256_hadds_epi16
(
s0
,
s0
);
__m128i
s1
=
_v256_extract_high
(
s0
);
s1
=
_mm_adds_epi16
(
_v256_extract_low
(
s0
),
s1
);
return
(
short
)
_mm_cvtsi128_si32
(
s1
);
}
inline
int
v_reduce_sum
(
const
v_int32x8
&
a
)
{
__m256i
s0
=
_mm256_hadd_epi32
(
a
.
val
,
a
.
val
);
...
...
@@ -1101,6 +1104,11 @@ inline int v_reduce_sum(const v_int32x8& a)
inline
unsigned
v_reduce_sum
(
const
v_uint32x8
&
a
)
{
return
v_reduce_sum
(
v_reinterpret_as_s32
(
a
));
}
inline
int
v_reduce_sum
(
const
v_int16x16
&
a
)
{
return
v_reduce_sum
(
v_expand_low
(
a
)
+
v_expand_high
(
a
));
}
inline
unsigned
v_reduce_sum
(
const
v_uint16x16
&
a
)
{
return
v_reduce_sum
(
v_expand_low
(
a
)
+
v_expand_high
(
a
));
}
inline
float
v_reduce_sum
(
const
v_float32x8
&
a
)
{
__m256
s0
=
_mm256_hadd_ps
(
a
.
val
,
a
.
val
);
...
...
@@ -1112,6 +1120,18 @@ inline float v_reduce_sum(const v_float32x8& a)
return
_mm_cvtss_f32
(
s1
);
}
inline
uint64
v_reduce_sum
(
const
v_uint64x4
&
a
)
{
uint64
CV_DECL_ALIGNED
(
32
)
idx
[
2
];
_mm_store_si128
((
__m128i
*
)
idx
,
_mm_add_epi64
(
_v256_extract_low
(
a
.
val
),
_v256_extract_high
(
a
.
val
)));
return
idx
[
0
]
+
idx
[
1
];
}
inline
int64
v_reduce_sum
(
const
v_int64x4
&
a
)
{
int64
CV_DECL_ALIGNED
(
32
)
idx
[
2
];
_mm_store_si128
((
__m128i
*
)
idx
,
_mm_add_epi64
(
_v256_extract_low
(
a
.
val
),
_v256_extract_high
(
a
.
val
)));
return
idx
[
0
]
+
idx
[
1
];
}
inline
double
v_reduce_sum
(
const
v_float64x4
&
a
)
{
__m256d
s0
=
_mm256_hadd_pd
(
a
.
val
,
a
.
val
);
...
...
@@ -1166,26 +1186,39 @@ inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
}
/** Popcount **/
#define OPENCV_HAL_IMPL_AVX_POPCOUNT(_Tpvec) \
inline v_uint32x8 v_popcount(const _Tpvec& a) \
{ \
const v_uint32x8 m1 = v256_setall_u32(0x55555555); \
const v_uint32x8 m2 = v256_setall_u32(0x33333333); \
const v_uint32x8 m4 = v256_setall_u32(0x0f0f0f0f); \
v_uint32x8 p = v_reinterpret_as_u32(a); \
p = ((p >> 1) & m1) + (p & m1); \
p = ((p >> 2) & m2) + (p & m2); \
p = ((p >> 4) & m4) + (p & m4); \
p.val = _mm256_sad_epu8(p.val, _mm256_setzero_si256()); \
return p; \
}
OPENCV_HAL_IMPL_AVX_POPCOUNT
(
v_uint8x32
)
OPENCV_HAL_IMPL_AVX_POPCOUNT
(
v_int8x32
)
OPENCV_HAL_IMPL_AVX_POPCOUNT
(
v_uint16x16
)
OPENCV_HAL_IMPL_AVX_POPCOUNT
(
v_int16x16
)
OPENCV_HAL_IMPL_AVX_POPCOUNT
(
v_uint32x8
)
OPENCV_HAL_IMPL_AVX_POPCOUNT
(
v_int32x8
)
inline
v_uint8x32
v_popcount
(
const
v_uint8x32
&
a
)
{
__m256i
_popcnt_table
=
_mm256_setr_epi8
(
0
,
1
,
1
,
2
,
1
,
2
,
2
,
3
,
1
,
2
,
2
,
3
,
2
,
3
,
3
,
4
,
0
,
1
,
1
,
2
,
1
,
2
,
2
,
3
,
1
,
2
,
2
,
3
,
2
,
3
,
3
,
4
);
__m256i
_popcnt_mask
=
_mm256_set1_epi8
(
0x0F
);
return
v_uint8x32
(
_mm256_add_epi8
(
_mm256_shuffle_epi8
(
_popcnt_table
,
_mm256_and_si256
(
a
.
val
,
_popcnt_mask
)),
_mm256_shuffle_epi8
(
_popcnt_table
,
_mm256_and_si256
(
_mm256_srli_epi16
(
a
.
val
,
4
),
_popcnt_mask
))));
}
inline
v_uint16x16
v_popcount
(
const
v_uint16x16
&
a
)
{
v_uint8x32
p
=
v_popcount
(
v_reinterpret_as_u8
(
a
));
p
+=
v_rotate_right
<
1
>
(
p
);
return
v_reinterpret_as_u16
(
p
)
&
v256_setall_u16
(
0x00ff
);
}
inline
v_uint32x8
v_popcount
(
const
v_uint32x8
&
a
)
{
v_uint8x32
p
=
v_popcount
(
v_reinterpret_as_u8
(
a
));
p
+=
v_rotate_right
<
1
>
(
p
);
p
+=
v_rotate_right
<
2
>
(
p
);
return
v_reinterpret_as_u32
(
p
)
&
v256_setall_u32
(
0x000000ff
);
}
inline
v_uint64x4
v_popcount
(
const
v_uint64x4
&
a
)
{
return
v_uint64x4
(
_mm256_sad_epu8
(
v_popcount
(
v_reinterpret_as_u8
(
a
)).
val
,
_mm256_setzero_si256
()));
}
inline
v_uint8x32
v_popcount
(
const
v_int8x32
&
a
)
{
return
v_popcount
(
v_reinterpret_as_u8
(
a
));
}
inline
v_uint16x16
v_popcount
(
const
v_int16x16
&
a
)
{
return
v_popcount
(
v_reinterpret_as_u16
(
a
));
}
inline
v_uint32x8
v_popcount
(
const
v_int32x8
&
a
)
{
return
v_popcount
(
v_reinterpret_as_u32
(
a
));
}
inline
v_uint64x4
v_popcount
(
const
v_int64x4
&
a
)
{
return
v_popcount
(
v_reinterpret_as_u64
(
a
));
}
/** Mask **/
inline
int
v_signmask
(
const
v_int8x32
&
a
)
...
...
modules/core/include/opencv2/core/hal/intrin_cpp.hpp
View file @
84fd8190
...
...
@@ -603,27 +603,20 @@ static const unsigned char popCountTable[] =
3
,
4
,
4
,
5
,
4
,
5
,
5
,
6
,
4
,
5
,
5
,
6
,
5
,
6
,
6
,
7
,
4
,
5
,
5
,
6
,
5
,
6
,
6
,
7
,
5
,
6
,
6
,
7
,
6
,
7
,
7
,
8
,
};
/** @brief Count the 1 bits in the vector
and return 4 values
/** @brief Count the 1 bits in the vector
lanes and return result as corresponding unsigned type
Scheme:
@code
{A1 A2 A3 ...} =>
popcount(A1)
{A1 A2 A3 ...} =>
{popcount(A1), popcount(A2), popcount(A3), ...}
@endcode
Any types but result will be in v_uint32x4*/
template
<
typename
_Tp
,
int
n
>
inline
v_uint32x4
v_popcount
(
const
v_reg
<
_Tp
,
n
>&
a
)
For all integer types. */
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
abs_type
,
n
>
v_popcount
(
const
v_reg
<
_Tp
,
n
>&
a
)
{
v_uint8x16
b
;
b
=
v_reinterpret_as_u8
(
a
);
for
(
int
i
=
0
;
i
<
v_uint8x16
::
nlanes
;
i
++
)
{
b
.
s
[
i
]
=
popCountTable
[
b
.
s
[
i
]];
}
v_uint32x4
c
;
for
(
int
i
=
0
;
i
<
v_uint32x4
::
nlanes
;
i
++
)
{
c
.
s
[
i
]
=
b
.
s
[
i
*
4
]
+
b
.
s
[
i
*
4
+
1
]
+
b
.
s
[
i
*
4
+
2
]
+
b
.
s
[
i
*
4
+
3
];
}
return
c
;
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
abs_type
,
n
>
b
=
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
abs_type
,
n
>::
zero
();
for
(
int
i
=
0
;
i
<
n
*
sizeof
(
_Tp
);
i
++
)
b
.
s
[
i
/
sizeof
(
_Tp
)]
+=
popCountTable
[
v_reinterpret_as_u8
(
a
).
s
[
i
]];
return
b
;
}
...
...
modules/core/include/opencv2/core/hal/intrin_neon.hpp
View file @
84fd8190
...
...
@@ -910,6 +910,31 @@ OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP
(
v_float64x2
,
double
,
f64
)
#endif
inline
unsigned
v_reduce_sum
(
const
v_uint8x16
&
a
)
{
uint32x4_t
t0
=
vpaddlq_u16
(
vpaddlq_u8
(
a
.
val
));
uint32x2_t
t1
=
vpadd_u32
(
vget_low_u32
(
t0
),
vget_high_u32
(
t0
));
return
vget_lane_u32
(
vpadd_u32
(
t1
,
t1
),
0
);
}
inline
int
v_reduce_sum
(
const
v_int8x16
&
a
)
{
int32x4_t
t0
=
vpaddlq_s16
(
vpaddlq_s8
(
a
.
val
));
int32x2_t
t1
=
vpadd_s32
(
vget_low_s32
(
t0
),
vget_high_s32
(
t0
));
return
vget_lane_s32
(
vpadd_s32
(
t1
,
t1
),
0
);
}
inline
unsigned
v_reduce_sum
(
const
v_uint16x8
&
a
)
{
uint32x4_t
t0
=
vpaddlq_u16
(
a
.
val
);
uint32x2_t
t1
=
vpadd_u32
(
vget_low_u32
(
t0
),
vget_high_u32
(
t0
));
return
vget_lane_u32
(
vpadd_u32
(
t1
,
t1
),
0
);
}
inline
int
v_reduce_sum
(
const
v_int16x8
&
a
)
{
int32x4_t
t0
=
vpaddlq_s16
(
a
.
val
);
int32x2_t
t1
=
vpadd_s32
(
vget_low_s32
(
t0
),
vget_high_s32
(
t0
));
return
vget_lane_s32
(
vpadd_s32
(
t1
,
t1
),
0
);
}
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
...
...
@@ -918,12 +943,10 @@ inline scalartype v_reduce_##func(const _Tpvec& a) \
return (scalartype)vget_lane_##suffix(vp##vectorfunc##_##suffix(a0, a0),0); \
}
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8
(
v_uint16x8
,
uint16x4
,
unsigned
short
,
sum
,
add
,
u16
)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8
(
v_uint16x8
,
uint16x4
,
unsigned
short
,
max
,
max
,
u16
)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8
(
v_uint16x8
,
uint16x4
,
unsigned
short
,
min
,
min
,
u16
)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8
(
v_int16x8
,
int16x4
,
short
,
sum
,
add
,
s16
)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8
(
v_int16x8
,
int16x4
,
short
,
max
,
max
,
s16
)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8
(
v_int16x8
,
int16x4
,
short
,
min
,
min
,
s16
)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8
(
v_uint16x8
,
uint16x4
,
unsigned
int
,
max
,
max
,
u16
)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8
(
v_uint16x8
,
uint16x4
,
unsigned
int
,
min
,
min
,
u16
)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8
(
v_int16x8
,
int16x4
,
int
,
max
,
max
,
s16
)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_8
(
v_int16x8
,
int16x4
,
int
,
min
,
min
,
s16
)
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
...
...
@@ -942,6 +965,10 @@ OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, sum, add, f32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4
(
v_float32x4
,
float32x2
,
float
,
max
,
max
,
f32
)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4
(
v_float32x4
,
float32x2
,
float
,
min
,
min
,
f32
)
inline
uint64
v_reduce_sum
(
const
v_uint64x2
&
a
)
{
return
vget_lane_u64
(
vadd_u64
(
vget_low_u64
(
a
.
val
),
vget_high_u64
(
a
.
val
)),
0
);
}
inline
int64
v_reduce_sum
(
const
v_int64x2
&
a
)
{
return
vget_lane_s64
(
vadd_s64
(
vget_low_s64
(
a
.
val
),
vget_high_s64
(
a
.
val
)),
0
);
}
#if CV_SIMD128_64F
inline
double
v_reduce_sum
(
const
v_float64x2
&
a
)
{
...
...
@@ -1007,21 +1034,22 @@ inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
return
vget_lane_f32
(
vpadd_f32
(
t1
,
t1
),
0
);
}
#define OPENCV_HAL_IMPL_NEON_POPCOUNT(_Tpvec, cast) \
inline v_uint32x4 v_popcount(const _Tpvec& a) \
{ \
uint8x16_t t = vcntq_u8(cast(a.val)); \
uint16x8_t t0 = vpaddlq_u8(t);
/* 16 -> 8 */
\
uint32x4_t t1 = vpaddlq_u16(t0);
/* 8 -> 4 */
\
return v_uint32x4(t1); \
}
OPENCV_HAL_IMPL_NEON_POPCOUNT
(
v_uint8x16
,
OPENCV_HAL_NOP
)
OPENCV_HAL_IMPL_NEON_POPCOUNT
(
v_uint16x8
,
vreinterpretq_u8_u16
)
OPENCV_HAL_IMPL_NEON_POPCOUNT
(
v_uint32x4
,
vreinterpretq_u8_u32
)
OPENCV_HAL_IMPL_NEON_POPCOUNT
(
v_int8x16
,
vreinterpretq_u8_s8
)
OPENCV_HAL_IMPL_NEON_POPCOUNT
(
v_int16x8
,
vreinterpretq_u8_s16
)
OPENCV_HAL_IMPL_NEON_POPCOUNT
(
v_int32x4
,
vreinterpretq_u8_s32
)
inline
v_uint8x16
v_popcount
(
const
v_uint8x16
&
a
)
{
return
v_uint8x16
(
vcntq_u8
(
a
.
val
));
}
inline
v_uint8x16
v_popcount
(
const
v_int8x16
&
a
)
{
return
v_uint8x16
(
vcntq_u8
(
vreinterpretq_u8_s8
(
a
.
val
)));
}
inline
v_uint16x8
v_popcount
(
const
v_uint16x8
&
a
)
{
return
v_uint16x8
(
vpaddlq_u8
(
vcntq_u8
(
vreinterpretq_u8_u16
(
a
.
val
))));
}
inline
v_uint16x8
v_popcount
(
const
v_int16x8
&
a
)
{
return
v_uint16x8
(
vpaddlq_u8
(
vcntq_u8
(
vreinterpretq_u8_s16
(
a
.
val
))));
}
inline
v_uint32x4
v_popcount
(
const
v_uint32x4
&
a
)
{
return
v_uint32x4
(
vpaddlq_u16
(
vpaddlq_u8
(
vcntq_u8
(
vreinterpretq_u8_u32
(
a
.
val
)))));
}
inline
v_uint32x4
v_popcount
(
const
v_int32x4
&
a
)
{
return
v_uint32x4
(
vpaddlq_u16
(
vpaddlq_u8
(
vcntq_u8
(
vreinterpretq_u8_s32
(
a
.
val
)))));
}
inline
v_uint64x2
v_popcount
(
const
v_uint64x2
&
a
)
{
return
v_uint64x2
(
vpaddlq_u32
(
vpaddlq_u16
(
vpaddlq_u8
(
vcntq_u8
(
vreinterpretq_u8_u64
(
a
.
val
))))));
}
inline
v_uint64x2
v_popcount
(
const
v_int64x2
&
a
)
{
return
v_uint64x2
(
vpaddlq_u32
(
vpaddlq_u16
(
vpaddlq_u8
(
vcntq_u8
(
vreinterpretq_u8_s64
(
a
.
val
))))));
}
inline
int
v_signmask
(
const
v_uint8x16
&
a
)
{
...
...
modules/core/include/opencv2/core/hal/intrin_sse.hpp
View file @
84fd8190
...
...
@@ -302,8 +302,8 @@ inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)
template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
{ return _Tpvec(cast(a.val)); }
OPENCV_HAL_IMPL_SSE_INITVEC
(
v_uint8x16
,
uchar
,
u8
,
si128
,
epi8
,
char
,
OPENCV_HAL_NOP
)
OPENCV_HAL_IMPL_SSE_INITVEC
(
v_int8x16
,
schar
,
s8
,
si128
,
epi8
,
char
,
OPENCV_HAL_NOP
)
OPENCV_HAL_IMPL_SSE_INITVEC
(
v_uint8x16
,
uchar
,
u8
,
si128
,
epi8
,
s
char
,
OPENCV_HAL_NOP
)
OPENCV_HAL_IMPL_SSE_INITVEC
(
v_int8x16
,
schar
,
s8
,
si128
,
epi8
,
s
char
,
OPENCV_HAL_NOP
)
OPENCV_HAL_IMPL_SSE_INITVEC
(
v_uint16x8
,
ushort
,
u16
,
si128
,
epi16
,
short
,
OPENCV_HAL_NOP
)
OPENCV_HAL_IMPL_SSE_INITVEC
(
v_int16x8
,
short
,
s16
,
si128
,
epi16
,
short
,
OPENCV_HAL_NOP
)
OPENCV_HAL_IMPL_SSE_INITVEC
(
v_uint32x4
,
unsigned
,
u32
,
si128
,
epi32
,
int
,
OPENCV_HAL_NOP
)
...
...
@@ -1393,6 +1393,41 @@ inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP
(
v_float32x4
,
float
,
ps
)
OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP
(
v_float64x2
,
double
,
pd
)
inline
unsigned
v_reduce_sum
(
const
v_uint8x16
&
a
)
{
__m128i
half
=
_mm_sad_epu8
(
a
.
val
,
_mm_setzero_si128
());
return
(
unsigned
)
_mm_cvtsi128_si32
(
_mm_add_epi32
(
half
,
_mm_unpackhi_epi64
(
half
,
half
)));
}
inline
int
v_reduce_sum
(
const
v_int8x16
&
a
)
{
__m128i
half
=
_mm_set1_epi8
((
schar
)
-
128
);
half
=
_mm_sad_epu8
(
_mm_xor_si128
(
a
.
val
,
half
),
_mm_setzero_si128
());
return
_mm_cvtsi128_si32
(
_mm_add_epi32
(
half
,
_mm_unpackhi_epi64
(
half
,
half
)))
-
2048
;
}
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_16(func) \
inline schar v_reduce_##func(const v_int8x16& a) \
{ \
__m128i val = a.val; \
__m128i smask = _mm_set1_epi8((schar)-128); \
val = _mm_xor_si128(val, smask); \
val = _mm_##func##_epu8(val, _mm_srli_si128(val,8)); \
val = _mm_##func##_epu8(val, _mm_srli_si128(val,4)); \
val = _mm_##func##_epu8(val, _mm_srli_si128(val,2)); \
val = _mm_##func##_epu8(val, _mm_srli_si128(val,1)); \
return (schar)_mm_cvtsi128_si32(val) ^ (schar)-128; \
} \
inline uchar v_reduce_##func(const v_uint8x16& a) \
{ \
__m128i val = a.val; \
val = _mm_##func##_epu8(val, _mm_srli_si128(val,8)); \
val = _mm_##func##_epu8(val, _mm_srli_si128(val,4)); \
val = _mm_##func##_epu8(val, _mm_srli_si128(val,2)); \
val = _mm_##func##_epu8(val, _mm_srli_si128(val,1)); \
return (uchar)_mm_cvtsi128_si32(val); \
}
OPENCV_HAL_IMPL_SSE_REDUCE_OP_16
(
max
)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_16
(
min
)
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8(_Tpvec, scalartype, func, suffix, sbit) \
inline scalartype v_reduce_##func(const v_##_Tpvec& a) \
{ \
...
...
@@ -1412,26 +1447,8 @@ inline unsigned scalartype v_reduce_##func(const v_u##_Tpvec& a) \
val = _mm_##func##_##suffix(val, _mm_srli_si128(val,2)); \
return (unsigned scalartype)(_mm_cvtsi128_si32(val) ^ sbit); \
}
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM(_Tpvec, scalartype, suffix) \
inline scalartype v_reduce_sum(const v_##_Tpvec& a) \
{ \
__m128i val = a.val; \
val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 8)); \
val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 4)); \
val = _mm_adds_epi##suffix(val, _mm_srli_si128(val, 2)); \
return (scalartype)_mm_cvtsi128_si32(val); \
} \
inline unsigned scalartype v_reduce_sum(const v_u##_Tpvec& a) \
{ \
__m128i val = a.val; \
val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 8)); \
val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 4)); \
val = _mm_adds_epu##suffix(val, _mm_srli_si128(val, 2)); \
return (unsigned scalartype)_mm_cvtsi128_si32(val); \
}
OPENCV_HAL_IMPL_SSE_REDUCE_OP_8
(
int16x8
,
short
,
max
,
epi16
,
(
short
)
-
32768
)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_8
(
int16x8
,
short
,
min
,
epi16
,
(
short
)
-
32768
)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_8_SUM
(
int16x8
,
short
,
16
)
#define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(_Tpvec, scalartype, regtype, suffix, cast_from, cast_to, extract) \
inline scalartype v_reduce_sum(const _Tpvec& a) \
...
...
@@ -1456,6 +1473,23 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_uint32x4, unsigned, __m128i, epi32, OPENCV
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM
(
v_int32x4
,
int
,
__m128i
,
epi32
,
OPENCV_HAL_NOP
,
OPENCV_HAL_NOP
,
si128_si32
)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM
(
v_float32x4
,
float
,
__m128
,
ps
,
_mm_castps_si128
,
_mm_castsi128_ps
,
ss_f32
)
inline
int
v_reduce_sum
(
const
v_int16x8
&
a
)
{
return
v_reduce_sum
(
v_expand_low
(
a
)
+
v_expand_high
(
a
));
}
inline
unsigned
v_reduce_sum
(
const
v_uint16x8
&
a
)
{
return
v_reduce_sum
(
v_expand_low
(
a
)
+
v_expand_high
(
a
));
}
inline
uint64
v_reduce_sum
(
const
v_uint64x2
&
a
)
{
uint64
CV_DECL_ALIGNED
(
32
)
idx
[
2
];
v_store_aligned
(
idx
,
a
);
return
idx
[
0
]
+
idx
[
1
];
}
inline
int64
v_reduce_sum
(
const
v_int64x2
&
a
)
{
int64
CV_DECL_ALIGNED
(
32
)
idx
[
2
];
v_store_aligned
(
idx
,
a
);
return
idx
[
0
]
+
idx
[
1
];
}
inline
double
v_reduce_sum
(
const
v_float64x2
&
a
)
{
double
CV_DECL_ALIGNED
(
32
)
idx
[
2
];
...
...
@@ -1520,27 +1554,42 @@ inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
return
v_reduce_sum
(
v_absdiff
(
a
,
b
));
}
#define OPENCV_HAL_IMPL_SSE_POPCOUNT(_Tpvec) \
inline v_uint32x4 v_popcount(const _Tpvec& a) \
{ \
__m128i m1 = _mm_set1_epi32(0x55555555); \
__m128i m2 = _mm_set1_epi32(0x33333333); \
__m128i m4 = _mm_set1_epi32(0x0f0f0f0f); \
__m128i p = a.val; \
p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 1), m1), _mm_and_si128(p, m1)); \
p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 2), m2), _mm_and_si128(p, m2)); \
p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 4), m4), _mm_and_si128(p, m4)); \
p = _mm_adds_epi8(p, _mm_srli_si128(p, 1)); \
p = _mm_adds_epi8(p, _mm_srli_si128(p, 2)); \
return v_uint32x4(_mm_and_si128(p, _mm_set1_epi32(0x000000ff))); \
}
OPENCV_HAL_IMPL_SSE_POPCOUNT
(
v_uint8x16
)
OPENCV_HAL_IMPL_SSE_POPCOUNT
(
v_uint16x8
)
OPENCV_HAL_IMPL_SSE_POPCOUNT
(
v_uint32x4
)
OPENCV_HAL_IMPL_SSE_POPCOUNT
(
v_int8x16
)
OPENCV_HAL_IMPL_SSE_POPCOUNT
(
v_int16x8
)
OPENCV_HAL_IMPL_SSE_POPCOUNT
(
v_int32x4
)
inline
v_uint8x16
v_popcount
(
const
v_uint8x16
&
a
)
{
__m128i
m1
=
_mm_set1_epi32
(
0x55555555
);
__m128i
m2
=
_mm_set1_epi32
(
0x33333333
);
__m128i
m4
=
_mm_set1_epi32
(
0x0f0f0f0f
);
__m128i
p
=
a
.
val
;
p
=
_mm_add_epi32
(
_mm_and_si128
(
_mm_srli_epi32
(
p
,
1
),
m1
),
_mm_and_si128
(
p
,
m1
));
p
=
_mm_add_epi32
(
_mm_and_si128
(
_mm_srli_epi32
(
p
,
2
),
m2
),
_mm_and_si128
(
p
,
m2
));
p
=
_mm_add_epi32
(
_mm_and_si128
(
_mm_srli_epi32
(
p
,
4
),
m4
),
_mm_and_si128
(
p
,
m4
));
return
v_uint8x16
(
p
);
}
inline
v_uint16x8
v_popcount
(
const
v_uint16x8
&
a
)
{
v_uint8x16
p
=
v_popcount
(
v_reinterpret_as_u8
(
a
));
p
+=
v_rotate_right
<
1
>
(
p
);
return
v_reinterpret_as_u16
(
p
)
&
v_setall_u16
(
0x00ff
);
}
inline
v_uint32x4
v_popcount
(
const
v_uint32x4
&
a
)
{
v_uint8x16
p
=
v_popcount
(
v_reinterpret_as_u8
(
a
));
p
+=
v_rotate_right
<
1
>
(
p
);
p
+=
v_rotate_right
<
2
>
(
p
);
return
v_reinterpret_as_u32
(
p
)
&
v_setall_u32
(
0x000000ff
);
}
inline
v_uint64x2
v_popcount
(
const
v_uint64x2
&
a
)
{
return
v_uint64x2
(
_mm_sad_epu8
(
v_popcount
(
v_reinterpret_as_u8
(
a
)).
val
,
_mm_setzero_si128
()));
}
inline
v_uint8x16
v_popcount
(
const
v_int8x16
&
a
)
{
return
v_popcount
(
v_reinterpret_as_u8
(
a
));
}
inline
v_uint16x8
v_popcount
(
const
v_int16x8
&
a
)
{
return
v_popcount
(
v_reinterpret_as_u16
(
a
));
}
inline
v_uint32x4
v_popcount
(
const
v_int32x4
&
a
)
{
return
v_popcount
(
v_reinterpret_as_u32
(
a
));
}
inline
v_uint64x2
v_popcount
(
const
v_int64x2
&
a
)
{
return
v_popcount
(
v_reinterpret_as_u64
(
a
));
}
#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, pack_op, and_op, signmask, allmask) \
inline int v_signmask(const _Tpvec& a) \
...
...
modules/core/include/opencv2/core/hal/intrin_vsx.hpp
View file @
84fd8190
...
...
@@ -692,15 +692,27 @@ inline _Tpvec v_extract(const _Tpvec& a, const _Tpvec& b)
////////// Reduce and mask /////////
/** Reduce **/
inline
short
v_reduce_sum
(
const
v_int16x8
&
a
)
inline
uint
v_reduce_sum
(
const
v_uint8x16
&
a
)
{
const
vec_uint4
zero4
=
vec_uint4_z
;
vec_uint4
sum4
=
vec_sum4s
(
a
.
val
,
zero4
);
return
(
uint
)
vec_extract
(
vec_sums
(
vec_int4_c
(
sum4
),
vec_int4_c
(
zero4
)),
3
);
}
inline
int
v_reduce_sum
(
const
v_int8x16
&
a
)
{
const
vec_int4
zero4
=
vec_int4_z
;
vec_int4
sum4
=
vec_sum4s
(
a
.
val
,
zero4
);
return
(
int
)
vec_extract
(
vec_sums
(
sum4
,
zero4
),
3
);
}
inline
int
v_reduce_sum
(
const
v_int16x8
&
a
)
{
const
vec_int4
zero
=
vec_int4_z
;
return
saturate_cast
<
shor
t
>
(
vec_extract
(
vec_sums
(
vec_sum4s
(
a
.
val
,
zero
),
zero
),
3
));
return
saturate_cast
<
in
t
>
(
vec_extract
(
vec_sums
(
vec_sum4s
(
a
.
val
,
zero
),
zero
),
3
));
}
inline
u
shor
t
v_reduce_sum
(
const
v_uint16x8
&
a
)
inline
u
in
t
v_reduce_sum
(
const
v_uint16x8
&
a
)
{
const
vec_int4
v4
=
vec_int4_c
(
vec_unpackhu
(
vec_adds
(
a
.
val
,
vec_sld
(
a
.
val
,
a
.
val
,
8
))));
return
saturate_cast
<
u
shor
t
>
(
vec_extract
(
vec_sums
(
v4
,
vec_int4_z
),
3
));
return
saturate_cast
<
u
in
t
>
(
vec_extract
(
vec_sums
(
v4
,
vec_int4_z
),
3
));
}
#define OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(_Tpvec, _Tpvec2, scalartype, suffix, func) \
...
...
@@ -719,6 +731,14 @@ OPENCV_HAL_IMPL_VSX_REDUCE_OP_4(v_float32x4, vec_float4, float, sum, vec_add)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4
(
v_float32x4
,
vec_float4
,
float
,
max
,
vec_max
)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_4
(
v_float32x4
,
vec_float4
,
float
,
min
,
vec_min
)
inline
uint64
v_reduce_sum
(
const
v_uint64x2
&
a
)
{
return
vec_extract
(
vec_add
(
a
.
val
,
vec_permi
(
a
.
val
,
a
.
val
,
3
)),
0
);
}
inline
int64
v_reduce_sum
(
const
v_int64x2
&
a
)
{
return
vec_extract
(
vec_add
(
a
.
val
,
vec_permi
(
a
.
val
,
a
.
val
,
3
)),
0
);
}
inline
double
v_reduce_sum
(
const
v_float64x2
&
a
)
{
return
vec_extract
(
vec_add
(
a
.
val
,
vec_permi
(
a
.
val
,
a
.
val
,
3
)),
0
);
...
...
@@ -736,6 +756,19 @@ OPENCV_HAL_IMPL_VSX_REDUCE_OP_8(v_uint16x8, vec_ushort8, ushort, min, vec_min)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8
(
v_int16x8
,
vec_short8
,
short
,
max
,
vec_max
)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8
(
v_int16x8
,
vec_short8
,
short
,
min
,
vec_min
)
#define OPENCV_HAL_IMPL_VSX_REDUCE_OP_16(_Tpvec, _Tpvec2, scalartype, suffix, func) \
inline scalartype v_reduce_##suffix(const _Tpvec& a) \
{ \
_Tpvec2 rs = func(a.val, vec_sld(a.val, a.val, 8)); \
rs = func(rs, vec_sld(rs, rs, 4)); \
rs = func(rs, vec_sld(rs, rs, 2)); \
return vec_extract(func(rs, vec_sld(rs, rs, 1)), 0); \
}
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8
(
v_uint8x16
,
vec_uchar16
,
uchar
,
max
,
vec_max
)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8
(
v_uint8x16
,
vec_uchar16
,
uchar
,
min
,
vec_min
)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8
(
v_int8x16
,
vec_char16
,
schar
,
max
,
vec_max
)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8
(
v_int8x16
,
vec_char16
,
schar
,
min
,
vec_min
)
inline
v_float32x4
v_reduce_sum4
(
const
v_float32x4
&
a
,
const
v_float32x4
&
b
,
const
v_float32x4
&
c
,
const
v_float32x4
&
d
)
{
...
...
@@ -792,9 +825,22 @@ inline float v_reduce_sad(const v_float32x4& a, const v_float32x4& b)
}
/** Popcount **/
template
<
typename
_Tpvec
>
inline
v_uint32x4
v_popcount
(
const
_Tpvec
&
a
)
{
return
v_uint32x4
(
vec_popcntu
(
vec_uint4_c
(
a
.
val
)));
}
inline
v_uint8x16
v_popcount
(
const
v_uint8x16
&
a
)
{
return
v_uint8x16
(
vec_popcntu
(
a
.
val
));
}
inline
v_uint8x16
v_popcount
(
const
v_int8x16
&
a
)
{
return
v_uint8x16
(
vec_popcntu
(
a
.
val
));
}
inline
v_uint16x8
v_popcount
(
const
v_uint16x8
&
a
)
{
return
v_uint16x8
(
vec_popcntu
(
a
.
val
));
}
inline
v_uint16x8
v_popcount
(
const
v_int16x8
&
a
)
{
return
v_uint16x8
(
vec_popcntu
(
a
.
val
));
}
inline
v_uint32x4
v_popcount
(
const
v_uint32x4
&
a
)
{
return
v_uint32x4
(
vec_popcntu
(
a
.
val
));
}
inline
v_uint32x4
v_popcount
(
const
v_int32x4
&
a
)
{
return
v_uint32x4
(
vec_popcntu
(
a
.
val
));
}
inline
v_uint64x2
v_popcount
(
const
v_uint64x2
&
a
)
{
return
v_uint64x2
(
vec_popcntu
(
a
.
val
));
}
inline
v_uint64x2
v_popcount
(
const
v_int64x2
&
a
)
{
return
v_uint64x2
(
vec_popcntu
(
a
.
val
));
}
/** Mask **/
inline
int
v_signmask
(
const
v_uint8x16
&
a
)
...
...
modules/core/src/stat.simd.hpp
View file @
84fd8190
...
...
@@ -32,28 +32,15 @@ int normHamming(const uchar* a, int n)
int
i
=
0
;
int
result
=
0
;
#if CV_AVX2
{
__m256i
_r0
=
_mm256_setzero_si256
();
__m256i
_0
=
_mm256_setzero_si256
();
__m256i
_popcnt_table
=
_mm256_setr_epi8
(
0
,
1
,
1
,
2
,
1
,
2
,
2
,
3
,
1
,
2
,
2
,
3
,
2
,
3
,
3
,
4
,
0
,
1
,
1
,
2
,
1
,
2
,
2
,
3
,
1
,
2
,
2
,
3
,
2
,
3
,
3
,
4
);
__m256i
_popcnt_mask
=
_mm256_set1_epi8
(
0x0F
);
for
(;
i
<=
n
-
32
;
i
+=
32
)
{
__m256i
_a0
=
_mm256_loadu_si256
((
const
__m256i
*
)(
a
+
i
));
__m256i
_popc0
=
_mm256_shuffle_epi8
(
_popcnt_table
,
_mm256_and_si256
(
_a0
,
_popcnt_mask
));
__m256i
_popc1
=
_mm256_shuffle_epi8
(
_popcnt_table
,
_mm256_and_si256
(
_mm256_srli_epi16
(
_a0
,
4
),
_popcnt_mask
));
_r0
=
_mm256_add_epi32
(
_r0
,
_mm256_sad_epu8
(
_0
,
_mm256_add_epi8
(
_popc0
,
_popc1
)));
}
_r0
=
_mm256_add_epi32
(
_r0
,
_mm256_shuffle_epi32
(
_r0
,
2
));
result
=
_mm256_extract_epi32_
(
_mm256_add_epi32
(
_r0
,
_mm256_permute2x128_si256
(
_r0
,
_r0
,
1
)),
0
);
#if CV_SIMD && CV_SIMD_WIDTH > 16
{
v_uint64
t
=
vx_setzero_u64
();
for
(;
i
<=
n
-
v_uint8
::
nlanes
;
i
+=
v_uint8
::
nlanes
)
t
+=
v_popcount
(
v_reinterpret_as_u64
(
vx_load
(
a
+
i
)));
result
=
(
int
)
v_reduce_sum
(
t
);
}
#endif
// CV_AVX2
#endif
#if CV_POPCNT
{
...
...
@@ -68,18 +55,14 @@ int normHamming(const uchar* a, int n)
result
+=
CV_POPCNT_U32
(
*
(
uint
*
)(
a
+
i
));
}
}
#endif // CV_POPCNT
#if CV_SIMD128
#elif CV_SIMD
{
v_uint
32x4
t
=
v_setzero_u32
();
v_uint
64x2
t
=
v_setzero_u64
();
for
(;
i
<=
n
-
v_uint8x16
::
nlanes
;
i
+=
v_uint8x16
::
nlanes
)
{
t
+=
v_popcount
(
v_load
(
a
+
i
));
}
result
+=
v_reduce_sum
(
t
);
t
+=
v_popcount
(
v_reinterpret_as_u64
(
v_load
(
a
+
i
)));
result
+=
(
int
)
v_reduce_sum
(
t
);
}
#endif
// CV_SIMD128
#endif
#if CV_ENABLE_UNROLLED
for
(;
i
<=
n
-
4
;
i
+=
4
)
{
...
...
@@ -100,31 +83,15 @@ int normHamming(const uchar* a, const uchar* b, int n)
int
i
=
0
;
int
result
=
0
;
#if CV_AVX2
{
__m256i
_r0
=
_mm256_setzero_si256
();
__m256i
_0
=
_mm256_setzero_si256
();
__m256i
_popcnt_table
=
_mm256_setr_epi8
(
0
,
1
,
1
,
2
,
1
,
2
,
2
,
3
,
1
,
2
,
2
,
3
,
2
,
3
,
3
,
4
,
0
,
1
,
1
,
2
,
1
,
2
,
2
,
3
,
1
,
2
,
2
,
3
,
2
,
3
,
3
,
4
);
__m256i
_popcnt_mask
=
_mm256_set1_epi8
(
0x0F
);
for
(;
i
<=
n
-
32
;
i
+=
32
)
{
__m256i
_a0
=
_mm256_loadu_si256
((
const
__m256i
*
)(
a
+
i
));
__m256i
_b0
=
_mm256_loadu_si256
((
const
__m256i
*
)(
b
+
i
));
__m256i
_xor
=
_mm256_xor_si256
(
_a0
,
_b0
);
__m256i
_popc0
=
_mm256_shuffle_epi8
(
_popcnt_table
,
_mm256_and_si256
(
_xor
,
_popcnt_mask
));
__m256i
_popc1
=
_mm256_shuffle_epi8
(
_popcnt_table
,
_mm256_and_si256
(
_mm256_srli_epi16
(
_xor
,
4
),
_popcnt_mask
));
_r0
=
_mm256_add_epi32
(
_r0
,
_mm256_sad_epu8
(
_0
,
_mm256_add_epi8
(
_popc0
,
_popc1
)));
}
_r0
=
_mm256_add_epi32
(
_r0
,
_mm256_shuffle_epi32
(
_r0
,
2
));
result
=
_mm256_extract_epi32_
(
_mm256_add_epi32
(
_r0
,
_mm256_permute2x128_si256
(
_r0
,
_r0
,
1
)),
0
);
#if CV_SIMD && CV_SIMD_WIDTH > 16
{
v_uint64
t
=
vx_setzero_u64
();
for
(;
i
<=
n
-
v_uint8
::
nlanes
;
i
+=
v_uint8
::
nlanes
)
t
+=
v_popcount
(
v_reinterpret_as_u64
(
vx_load
(
a
+
i
)
^
vx_load
(
b
+
i
)));
result
+=
(
int
)
v_reduce_sum
(
t
);
}
#endif
// CV_AVX2
#endif
#if CV_POPCNT
{
...
...
@@ -139,18 +106,14 @@ int normHamming(const uchar* a, const uchar* b, int n)
result
+=
CV_POPCNT_U32
(
*
(
uint
*
)(
a
+
i
)
^
*
(
uint
*
)(
b
+
i
));
}
}
#endif // CV_POPCNT
#if CV_SIMD128
#elif CV_SIMD
{
v_uint
32x4
t
=
v_setzero_u32
();
v_uint
64x2
t
=
v_setzero_u64
();
for
(;
i
<=
n
-
v_uint8x16
::
nlanes
;
i
+=
v_uint8x16
::
nlanes
)
{
t
+=
v_popcount
(
v_load
(
a
+
i
)
^
v_load
(
b
+
i
));
}
result
+=
v_reduce_sum
(
t
);
t
+=
v_popcount
(
v_reinterpret_as_u64
(
vx_load
(
a
+
i
)
^
vx_load
(
b
+
i
)));
result
+=
(
int
)
v_reduce_sum
(
t
);
}
#endif
// CV_SIMD128
#endif
#if CV_ENABLE_UNROLLED
for
(;
i
<=
n
-
4
;
i
+=
4
)
{
...
...
modules/core/test/test_intrin_utils.hpp
View file @
84fd8190
...
...
@@ -686,18 +686,24 @@ template<typename R> struct TheTest
TheTest
&
test_popcount
()
{
typedef
typename
V_RegTraits
<
R
>::
u_reg
Ru
;
static
unsigned
popcountTable
[]
=
{
0
,
1
,
2
,
4
,
5
,
7
,
9
,
12
,
13
,
15
,
17
,
20
,
22
,
25
,
28
,
32
,
33
,
35
,
37
,
40
,
42
,
45
,
48
,
52
,
54
,
57
,
60
,
64
,
67
,
71
,
75
,
80
,
81
,
83
,
85
,
88
,
90
,
93
,
96
,
100
,
102
,
105
,
108
,
112
,
115
,
119
,
123
,
128
,
130
,
133
,
136
,
140
,
143
,
147
,
151
,
156
,
159
,
163
,
167
,
172
,
176
,
181
,
186
,
192
,
193
0
,
1
,
1
,
2
,
1
,
2
,
2
,
3
,
1
,
2
,
2
,
3
,
2
,
3
,
3
,
4
,
//0x00-0x0f
1
,
2
,
2
,
3
,
2
,
3
,
3
,
4
,
2
,
3
,
3
,
4
,
3
,
4
,
4
,
5
,
//0x10-0x1f
1
,
2
,
2
,
3
,
2
,
3
,
3
,
4
,
2
,
3
,
3
,
4
,
3
,
4
,
4
,
5
,
//0x20-0x2f
2
,
3
,
3
,
4
,
3
,
4
,
4
,
5
,
3
,
4
,
4
,
5
,
4
,
5
,
5
,
6
,
//0x30-0x3f
1
,
2
,
2
,
3
,
2
,
3
,
3
,
4
,
2
,
3
,
3
,
4
,
3
,
4
,
4
,
5
,
//0x40-0x4f
2
,
3
,
3
,
4
,
3
,
4
,
4
,
5
,
3
,
4
,
4
,
5
,
4
,
5
,
5
,
6
,
//0x50-0x5f
2
,
3
,
3
,
4
,
3
,
4
,
4
,
5
,
3
,
4
,
4
,
5
,
4
,
5
,
5
,
6
,
//0x60-0x6f
3
,
4
,
4
,
5
,
4
,
5
,
5
,
6
,
4
,
5
,
5
,
6
,
5
,
6
,
6
,
7
,
//0x70-0x7f
1
//0x80
};
Data
<
R
>
dataA
;
R
a
=
dataA
;
unsigned
resB
=
(
unsigned
)
v_reduce_sum
(
v_popcount
(
a
));
EXPECT_EQ
(
popcountTable
[
R
::
nlanes
],
resB
);
Data
<
Ru
>
resB
=
v_popcount
(
a
);
for
(
int
i
=
0
;
i
<
Ru
::
nlanes
;
++
i
)
EXPECT_EQ
(
popcountTable
[
i
+
1
],
resB
[
i
]);
return
*
this
;
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment