Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
a22f03e7
Commit
a22f03e7
authored
Jan 18, 2017
by
Alexander Alekhin
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #7863 from tomoaki0705:universalIntrinsicPopcount
parents
8f96b15e
8b22099d
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
177 additions
and
43 deletions
+177
-43
cvdef.h
modules/core/include/opencv2/core/cvdef.h
+8
-0
intrin_cpp.hpp
modules/core/include/opencv2/core/hal/intrin_cpp.hpp
+44
-1
intrin_neon.hpp
modules/core/include/opencv2/core/hal/intrin_neon.hpp
+16
-0
intrin_sse.hpp
modules/core/include/opencv2/core/hal/intrin_sse.hpp
+22
-0
stat.cpp
modules/core/src/stat.cpp
+69
-42
test_intrin.cpp
modules/core/test/test_intrin.cpp
+18
-0
No files found.
modules/core/include/opencv2/core/cvdef.h
View file @
a22f03e7
...
...
@@ -188,8 +188,16 @@ enum CpuFeatures {
# if defined __POPCNT__ || (defined _MSC_VER && _MSC_VER >= 1500)
# ifdef _MSC_VER
# include <nmmintrin.h>
# if defined(_M_X64)
# define CV_POPCNT_U64 _mm_popcnt_u64
# endif
# define CV_POPCNT_U32 _mm_popcnt_u32
# else
# include <popcntintrin.h>
# if defined(__x86_64__)
# define CV_POPCNT_U64 __builtin_popcountll
# endif
# define CV_POPCNT_U32 __builtin_popcount
# endif
# define CV_POPCNT 1
# endif
...
...
modules/core/include/opencv2/core/hal/intrin_cpp.hpp
View file @
a22f03e7
...
...
@@ -149,7 +149,7 @@ Element-wise binary and unary operations.
Most of these operations return only one value.
- Reduce: @ref v_reduce_min, @ref v_reduce_max, @ref v_reduce_sum
- Reduce: @ref v_reduce_min, @ref v_reduce_max, @ref v_reduce_sum
, @ref v_popcount
- Mask: @ref v_signmask, @ref v_check_all, @ref v_check_any, @ref v_select
### Other math
...
...
@@ -574,6 +574,49 @@ Scheme:
For 32-bit integer and 32-bit floating point types. */
OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC
(
v_reduce_max
,
std
::
max
)
static
const
unsigned
char
popCountTable
[]
=
{
0
,
1
,
1
,
2
,
1
,
2
,
2
,
3
,
1
,
2
,
2
,
3
,
2
,
3
,
3
,
4
,
1
,
2
,
2
,
3
,
2
,
3
,
3
,
4
,
2
,
3
,
3
,
4
,
3
,
4
,
4
,
5
,
1
,
2
,
2
,
3
,
2
,
3
,
3
,
4
,
2
,
3
,
3
,
4
,
3
,
4
,
4
,
5
,
2
,
3
,
3
,
4
,
3
,
4
,
4
,
5
,
3
,
4
,
4
,
5
,
4
,
5
,
5
,
6
,
1
,
2
,
2
,
3
,
2
,
3
,
3
,
4
,
2
,
3
,
3
,
4
,
3
,
4
,
4
,
5
,
2
,
3
,
3
,
4
,
3
,
4
,
4
,
5
,
3
,
4
,
4
,
5
,
4
,
5
,
5
,
6
,
2
,
3
,
3
,
4
,
3
,
4
,
4
,
5
,
3
,
4
,
4
,
5
,
4
,
5
,
5
,
6
,
3
,
4
,
4
,
5
,
4
,
5
,
5
,
6
,
4
,
5
,
5
,
6
,
5
,
6
,
6
,
7
,
1
,
2
,
2
,
3
,
2
,
3
,
3
,
4
,
2
,
3
,
3
,
4
,
3
,
4
,
4
,
5
,
2
,
3
,
3
,
4
,
3
,
4
,
4
,
5
,
3
,
4
,
4
,
5
,
4
,
5
,
5
,
6
,
2
,
3
,
3
,
4
,
3
,
4
,
4
,
5
,
3
,
4
,
4
,
5
,
4
,
5
,
5
,
6
,
3
,
4
,
4
,
5
,
4
,
5
,
5
,
6
,
4
,
5
,
5
,
6
,
5
,
6
,
6
,
7
,
2
,
3
,
3
,
4
,
3
,
4
,
4
,
5
,
3
,
4
,
4
,
5
,
4
,
5
,
5
,
6
,
3
,
4
,
4
,
5
,
4
,
5
,
5
,
6
,
4
,
5
,
5
,
6
,
5
,
6
,
6
,
7
,
3
,
4
,
4
,
5
,
4
,
5
,
5
,
6
,
4
,
5
,
5
,
6
,
5
,
6
,
6
,
7
,
4
,
5
,
5
,
6
,
5
,
6
,
6
,
7
,
5
,
6
,
6
,
7
,
6
,
7
,
7
,
8
,
};
/** @brief Count the 1 bits in the vector and return 4 values
Scheme:
@code
{A1 A2 A3 ...} => popcount(A1)
@endcode
Any types but result will be in v_uint32x4*/
template
<
typename
_Tp
,
int
n
>
inline
v_uint32x4
v_popcount
(
const
v_reg
<
_Tp
,
n
>&
a
)
{
v_uint8x16
b
;
b
=
v_reinterpret_as_u8
(
a
);
for
(
int
i
=
0
;
i
<
v_uint8x16
::
nlanes
;
i
++
)
{
b
.
s
[
i
]
=
popCountTable
[
b
.
s
[
i
]];
}
v_uint32x4
c
;
for
(
int
i
=
0
;
i
<
v_uint32x4
::
nlanes
;
i
++
)
{
c
.
s
[
i
]
=
b
.
s
[
i
*
4
]
+
b
.
s
[
i
*
4
+
1
]
+
b
.
s
[
i
*
4
+
2
]
+
b
.
s
[
i
*
4
+
3
];
}
return
c
;
}
//! @cond IGNORED
template
<
typename
_Tp
,
int
n
>
inline
void
v_minmax
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
,
...
...
modules/core/include/opencv2/core/hal/intrin_neon.hpp
View file @
a22f03e7
...
...
@@ -813,6 +813,22 @@ OPENCV_HAL_IMPL_NEON_REDUCE_OP_4(v_float32x4, float32x2, float, sum, add, f32)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4
(
v_float32x4
,
float32x2
,
float
,
max
,
max
,
f32
)
OPENCV_HAL_IMPL_NEON_REDUCE_OP_4
(
v_float32x4
,
float32x2
,
float
,
min
,
min
,
f32
)
#define OPENCV_HAL_IMPL_NEON_POPCOUNT(_Tpvec, cast) \
inline v_uint32x4 v_popcount(const _Tpvec& a) \
{ \
uint8x16_t t = vcntq_u8(cast(a.val)); \
uint16x8_t t0 = vpaddlq_u8(t);
/* 16 -> 8 */
\
uint32x4_t t1 = vpaddlq_u16(t0);
/* 8 -> 4 */
\
return v_uint32x4(t1); \
}
OPENCV_HAL_IMPL_NEON_POPCOUNT
(
v_uint8x16
,
OPENCV_HAL_NOP
)
OPENCV_HAL_IMPL_NEON_POPCOUNT
(
v_uint16x8
,
vreinterpretq_u8_u16
)
OPENCV_HAL_IMPL_NEON_POPCOUNT
(
v_uint32x4
,
vreinterpretq_u8_u32
)
OPENCV_HAL_IMPL_NEON_POPCOUNT
(
v_int8x16
,
vreinterpretq_u8_s8
)
OPENCV_HAL_IMPL_NEON_POPCOUNT
(
v_int16x8
,
vreinterpretq_u8_s16
)
OPENCV_HAL_IMPL_NEON_POPCOUNT
(
v_int32x4
,
vreinterpretq_u8_s32
)
inline
int
v_signmask
(
const
v_uint8x16
&
a
)
{
int8x8_t
m0
=
vcreate_s8
(
CV_BIG_UINT
(
0x0706050403020100
));
...
...
modules/core/include/opencv2/core/hal/intrin_sse.hpp
View file @
a22f03e7
...
...
@@ -1121,6 +1121,28 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, sum, OPENCV_HAL_ADD)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4
(
v_float32x4
,
float
,
max
,
std
::
max
)
OPENCV_HAL_IMPL_SSE_REDUCE_OP_4
(
v_float32x4
,
float
,
min
,
std
::
min
)
#define OPENCV_HAL_IMPL_SSE_POPCOUNT(_Tpvec) \
inline v_uint32x4 v_popcount(const _Tpvec& a) \
{ \
__m128i m1 = _mm_set1_epi32(0x55555555); \
__m128i m2 = _mm_set1_epi32(0x33333333); \
__m128i m4 = _mm_set1_epi32(0x0f0f0f0f); \
__m128i p = a.val; \
p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 1), m1), _mm_and_si128(p, m1)); \
p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 2), m2), _mm_and_si128(p, m2)); \
p = _mm_add_epi32(_mm_and_si128(_mm_srli_epi32(p, 4), m4), _mm_and_si128(p, m4)); \
p = _mm_adds_epi8(p, _mm_srli_si128(p, 1)); \
p = _mm_adds_epi8(p, _mm_srli_si128(p, 2)); \
return v_uint32x4(_mm_and_si128(p, _mm_set1_epi32(0x000000ff))); \
}
OPENCV_HAL_IMPL_SSE_POPCOUNT
(
v_uint8x16
)
OPENCV_HAL_IMPL_SSE_POPCOUNT
(
v_uint16x8
)
OPENCV_HAL_IMPL_SSE_POPCOUNT
(
v_uint32x4
)
OPENCV_HAL_IMPL_SSE_POPCOUNT
(
v_int8x16
)
OPENCV_HAL_IMPL_SSE_POPCOUNT
(
v_int16x8
)
OPENCV_HAL_IMPL_SSE_POPCOUNT
(
v_int32x4
)
#define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, pack_op, and_op, signmask, allmask) \
inline int v_signmask(const _Tpvec& a) \
{ \
...
...
modules/core/src/stat.cpp
View file @
a22f03e7
...
...
@@ -44,6 +44,7 @@
#include "precomp.hpp"
#include <climits>
#include <limits>
#include "opencv2/core/hal/intrin.hpp"
#include "opencl_kernels_core.hpp"
...
...
@@ -4238,22 +4239,8 @@ int normHamming(const uchar* a, int n)
{
int
i
=
0
;
int
result
=
0
;
#if CV_NEON
{
uint32x4_t
bits
=
vmovq_n_u32
(
0
);
for
(;
i
<=
n
-
16
;
i
+=
16
)
{
uint8x16_t
A_vec
=
vld1q_u8
(
a
+
i
);
uint8x16_t
bitsSet
=
vcntq_u8
(
A_vec
);
uint16x8_t
bitSet8
=
vpaddlq_u8
(
bitsSet
);
uint32x4_t
bitSet4
=
vpaddlq_u16
(
bitSet8
);
bits
=
vaddq_u32
(
bits
,
bitSet4
);
}
uint64x2_t
bitSet2
=
vpaddlq_u32
(
bits
);
result
=
vgetq_lane_s32
(
vreinterpretq_s32_u64
(
bitSet2
),
0
);
result
+=
vgetq_lane_s32
(
vreinterpretq_s32_u64
(
bitSet2
),
2
);
}
#elif CV_AVX2
if
(
USE_AVX2
)
#if CV_AVX2
if
(
USE_AVX2
)
{
__m256i
_r0
=
_mm256_setzero_si256
();
__m256i
_0
=
_mm256_setzero_si256
();
...
...
@@ -4274,12 +4261,40 @@ int normHamming(const uchar* a, int n)
_r0
=
_mm256_add_epi32
(
_r0
,
_mm256_shuffle_epi32
(
_r0
,
2
));
result
=
_mm256_extract_epi32_
(
_mm256_add_epi32
(
_r0
,
_mm256_permute2x128_si256
(
_r0
,
_r0
,
1
)),
0
);
}
#elif CV_POPCNT
if
(
checkHardwareSupport
(
CV_CPU_POPCNT
))
{
# if defined CV_POPCNT_U64
for
(;
i
<=
n
-
8
;
i
+=
8
)
{
result
+=
(
int
)
CV_POPCNT_U64
(
*
(
uint64
*
)(
a
+
i
));
}
# endif
for
(;
i
<=
n
-
4
;
i
+=
4
)
{
result
+=
CV_POPCNT_U32
(
*
(
uint
*
)(
a
+
i
));
}
}
#elif CV_SIMD128
if
(
hasSIMD128
())
{
v_uint32x4
t
=
v_setzero_u32
();
for
(;
i
<=
n
-
v_uint8x16
::
nlanes
;
i
+=
v_uint8x16
::
nlanes
)
{
t
+=
v_popcount
(
v_load
(
a
+
i
));
}
result
=
v_reduce_sum
(
t
);
}
#endif
for
(
;
i
<=
n
-
4
;
i
+=
4
)
result
+=
popCountTable
[
a
[
i
]]
+
popCountTable
[
a
[
i
+
1
]]
+
popCountTable
[
a
[
i
+
2
]]
+
popCountTable
[
a
[
i
+
3
]];
for
(
;
i
<
n
;
i
++
)
for
(;
i
<=
n
-
4
;
i
+=
4
)
{
result
+=
popCountTable
[
a
[
i
]]
+
popCountTable
[
a
[
i
+
1
]]
+
popCountTable
[
a
[
i
+
2
]]
+
popCountTable
[
a
[
i
+
3
]];
}
for
(;
i
<
n
;
i
++
)
{
result
+=
popCountTable
[
a
[
i
]];
}
return
result
;
}
...
...
@@ -4287,24 +4302,8 @@ int normHamming(const uchar* a, const uchar* b, int n)
{
int
i
=
0
;
int
result
=
0
;
#if CV_NEON
{
uint32x4_t
bits
=
vmovq_n_u32
(
0
);
for
(;
i
<=
n
-
16
;
i
+=
16
)
{
uint8x16_t
A_vec
=
vld1q_u8
(
a
+
i
);
uint8x16_t
B_vec
=
vld1q_u8
(
b
+
i
);
uint8x16_t
AxorB
=
veorq_u8
(
A_vec
,
B_vec
);
uint8x16_t
bitsSet
=
vcntq_u8
(
AxorB
);
uint16x8_t
bitSet8
=
vpaddlq_u8
(
bitsSet
);
uint32x4_t
bitSet4
=
vpaddlq_u16
(
bitSet8
);
bits
=
vaddq_u32
(
bits
,
bitSet4
);
}
uint64x2_t
bitSet2
=
vpaddlq_u32
(
bits
);
result
=
vgetq_lane_s32
(
vreinterpretq_s32_u64
(
bitSet2
),
0
);
result
+=
vgetq_lane_s32
(
vreinterpretq_s32_u64
(
bitSet2
),
2
);
}
#elif CV_AVX2
if
(
USE_AVX2
)
#if CV_AVX2
if
(
USE_AVX2
)
{
__m256i
_r0
=
_mm256_setzero_si256
();
__m256i
_0
=
_mm256_setzero_si256
();
...
...
@@ -4328,12 +4327,40 @@ int normHamming(const uchar* a, const uchar* b, int n)
_r0
=
_mm256_add_epi32
(
_r0
,
_mm256_shuffle_epi32
(
_r0
,
2
));
result
=
_mm256_extract_epi32_
(
_mm256_add_epi32
(
_r0
,
_mm256_permute2x128_si256
(
_r0
,
_r0
,
1
)),
0
);
}
#elif CV_POPCNT
if
(
checkHardwareSupport
(
CV_CPU_POPCNT
))
{
# if defined CV_POPCNT_U64
for
(;
i
<=
n
-
8
;
i
+=
8
)
{
result
+=
(
int
)
CV_POPCNT_U64
(
*
(
uint64
*
)(
a
+
i
)
^
*
(
uint64
*
)(
b
+
i
));
}
# endif
for
(;
i
<=
n
-
4
;
i
+=
4
)
{
result
+=
CV_POPCNT_U32
(
*
(
uint
*
)(
a
+
i
)
^
*
(
uint
*
)(
b
+
i
));
}
}
#elif CV_SIMD128
if
(
hasSIMD128
())
{
v_uint32x4
t
=
v_setzero_u32
();
for
(;
i
<=
n
-
v_uint8x16
::
nlanes
;
i
+=
v_uint8x16
::
nlanes
)
{
t
+=
v_popcount
(
v_load
(
a
+
i
)
^
v_load
(
b
+
i
));
}
result
=
v_reduce_sum
(
t
);
}
#endif
for
(
;
i
<=
n
-
4
;
i
+=
4
)
result
+=
popCountTable
[
a
[
i
]
^
b
[
i
]]
+
popCountTable
[
a
[
i
+
1
]
^
b
[
i
+
1
]]
+
popCountTable
[
a
[
i
+
2
]
^
b
[
i
+
2
]]
+
popCountTable
[
a
[
i
+
3
]
^
b
[
i
+
3
]];
for
(
;
i
<
n
;
i
++
)
for
(;
i
<=
n
-
4
;
i
+=
4
)
{
result
+=
popCountTable
[
a
[
i
]
^
b
[
i
]]
+
popCountTable
[
a
[
i
+
1
]
^
b
[
i
+
1
]]
+
popCountTable
[
a
[
i
+
2
]
^
b
[
i
+
2
]]
+
popCountTable
[
a
[
i
+
3
]
^
b
[
i
+
3
]];
}
for
(;
i
<
n
;
i
++
)
{
result
+=
popCountTable
[
a
[
i
]
^
b
[
i
]];
}
return
result
;
}
...
...
modules/core/test/test_intrin.cpp
View file @
a22f03e7
...
...
@@ -404,6 +404,18 @@ template<typename R> struct TheTest
return
*
this
;
}
TheTest
&
test_popcount
()
{
static
unsigned
popcountTable
[]
=
{
0
,
1
,
2
,
4
,
5
,
7
,
9
,
12
,
13
,
15
,
17
,
20
,
22
,
25
,
28
,
32
,
33
};
Data
<
R
>
dataA
;
R
a
=
dataA
;
unsigned
resB
=
(
unsigned
)
v_reduce_sum
(
v_popcount
(
a
));
EXPECT_EQ
(
popcountTable
[
R
::
nlanes
],
resB
);
return
*
this
;
}
TheTest
&
test_absdiff
()
{
typedef
typename
V_RegTrait128
<
LaneType
>::
u_reg
Ru
;
...
...
@@ -798,6 +810,7 @@ TEST(hal_intrin, uint8x16) {
.
test_min_max
()
.
test_absdiff
()
.
test_mask
()
.
test_popcount
()
.
test_pack
<
1
>
().
test_pack
<
2
>
().
test_pack
<
3
>
().
test_pack
<
8
>
()
.
test_pack_u
<
1
>
().
test_pack_u
<
2
>
().
test_pack_u
<
3
>
().
test_pack_u
<
8
>
()
.
test_unpack
()
...
...
@@ -819,6 +832,7 @@ TEST(hal_intrin, int8x16) {
.
test_absdiff
()
.
test_abs
()
.
test_mask
()
.
test_popcount
()
.
test_pack
<
1
>
().
test_pack
<
2
>
().
test_pack
<
3
>
().
test_pack
<
8
>
()
.
test_unpack
()
.
test_extract
<
0
>
().
test_extract
<
1
>
().
test_extract
<
8
>
().
test_extract
<
15
>
()
...
...
@@ -844,6 +858,7 @@ TEST(hal_intrin, uint16x8) {
.
test_absdiff
()
.
test_reduce
()
.
test_mask
()
.
test_popcount
()
.
test_pack
<
1
>
().
test_pack
<
2
>
().
test_pack
<
7
>
().
test_pack
<
16
>
()
.
test_pack_u
<
1
>
().
test_pack_u
<
2
>
().
test_pack_u
<
7
>
().
test_pack_u
<
16
>
()
.
test_unpack
()
...
...
@@ -870,6 +885,7 @@ TEST(hal_intrin, int16x8) {
.
test_abs
()
.
test_reduce
()
.
test_mask
()
.
test_popcount
()
.
test_pack
<
1
>
().
test_pack
<
2
>
().
test_pack
<
7
>
().
test_pack
<
16
>
()
.
test_unpack
()
.
test_extract
<
0
>
().
test_extract
<
1
>
().
test_extract
<
4
>
().
test_extract
<
7
>
()
...
...
@@ -894,6 +910,7 @@ TEST(hal_intrin, uint32x4) {
.
test_absdiff
()
.
test_reduce
()
.
test_mask
()
.
test_popcount
()
.
test_pack
<
1
>
().
test_pack
<
2
>
().
test_pack
<
15
>
().
test_pack
<
32
>
()
.
test_unpack
()
.
test_extract
<
0
>
().
test_extract
<
1
>
().
test_extract
<
2
>
().
test_extract
<
3
>
()
...
...
@@ -910,6 +927,7 @@ TEST(hal_intrin, int32x4) {
.
test_mul
()
.
test_abs
()
.
test_cmp
()
.
test_popcount
()
.
test_shift
<
1
>
().
test_shift
<
8
>
()
.
test_logic
()
.
test_min_max
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment