Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
7a55f2af
Commit
7a55f2af
authored
May 14, 2019
by
Vitaly Tuzov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Updated AVX2 implementation of v_popcount for u8.
parent
1220dd48
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
32 additions
and
82 deletions
+32
-82
intrin_avx.hpp
modules/core/include/opencv2/core/hal/intrin_avx.hpp
+6
-16
intrin_sse.hpp
modules/core/include/opencv2/core/hal/intrin_sse.hpp
+1
-8
intrin_vsx.hpp
modules/core/include/opencv2/core/hal/intrin_vsx.hpp
+2
-2
stat.simd.hpp
modules/core/src/stat.simd.hpp
+23
-56
No files found.
modules/core/include/opencv2/core/hal/intrin_avx.hpp
View file @
7a55f2af
...
@@ -1188,14 +1188,11 @@ inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
...
@@ -1188,14 +1188,11 @@ inline float v_reduce_sad(const v_float32x8& a, const v_float32x8& b)
/** Popcount **/
/** Popcount **/
inline
v_uint8x32
v_popcount
(
const
v_uint8x32
&
a
)
inline
v_uint8x32
v_popcount
(
const
v_uint8x32
&
a
)
{
{
__m256i
m1
=
_mm256_set1_epi32
(
0x55555555
);
__m256i
_popcnt_table
=
_mm256_setr_epi8
(
0
,
1
,
1
,
2
,
1
,
2
,
2
,
3
,
1
,
2
,
2
,
3
,
2
,
3
,
3
,
4
,
__m256i
m2
=
_mm256_set1_epi32
(
0x33333333
);
0
,
1
,
1
,
2
,
1
,
2
,
2
,
3
,
1
,
2
,
2
,
3
,
2
,
3
,
3
,
4
);
__m256i
m4
=
_mm256_set1_epi32
(
0x0f0f0f0f
);
__m256i
_popcnt_mask
=
_mm256_set1_epi8
(
0x0F
);
__m256i
p
=
a
.
val
;
return
v_uint8x32
(
_mm256_add_epi8
(
_mm256_shuffle_epi8
(
_popcnt_table
,
_mm256_and_si256
(
a
.
val
,
_popcnt_mask
)),
p
=
_mm256_add_epi32
(
_mm256_and_si256
(
_mm256_srli_epi32
(
p
,
1
),
m1
),
_mm256_and_si256
(
p
,
m1
));
_mm256_shuffle_epi8
(
_popcnt_table
,
_mm256_and_si256
(
_mm256_srli_epi16
(
a
.
val
,
4
),
_popcnt_mask
))));
p
=
_mm256_add_epi32
(
_mm256_and_si256
(
_mm256_srli_epi32
(
p
,
2
),
m2
),
_mm256_and_si256
(
p
,
m2
));
p
=
_mm256_add_epi32
(
_mm256_and_si256
(
_mm256_srli_epi32
(
p
,
4
),
m4
),
_mm256_and_si256
(
p
,
m4
));
return
v_uint8x32
(
p
);
}
}
inline
v_uint16x16
v_popcount
(
const
v_uint16x16
&
a
)
inline
v_uint16x16
v_popcount
(
const
v_uint16x16
&
a
)
{
{
...
@@ -1212,14 +1209,7 @@ inline v_uint32x8 v_popcount(const v_uint32x8& a)
...
@@ -1212,14 +1209,7 @@ inline v_uint32x8 v_popcount(const v_uint32x8& a)
}
}
inline
v_uint64x4
v_popcount
(
const
v_uint64x4
&
a
)
inline
v_uint64x4
v_popcount
(
const
v_uint64x4
&
a
)
{
{
__m256i
m1
=
_mm256_set1_epi32
(
0x55555555
);
return
v_uint64x4
(
_mm256_sad_epu8
(
v_popcount
(
v_reinterpret_as_u8
(
a
)).
val
,
_mm256_setzero_si256
()));
__m256i
m2
=
_mm256_set1_epi32
(
0x33333333
);
__m256i
m4
=
_mm256_set1_epi32
(
0x0f0f0f0f
);
__m256i
p
=
a
.
val
;
p
=
_mm256_add_epi32
(
_mm256_and_si256
(
_mm256_srli_epi32
(
p
,
1
),
m1
),
_mm256_and_si256
(
p
,
m1
));
p
=
_mm256_add_epi32
(
_mm256_and_si256
(
_mm256_srli_epi32
(
p
,
2
),
m2
),
_mm256_and_si256
(
p
,
m2
));
p
=
_mm256_add_epi32
(
_mm256_and_si256
(
_mm256_srli_epi32
(
p
,
4
),
m4
),
_mm256_and_si256
(
p
,
m4
));
return
v_uint64x4
(
_mm256_sad_epu8
(
p
,
_mm256_setzero_si256
()));
}
}
inline
v_uint8x32
v_popcount
(
const
v_int8x32
&
a
)
inline
v_uint8x32
v_popcount
(
const
v_int8x32
&
a
)
{
return
v_popcount
(
v_reinterpret_as_u8
(
a
));
}
{
return
v_popcount
(
v_reinterpret_as_u8
(
a
));
}
...
...
modules/core/include/opencv2/core/hal/intrin_sse.hpp
View file @
7a55f2af
...
@@ -1580,14 +1580,7 @@ inline v_uint32x4 v_popcount(const v_uint32x4& a)
...
@@ -1580,14 +1580,7 @@ inline v_uint32x4 v_popcount(const v_uint32x4& a)
}
}
inline
v_uint64x2
v_popcount
(
const
v_uint64x2
&
a
)
inline
v_uint64x2
v_popcount
(
const
v_uint64x2
&
a
)
{
{
__m128i
m1
=
_mm_set1_epi32
(
0x55555555
);
return
v_uint64x2
(
_mm_sad_epu8
(
v_popcount
(
v_reinterpret_as_u8
(
a
)).
val
,
_mm_setzero_si128
()));
__m128i
m2
=
_mm_set1_epi32
(
0x33333333
);
__m128i
m4
=
_mm_set1_epi32
(
0x0f0f0f0f
);
__m128i
p
=
a
.
val
;
p
=
_mm_add_epi32
(
_mm_and_si128
(
_mm_srli_epi32
(
p
,
1
),
m1
),
_mm_and_si128
(
p
,
m1
));
p
=
_mm_add_epi32
(
_mm_and_si128
(
_mm_srli_epi32
(
p
,
2
),
m2
),
_mm_and_si128
(
p
,
m2
));
p
=
_mm_add_epi32
(
_mm_and_si128
(
_mm_srli_epi32
(
p
,
4
),
m4
),
_mm_and_si128
(
p
,
m4
));
return
v_uint64x2
(
_mm_sad_epu8
(
p
,
_mm_setzero_si128
()));
}
}
inline
v_uint8x16
v_popcount
(
const
v_int8x16
&
a
)
inline
v_uint8x16
v_popcount
(
const
v_int8x16
&
a
)
{
return
v_popcount
(
v_reinterpret_as_u8
(
a
));
}
{
return
v_popcount
(
v_reinterpret_as_u8
(
a
));
}
...
...
modules/core/include/opencv2/core/hal/intrin_vsx.hpp
View file @
7a55f2af
...
@@ -766,8 +766,8 @@ inline scalartype v_reduce_##suffix(const _Tpvec& a)
...
@@ -766,8 +766,8 @@ inline scalartype v_reduce_##suffix(const _Tpvec& a)
}
}
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8
(
v_uint8x16
,
vec_uchar16
,
uchar
,
max
,
vec_max
)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8
(
v_uint8x16
,
vec_uchar16
,
uchar
,
max
,
vec_max
)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8
(
v_uint8x16
,
vec_uchar16
,
uchar
,
min
,
vec_min
)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8
(
v_uint8x16
,
vec_uchar16
,
uchar
,
min
,
vec_min
)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8
(
v_int8x16
,
vec_char
8
,
schar
,
max
,
vec_max
)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8
(
v_int8x16
,
vec_char
16
,
schar
,
max
,
vec_max
)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8
(
v_int8x16
,
vec_char
8
,
schar
,
min
,
vec_min
)
OPENCV_HAL_IMPL_VSX_REDUCE_OP_8
(
v_int8x16
,
vec_char
16
,
schar
,
min
,
vec_min
)
inline
v_float32x4
v_reduce_sum4
(
const
v_float32x4
&
a
,
const
v_float32x4
&
b
,
inline
v_float32x4
v_reduce_sum4
(
const
v_float32x4
&
a
,
const
v_float32x4
&
b
,
const
v_float32x4
&
c
,
const
v_float32x4
&
d
)
const
v_float32x4
&
c
,
const
v_float32x4
&
d
)
...
...
modules/core/src/stat.simd.hpp
View file @
7a55f2af
...
@@ -32,28 +32,15 @@ int normHamming(const uchar* a, int n)
...
@@ -32,28 +32,15 @@ int normHamming(const uchar* a, int n)
int
i
=
0
;
int
i
=
0
;
int
result
=
0
;
int
result
=
0
;
#if CV_AVX2
{
__m256i
_r0
=
_mm256_setzero_si256
();
__m256i
_0
=
_mm256_setzero_si256
();
__m256i
_popcnt_table
=
_mm256_setr_epi8
(
0
,
1
,
1
,
2
,
1
,
2
,
2
,
3
,
1
,
2
,
2
,
3
,
2
,
3
,
3
,
4
,
0
,
1
,
1
,
2
,
1
,
2
,
2
,
3
,
1
,
2
,
2
,
3
,
2
,
3
,
3
,
4
);
__m256i
_popcnt_mask
=
_mm256_set1_epi8
(
0x0F
);
for
(;
i
<=
n
-
32
;
i
+=
32
)
{
__m256i
_a0
=
_mm256_loadu_si256
((
const
__m256i
*
)(
a
+
i
));
__m256i
_popc0
=
_mm256_shuffle_epi8
(
_popcnt_table
,
_mm256_and_si256
(
_a0
,
_popcnt_mask
));
#if CV_SIMD && CV_SIMD_WIDTH > 16
__m256i
_popc1
=
_mm256_shuffle_epi8
(
_popcnt_table
,
{
_mm256_and_si256
(
_mm256_srli_epi16
(
_a0
,
4
),
_popcnt_mask
));
v_uint64
t
=
vx_setzero_u64
();
for
(;
i
<=
n
-
v_uint8
::
nlanes
;
i
+=
v_uint8
::
nlanes
)
_r0
=
_mm256_add_epi32
(
_r0
,
_mm256_sad_epu8
(
_0
,
_mm256_add_epi8
(
_popc0
,
_popc1
)));
t
+=
v_popcount
(
v_reinterpret_as_u64
(
vx_load
(
a
+
i
)));
}
result
=
(
int
)
v_reduce_sum
(
t
);
_r0
=
_mm256_add_epi32
(
_r0
,
_mm256_shuffle_epi32
(
_r0
,
2
));
result
=
_mm256_extract_epi32_
(
_mm256_add_epi32
(
_r0
,
_mm256_permute2x128_si256
(
_r0
,
_r0
,
1
)),
0
);
}
}
#endif
// CV_AVX2
#endif
#if CV_POPCNT
#if CV_POPCNT
{
{
...
@@ -68,16 +55,14 @@ int normHamming(const uchar* a, int n)
...
@@ -68,16 +55,14 @@ int normHamming(const uchar* a, int n)
result
+=
CV_POPCNT_U32
(
*
(
uint
*
)(
a
+
i
));
result
+=
CV_POPCNT_U32
(
*
(
uint
*
)(
a
+
i
));
}
}
}
}
#endif // CV_POPCNT
#elif CV_SIMD
#if CV_SIMD
{
{
v_uint64
t
=
vx
_setzero_u64
();
v_uint64
x2
t
=
v
_setzero_u64
();
for
(;
i
<=
n
-
v_uint8
::
nlanes
;
i
+=
v_uint8
::
nlanes
)
for
(;
i
<=
n
-
v_uint8
x16
::
nlanes
;
i
+=
v_uint8x16
::
nlanes
)
t
+=
v_popcount
(
v_reinterpret_as_u64
(
v
x
_load
(
a
+
i
)));
t
+=
v_popcount
(
v_reinterpret_as_u64
(
v_load
(
a
+
i
)));
result
+=
(
int
)
v_reduce_sum
(
t
);
result
+=
(
int
)
v_reduce_sum
(
t
);
}
}
#endif
// CV_SIMD
#endif
#if CV_ENABLE_UNROLLED
#if CV_ENABLE_UNROLLED
for
(;
i
<=
n
-
4
;
i
+=
4
)
for
(;
i
<=
n
-
4
;
i
+=
4
)
{
{
...
@@ -98,31 +83,15 @@ int normHamming(const uchar* a, const uchar* b, int n)
...
@@ -98,31 +83,15 @@ int normHamming(const uchar* a, const uchar* b, int n)
int
i
=
0
;
int
i
=
0
;
int
result
=
0
;
int
result
=
0
;
#if CV_AVX2
{
__m256i
_r0
=
_mm256_setzero_si256
();
__m256i
_0
=
_mm256_setzero_si256
();
__m256i
_popcnt_table
=
_mm256_setr_epi8
(
0
,
1
,
1
,
2
,
1
,
2
,
2
,
3
,
1
,
2
,
2
,
3
,
2
,
3
,
3
,
4
,
0
,
1
,
1
,
2
,
1
,
2
,
2
,
3
,
1
,
2
,
2
,
3
,
2
,
3
,
3
,
4
);
__m256i
_popcnt_mask
=
_mm256_set1_epi8
(
0x0F
);
for
(;
i
<=
n
-
32
;
i
+=
32
)
{
__m256i
_a0
=
_mm256_loadu_si256
((
const
__m256i
*
)(
a
+
i
));
__m256i
_b0
=
_mm256_loadu_si256
((
const
__m256i
*
)(
b
+
i
));
__m256i
_xor
=
_mm256_xor_si256
(
_a0
,
_b0
);
__m256i
_popc0
=
_mm256_shuffle_epi8
(
_popcnt_table
,
_mm256_and_si256
(
_xor
,
_popcnt_mask
));
#if CV_SIMD && CV_SIMD_WIDTH > 16
__m256i
_popc1
=
_mm256_shuffle_epi8
(
_popcnt_table
,
{
_mm256_and_si256
(
_mm256_srli_epi16
(
_xor
,
4
),
_popcnt_mask
));
v_uint64
t
=
vx_setzero_u64
();
for
(;
i
<=
n
-
v_uint8
::
nlanes
;
i
+=
v_uint8
::
nlanes
)
_r0
=
_mm256_add_epi32
(
_r0
,
_mm256_sad_epu8
(
_0
,
_mm256_add_epi8
(
_popc0
,
_popc1
)));
t
+=
v_popcount
(
v_reinterpret_as_u64
(
vx_load
(
a
+
i
)
^
vx_load
(
b
+
i
)));
}
result
+=
(
int
)
v_reduce_sum
(
t
);
_r0
=
_mm256_add_epi32
(
_r0
,
_mm256_shuffle_epi32
(
_r0
,
2
));
result
=
_mm256_extract_epi32_
(
_mm256_add_epi32
(
_r0
,
_mm256_permute2x128_si256
(
_r0
,
_r0
,
1
)),
0
);
}
}
#endif
// CV_AVX2
#endif
#if CV_POPCNT
#if CV_POPCNT
{
{
...
@@ -137,16 +106,14 @@ int normHamming(const uchar* a, const uchar* b, int n)
...
@@ -137,16 +106,14 @@ int normHamming(const uchar* a, const uchar* b, int n)
result
+=
CV_POPCNT_U32
(
*
(
uint
*
)(
a
+
i
)
^
*
(
uint
*
)(
b
+
i
));
result
+=
CV_POPCNT_U32
(
*
(
uint
*
)(
a
+
i
)
^
*
(
uint
*
)(
b
+
i
));
}
}
}
}
#endif // CV_POPCNT
#elif CV_SIMD
#if CV_SIMD
{
{
v_uint64
t
=
vx
_setzero_u64
();
v_uint64
x2
t
=
v
_setzero_u64
();
for
(;
i
<=
n
-
v_uint8
::
nlanes
;
i
+=
v_uint8
::
nlanes
)
for
(;
i
<=
n
-
v_uint8
x16
::
nlanes
;
i
+=
v_uint8x16
::
nlanes
)
t
+=
v_popcount
(
v_reinterpret_as_u64
(
vx_load
(
a
+
i
)
^
vx_load
(
b
+
i
)));
t
+=
v_popcount
(
v_reinterpret_as_u64
(
vx_load
(
a
+
i
)
^
vx_load
(
b
+
i
)));
result
+=
(
int
)
v_reduce_sum
(
t
);
result
+=
(
int
)
v_reduce_sum
(
t
);
}
}
#endif
// CV_SIMD
#endif
#if CV_ENABLE_UNROLLED
#if CV_ENABLE_UNROLLED
for
(;
i
<=
n
-
4
;
i
+=
4
)
for
(;
i
<=
n
-
4
;
i
+=
4
)
{
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment