Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
cecc1857
Commit
cecc1857
authored
Oct 04, 2016
by
Alexander Alekhin
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #7373 from tomoaki0705:featureCannyUniversalIntrinsic
parents
9c90a5f5
841cccca
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
161 additions
and
155 deletions
+161
-155
intrin_neon.hpp
modules/core/include/opencv2/core/hal/intrin_neon.hpp
+7
-0
intrin_sse.hpp
modules/core/include/opencv2/core/hal/intrin_sse.hpp
+12
-0
test_intrin.cpp
modules/core/test/test_intrin.cpp
+21
-0
canny.cpp
modules/imgproc/src/canny.cpp
+121
-155
No files found.
modules/core/include/opencv2/core/hal/intrin_neon.hpp
View file @
cecc1857
...
...
@@ -549,6 +549,13 @@ inline v_float32x4 v_invsqrt(const v_float32x4& x)
}
#endif
#define OPENCV_HAL_IMPL_NEON_ABS(_Tpuvec, _Tpsvec, usuffix, ssuffix) \
inline _Tpuvec v_abs(const _Tpsvec& a) { return v_reinterpret_as_##usuffix(_Tpsvec(vabsq_##ssuffix(a.val))); }
OPENCV_HAL_IMPL_NEON_ABS
(
v_uint8x16
,
v_int8x16
,
u8
,
s8
)
OPENCV_HAL_IMPL_NEON_ABS
(
v_uint16x8
,
v_int16x8
,
u16
,
s16
)
OPENCV_HAL_IMPL_NEON_ABS
(
v_uint32x4
,
v_int32x4
,
u32
,
s32
)
inline
v_float32x4
v_abs
(
v_float32x4
x
)
{
return
v_float32x4
(
vabsq_f32
(
x
.
val
));
}
...
...
modules/core/include/opencv2/core/hal/intrin_sse.hpp
View file @
cecc1857
...
...
@@ -739,6 +739,18 @@ inline v_float64x2 v_invsqrt(const v_float64x2& x)
return
v_float64x2
(
_mm_div_pd
(
v_1
,
_mm_sqrt_pd
(
x
.
val
)));
}
#define OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC(_Tpuvec, _Tpsvec, func, suffix, subWidth) \
inline _Tpuvec v_abs(const _Tpsvec& x) \
{ return _Tpuvec(_mm_##func##_ep##suffix(x.val, _mm_sub_ep##subWidth(_mm_setzero_si128(), x.val))); }
OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC
(
v_uint8x16
,
v_int8x16
,
min
,
u8
,
i8
)
OPENCV_HAL_IMPL_SSE_ABS_INT_FUNC
(
v_uint16x8
,
v_int16x8
,
max
,
i16
,
i16
)
inline
v_uint32x4
v_abs
(
const
v_int32x4
&
x
)
{
__m128i
s
=
_mm_srli_epi32
(
x
.
val
,
31
);
__m128i
f
=
_mm_srai_epi32
(
x
.
val
,
31
);
return
v_uint32x4
(
_mm_add_epi32
(
_mm_xor_si128
(
x
.
val
,
f
),
s
));
}
inline
v_float32x4
v_abs
(
const
v_float32x4
&
x
)
{
return
v_float32x4
(
_mm_and_ps
(
x
.
val
,
_mm_castsi128_ps
(
_mm_set1_epi32
(
0x7fffffff
))));
}
inline
v_float64x2
v_abs
(
const
v_float64x2
&
x
)
...
...
modules/core/test/test_intrin.cpp
View file @
cecc1857
...
...
@@ -277,6 +277,24 @@ template<typename R> struct TheTest
return
*
this
;
}
TheTest
&
test_abs
()
{
typedef
typename
V_RegTrait128
<
LaneType
>::
u_reg
Ru
;
typedef
typename
Ru
::
lane_type
u_type
;
Data
<
R
>
dataA
,
dataB
(
10
);
R
a
=
dataA
,
b
=
dataB
;
a
=
a
-
b
;
Data
<
Ru
>
resC
=
v_abs
(
a
);
for
(
int
i
=
0
;
i
<
Ru
::
nlanes
;
++
i
)
{
EXPECT_EQ
((
u_type
)
std
::
abs
(
dataA
[
i
]
-
dataB
[
i
]),
resC
[
i
]);
}
return
*
this
;
}
template
<
int
s
>
TheTest
&
test_shift
()
{
...
...
@@ -799,6 +817,7 @@ TEST(hal_intrin, int8x16) {
.
test_logic
()
.
test_min_max
()
.
test_absdiff
()
.
test_abs
()
.
test_mask
()
.
test_pack
<
1
>
().
test_pack
<
2
>
().
test_pack
<
3
>
().
test_pack
<
8
>
()
.
test_unpack
()
...
...
@@ -847,6 +866,7 @@ TEST(hal_intrin, int16x8) {
.
test_logic
()
.
test_min_max
()
.
test_absdiff
()
.
test_abs
()
.
test_mask
()
.
test_pack
<
1
>
().
test_pack
<
2
>
().
test_pack
<
7
>
().
test_pack
<
16
>
()
.
test_unpack
()
...
...
@@ -886,6 +906,7 @@ TEST(hal_intrin, int32x4) {
.
test_expand
()
.
test_addsub
()
.
test_mul
()
.
test_abs
()
.
test_cmp
()
.
test_shift
<
1
>
().
test_shift
<
8
>
()
.
test_logic
()
...
...
modules/imgproc/src/canny.cpp
View file @
cecc1857
...
...
@@ -42,6 +42,7 @@
#include "precomp.hpp"
#include "opencl_kernels_imgproc.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include <queue>
#ifdef _MSC_VER
...
...
@@ -299,8 +300,8 @@ public:
void
operator
()(
const
Range
&
boundaries
)
const
{
#if CV_S
SE2
bool
haveS
SE2
=
checkHardwareSupport
(
CV_CPU_SSE2
);
#if CV_S
IMD128
bool
haveS
IMD
=
checkHardwareSupport
(
CV_CPU_SSE2
)
||
checkHardwareSupport
(
CV_CPU_NEON
);
#endif
const
int
type
=
src
.
type
(),
cn
=
CV_MAT_CN
(
type
);
...
...
@@ -409,39 +410,28 @@ public:
if
(
!
L2gradient
)
{
int
j
=
0
,
width
=
src
.
cols
*
cn
;
#if CV_S
SE2
if
(
haveS
SE2
)
#if CV_S
IMD128
if
(
haveS
IMD
)
{
__m128i
v_zero
=
_mm_setzero_si128
();
for
(
;
j
<=
width
-
8
;
j
+=
8
)
{
__m128i
v_dx
=
_mm_loadu_si128
((
const
__m128i
*
)(
_dx
+
j
));
__m128i
v_dy
=
_mm_loadu_si128
((
const
__m128i
*
)(
_dy
+
j
));
v_int16x8
v_dx
=
v_load
((
const
short
*
)(
_dx
+
j
));
v_int16x8
v_dy
=
v_load
((
const
short
*
)(
_dy
+
j
));
__m128i
v_dx_abs
=
_mm_max_epi16
(
v_dx
,
_mm_sub_epi16
(
v_zero
,
v_dx
));
__m128i
v_dy_abs
=
_mm_max_epi16
(
v_dy
,
_mm_sub_epi16
(
v_zero
,
v_dy
));
v_dx
=
v_reinterpret_as_s16
(
v_abs
(
v_dx
));
v_dy
=
v_reinterpret_as_s16
(
v_abs
(
v_dy
));
__m128i
v_dx_ml
=
_mm_unpacklo_epi16
(
v_dx_abs
,
v_zero
);
__m128i
v_dy_ml
=
_mm_unpacklo_epi16
(
v_dy_abs
,
v_zero
);
__m128i
v_dx_mh
=
_mm_unpackhi_epi16
(
v_dx_abs
,
v_zero
);
__m128i
v_dy_mh
=
_mm_unpackhi_epi16
(
v_dy_abs
,
v_zero
);
v_int32x4
v_dx_ml
;
v_int32x4
v_dy_ml
;
v_int32x4
v_dx_mh
;
v_int32x4
v_dy_mh
;
v_expand
(
v_dx
,
v_dx_ml
,
v_dx_mh
);
v_expand
(
v_dy
,
v_dy_ml
,
v_dy_mh
);
__m128i
v_norm_ml
=
_mm_add_epi32
(
v_dx_ml
,
v_dy_ml
);
__m128i
v_norm_mh
=
_mm_add_epi32
(
v_dx_mh
,
v_dy_mh
);
_mm_storeu_si128
((
__m128i
*
)(
_norm
+
j
),
v_norm_ml
);
_mm_storeu_si128
((
__m128i
*
)(
_norm
+
j
+
4
),
v_norm_mh
);
v_store
((
int
*
)(
_norm
+
j
),
v_dx_ml
+
v_dy_ml
);
v_store
((
int
*
)(
_norm
+
j
+
4
),
v_dx_mh
+
v_dy_mh
);
}
}
#elif CV_NEON
for
(
;
j
<=
width
-
8
;
j
+=
8
)
{
int16x8_t
v_dx
=
vld1q_s16
(
_dx
+
j
),
v_dy
=
vld1q_s16
(
_dy
+
j
);
vst1q_s32
(
_norm
+
j
,
vaddq_s32
(
vabsq_s32
(
vmovl_s16
(
vget_low_s16
(
v_dx
))),
vabsq_s32
(
vmovl_s16
(
vget_low_s16
(
v_dy
)))));
vst1q_s32
(
_norm
+
j
+
4
,
vaddq_s32
(
vabsq_s32
(
vmovl_s16
(
vget_high_s16
(
v_dx
))),
vabsq_s32
(
vmovl_s16
(
vget_high_s16
(
v_dy
)))));
}
#endif
for
(
;
j
<
width
;
++
j
)
_norm
[
j
]
=
std
::
abs
(
int
(
_dx
[
j
]))
+
std
::
abs
(
int
(
_dy
[
j
]));
...
...
@@ -449,36 +439,23 @@ public:
else
{
int
j
=
0
,
width
=
src
.
cols
*
cn
;
#if CV_S
SE2
if
(
haveS
SE2
)
#if CV_S
IMD128
if
(
haveS
IMD
)
{
for
(
;
j
<=
width
-
8
;
j
+=
8
)
{
__m128i
v_dx
=
_mm_loadu_si128
((
const
__m128i
*
)(
_dx
+
j
));
__m128i
v_dy
=
_mm_loadu_si128
((
const
__m128i
*
)(
_dy
+
j
));
__m128i
v_dx_dy_ml
=
_mm_unpacklo_epi16
(
v_dx
,
v_dy
);
__m128i
v_dx_dy_mh
=
_mm_unpackhi_epi16
(
v_dx
,
v_dy
);
v_int16x8
v_dx
=
v_load
((
const
short
*
)(
_dx
+
j
));
v_int16x8
v_dy
=
v_load
((
const
short
*
)(
_dy
+
j
));
__m128i
v_norm_ml
=
_mm_madd_epi16
(
v_dx_dy_ml
,
v_dx_dy_ml
);
__m128i
v_norm_mh
=
_mm_madd_epi16
(
v_dx_dy_mh
,
v_dx_dy_mh
);
v_int32x4
v_dxp_low
,
v_dxp_high
;
v_int32x4
v_dyp_low
,
v_dyp_high
;
v_expand
(
v_dx
,
v_dxp_low
,
v_dxp_high
);
v_expand
(
v_dy
,
v_dyp_low
,
v_dyp_high
);
_mm_storeu_si128
((
__m128i
*
)(
_norm
+
j
),
v_norm_ml
);
_mm_storeu_si128
((
__m128i
*
)(
_norm
+
j
+
4
),
v_norm_m
h
);
v_store
((
int
*
)(
_norm
+
j
),
v_dxp_low
*
v_dxp_low
+
v_dyp_low
*
v_dyp_low
);
v_store
((
int
*
)(
_norm
+
j
+
4
),
v_dxp_high
*
v_dxp_high
+
v_dyp_high
*
v_dyp_hig
h
);
}
}
#elif CV_NEON
for
(
;
j
<=
width
-
8
;
j
+=
8
)
{
int16x8_t
v_dx
=
vld1q_s16
(
_dx
+
j
),
v_dy
=
vld1q_s16
(
_dy
+
j
);
int16x4_t
v_dxp
=
vget_low_s16
(
v_dx
),
v_dyp
=
vget_low_s16
(
v_dy
);
int32x4_t
v_dst
=
vmlal_s16
(
vmull_s16
(
v_dxp
,
v_dxp
),
v_dyp
,
v_dyp
);
vst1q_s32
(
_norm
+
j
,
v_dst
);
v_dxp
=
vget_high_s16
(
v_dx
),
v_dyp
=
vget_high_s16
(
v_dy
);
v_dst
=
vmlal_s16
(
vmull_s16
(
v_dxp
,
v_dxp
),
v_dyp
,
v_dyp
);
vst1q_s32
(
_norm
+
j
+
4
,
v_dst
);
}
#endif
for
(
;
j
<
width
;
++
j
)
_norm
[
j
]
=
int
(
_dx
[
j
])
*
_dx
[
j
]
+
int
(
_dy
[
j
])
*
_dy
[
j
];
...
...
@@ -529,30 +506,31 @@ public:
const
int
TG22
=
(
int
)(
0.4142135623730950488016887242097
*
(
1
<<
CANNY_SHIFT
)
+
0.5
);
int
prev_flag
=
0
,
j
=
0
;
#if CV_S
SE2
if
(
checkHardwareSupport
(
CPU_SSE2
)
)
#if CV_S
IMD128
if
(
haveSIMD
)
{
__m128i
v_low
=
_mm_set1_epi32
(
low
),
v_one
=
_mm_set1_epi8
(
1
);
v_int32x4
v_low
=
v_setall_s32
(
low
);
v_int8x16
v_one
=
v_setall_s8
(
1
);
for
(;
j
<=
src
.
cols
-
16
;
j
+=
16
)
{
__m128i
v_m1
=
_mm_loadu_si128
((
const
__m128i
*
)(
_mag
+
j
));
__m128i
v_m2
=
_mm_loadu_si128
((
const
__m128i
*
)(
_mag
+
j
+
4
));
__m128i
v_m3
=
_mm_loadu_si128
((
const
__m128i
*
)(
_mag
+
j
+
8
));
__m128i
v_m4
=
_mm_loadu_si128
((
const
__m128i
*
)(
_mag
+
j
+
12
));
v_int32x4
v_m1
=
v_load
((
const
int
*
)(
_mag
+
j
));
v_int32x4
v_m2
=
v_load
((
const
int
*
)(
_mag
+
j
+
4
));
v_int32x4
v_m3
=
v_load
((
const
int
*
)(
_mag
+
j
+
8
));
v_int32x4
v_m4
=
v_load
((
const
int
*
)(
_mag
+
j
+
12
));
_mm_storeu_si128
((
__m128i
*
)(
_map
+
j
),
v_one
);
v_store
((
signed
char
*
)(
_map
+
j
),
v_one
);
__m128i
v_cmp1
=
_mm_cmpgt_epi32
(
v_m1
,
v_low
)
;
__m128i
v_cmp2
=
_mm_cmpgt_epi32
(
v_m2
,
v_low
)
;
__m128i
v_cmp3
=
_mm_cmpgt_epi32
(
v_m3
,
v_low
)
;
__m128i
v_cmp4
=
_mm_cmpgt_epi32
(
v_m4
,
v_low
)
;
v_int32x4
v_cmp1
=
v_m1
>
v_low
;
v_int32x4
v_cmp2
=
v_m2
>
v_low
;
v_int32x4
v_cmp3
=
v_m3
>
v_low
;
v_int32x4
v_cmp4
=
v_m4
>
v_low
;
v_
cmp1
=
_mm_packs_epi32
(
v_cmp1
,
v_cmp2
);
v_
cmp2
=
_mm_packs_epi32
(
v_cmp3
,
v_cmp4
);
v_
int16x8
v_cmp80
=
v_pack
(
v_cmp1
,
v_cmp2
);
v_
int16x8
v_cmp81
=
v_pack
(
v_cmp3
,
v_cmp4
);
v_
cmp1
=
_mm_packs_epi16
(
v_cmp1
,
v_cmp2
);
unsigned
int
mask
=
_mm_movemask_epi8
(
v_cmp1
);
v_
int8x16
v_cmp
=
v_pack
(
v_cmp80
,
v_cmp81
);
unsigned
int
mask
=
v_signmask
(
v_cmp
);
if
(
mask
)
{
...
...
@@ -730,54 +708,57 @@ public:
const
uchar
*
pmap
=
map
+
mapstep
+
1
+
(
ptrdiff_t
)(
mapstep
*
boundaries
.
start
);
uchar
*
pdst
=
dst
.
ptr
()
+
(
ptrdiff_t
)(
dst
.
step
*
boundaries
.
start
);
#if CV_S
SE2
bool
haveS
SE2
=
checkHardwareSupport
(
CV_CPU_SSE2
);
#if CV_S
IMD128
bool
haveS
IMD
=
checkHardwareSupport
(
CV_CPU_SSE2
)
||
checkHardwareSupport
(
CV_CPU_NEON
);
#endif
for
(
int
i
=
boundaries
.
start
;
i
<
boundaries
.
end
;
i
++
,
pmap
+=
mapstep
,
pdst
+=
dst
.
step
)
{
int
j
=
0
;
#if CV_S
SE2
if
(
haveS
SE2
)
{
const
__m128i
v_zero
=
_mm_setzero_si12
8
();
#if CV_S
IMD128
if
(
haveS
IMD
)
{
const
v_int8x16
v_zero
=
v_setzero_s
8
();
for
(;
j
<=
dst
.
cols
-
32
;
j
+=
32
)
{
__m128i
v_pmap1
=
_mm_loadu_si128
((
const
__m128i
*
)(
pmap
+
j
));
__m128i
v_pmap2
=
_mm_loadu_si128
((
const
__m128i
*
)(
pmap
+
j
+
16
));
v_uint8x16
v_pmap1
=
v_load
((
const
unsigned
char
*
)(
pmap
+
j
));
v_uint8x16
v_pmap2
=
v_load
((
const
unsigned
char
*
)(
pmap
+
j
+
16
));
__m128i
v_pmaplo1
=
_mm_unpacklo_epi8
(
v_pmap1
,
v_zero
);
__m128i
v_pmaphi1
=
_mm_unpackhi_epi8
(
v_pmap1
,
v_zero
);
__m128i
v_pmaplo2
=
_mm_unpacklo_epi8
(
v_pmap2
,
v_zero
);
__m128i
v_pmaphi2
=
_mm_unpackhi_epi8
(
v_pmap2
,
v_zero
);
v_uint16x8
v_pmaplo1
;
v_uint16x8
v_pmaphi1
;
v_uint16x8
v_pmaplo2
;
v_uint16x8
v_pmaphi2
;
v_expand
(
v_pmap1
,
v_pmaplo1
,
v_pmaphi1
);
v_expand
(
v_pmap2
,
v_pmaplo2
,
v_pmaphi2
);
v_pmaplo1
=
_mm_srli_epi16
(
v_pmaplo1
,
1
)
;
v_pmaphi1
=
_mm_srli_epi16
(
v_pmaphi1
,
1
)
;
v_pmaplo2
=
_mm_srli_epi16
(
v_pmaplo2
,
1
)
;
v_pmaphi2
=
_mm_srli_epi16
(
v_pmaphi2
,
1
)
;
v_pmaplo1
=
v_pmaplo1
>>
1
;
v_pmaphi1
=
v_pmaphi1
>>
1
;
v_pmaplo2
=
v_pmaplo2
>>
1
;
v_pmaphi2
=
v_pmaphi2
>>
1
;
v_pmap1
=
_mm_packus_epi16
(
v_pmaplo1
,
v_pmaphi1
);
v_pmap2
=
_mm_packus_epi16
(
v_pmaplo2
,
v_pmaphi2
);
v_pmap1
=
v_pack
(
v_pmaplo1
,
v_pmaphi1
);
v_pmap2
=
v_pack
(
v_pmaplo2
,
v_pmaphi2
);
v_pmap1
=
_mm_sub_epi8
(
v_zero
,
v_pmap1
);
v_pmap2
=
_mm_sub_epi8
(
v_zero
,
v_pmap2
);
v_pmap1
=
v_reinterpret_as_u8
(
v_zero
-
v_reinterpret_as_s8
(
v_pmap1
)
);
v_pmap2
=
v_reinterpret_as_u8
(
v_zero
-
v_reinterpret_as_s8
(
v_pmap2
)
);
_mm_storeu_si128
((
__m128i
*
)
(
pdst
+
j
),
v_pmap1
);
_mm_storeu_si128
((
__m128i
*
)
(
pdst
+
j
+
16
),
v_pmap2
);
v_store
(
(
pdst
+
j
),
v_pmap1
);
v_store
(
(
pdst
+
j
+
16
),
v_pmap2
);
}
for
(;
j
<=
dst
.
cols
-
16
;
j
+=
16
)
{
__m128i
v_pmap
=
_mm_loadu_si128
((
const
__m128i
*
)(
pmap
+
j
));
v_uint8x16
v_pmap
=
v_load
((
const
unsigned
char
*
)(
pmap
+
j
));
__m128i
v_pmaplo
=
_mm_unpacklo_epi8
(
v_pmap
,
v_zero
);
__m128i
v_pmaphi
=
_mm_unpackhi_epi8
(
v_pmap
,
v_zero
);
v_uint16x8
v_pmaplo
;
v_uint16x8
v_pmaphi
;
v_expand
(
v_pmap
,
v_pmaplo
,
v_pmaphi
);
v_pmaplo
=
_mm_srli_epi16
(
v_pmaplo
,
1
)
;
v_pmaphi
=
_mm_srli_epi16
(
v_pmaphi
,
1
)
;
v_pmaplo
=
v_pmaplo
>>
1
;
v_pmaphi
=
v_pmaphi
>>
1
;
v_pmap
=
_mm_packus_epi16
(
v_pmaplo
,
v_pmaphi
);
v_pmap
=
_mm_sub_epi8
(
v_zero
,
v_pmap
);
v_pmap
=
v_pack
(
v_pmaplo
,
v_pmaphi
);
v_pmap
=
v_reinterpret_as_u8
(
v_zero
-
v_reinterpret_as_s8
(
v_pmap
)
);
_mm_storeu_si128
((
__m128i
*
)
(
pdst
+
j
),
v_pmap
);
v_store
(
(
pdst
+
j
),
v_pmap
);
}
}
#endif
...
...
@@ -980,8 +961,8 @@ static void CannyImpl(Mat& dx, Mat& dy, Mat& dst,
#define CANNY_PUSH(d) *(d) = uchar(2), *stack_top++ = (d)
#define CANNY_POP(d) (d) = *--stack_top
#if CV_S
SE2
bool
haveS
SE2
=
checkHardwareSupport
(
CV_CPU_SSE2
);
#if CV_S
IMD128
bool
haveS
IMD
=
checkHardwareSupport
(
CV_CPU_SSE2
)
||
checkHardwareSupport
(
CV_CPU_NEON
);
#endif
// calculate magnitude and angle of gradient, perform non-maxima suppression.
...
...
@@ -1000,32 +981,26 @@ static void CannyImpl(Mat& dx, Mat& dy, Mat& dst,
if
(
!
L2gradient
)
{
int
j
=
0
,
width
=
cols
*
cn
;
#if CV_S
SE2
if
(
haveS
SE2
)
#if CV_S
IMD128
if
(
haveS
IMD
)
{
__m128i
v_zero
=
_mm_setzero_si128
();
for
(
;
j
<=
width
-
8
;
j
+=
8
)
{
__m128i
v_dx
=
_mm_loadu_si128
((
const
__m128i
*
)(
_dx
+
j
));
__m128i
v_dy
=
_mm_loadu_si128
((
const
__m128i
*
)(
_dy
+
j
));
v_dx
=
_mm_max_epi16
(
v_dx
,
_mm_sub_epi16
(
v_zero
,
v_dx
));
v_dy
=
_mm_max_epi16
(
v_dy
,
_mm_sub_epi16
(
v_zero
,
v_dy
));
v_int16x8
v_dx
=
v_load
((
const
short
*
)(
_dx
+
j
));
v_int16x8
v_dy
=
v_load
((
const
short
*
)(
_dy
+
j
));
__m128i
v_norm
=
_mm_add_epi32
(
_mm_unpacklo_epi16
(
v_dx
,
v_zero
),
_mm_unpacklo_epi16
(
v_dy
,
v_zero
));
_mm_storeu_si128
((
__m128i
*
)(
_norm
+
j
),
v_norm
);
v_int32x4
v_dx0
,
v_dx1
,
v_dy0
,
v_dy1
;
v_expand
(
v_dx
,
v_dx0
,
v_dx1
);
v_expand
(
v_dy
,
v_dy0
,
v_dy1
);
v_norm
=
_mm_add_epi32
(
_mm_unpackhi_epi16
(
v_dx
,
v_zero
),
_mm_unpackhi_epi16
(
v_dy
,
v_zero
));
_mm_storeu_si128
((
__m128i
*
)(
_norm
+
j
+
4
),
v_norm
);
}
v_dx0
=
v_reinterpret_as_s32
(
v_abs
(
v_dx0
));
v_dx1
=
v_reinterpret_as_s32
(
v_abs
(
v_dx1
));
v_dy0
=
v_reinterpret_as_s32
(
v_abs
(
v_dy0
));
v_dy1
=
v_reinterpret_as_s32
(
v_abs
(
v_dy1
));
v_store
(
_norm
+
j
,
v_dx0
+
v_dy0
);
v_store
(
_norm
+
j
+
4
,
v_dx1
+
v_dy1
);
}
#elif CV_NEON
for
(
;
j
<=
width
-
8
;
j
+=
8
)
{
int16x8_t
v_dx
=
vld1q_s16
(
_dx
+
j
),
v_dy
=
vld1q_s16
(
_dy
+
j
);
vst1q_s32
(
_norm
+
j
,
vaddq_s32
(
vabsq_s32
(
vmovl_s16
(
vget_low_s16
(
v_dx
))),
vabsq_s32
(
vmovl_s16
(
vget_low_s16
(
v_dy
)))));
vst1q_s32
(
_norm
+
j
+
4
,
vaddq_s32
(
vabsq_s32
(
vmovl_s16
(
vget_high_s16
(
v_dx
))),
vabsq_s32
(
vmovl_s16
(
vget_high_s16
(
v_dy
)))));
}
#endif
for
(
;
j
<
width
;
++
j
)
...
...
@@ -1034,33 +1009,23 @@ static void CannyImpl(Mat& dx, Mat& dy, Mat& dst,
else
{
int
j
=
0
,
width
=
cols
*
cn
;
#if CV_S
SE2
if
(
haveS
SE2
)
#if CV_S
IMD128
if
(
haveS
IMD
)
{
for
(
;
j
<=
width
-
8
;
j
+=
8
)
{
__m128i
v_dx
=
_mm_loadu_si128
((
const
__m128i
*
)(
_dx
+
j
));
__m128i
v_dy
=
_mm_loadu_si128
((
const
__m128i
*
)(
_dy
+
j
));
v_int16x8
v_dx
=
v_load
((
const
short
*
)(
_dx
+
j
));
v_int16x8
v_dy
=
v_load
((
const
short
*
)(
_dy
+
j
));
__m128i
v_dx_dy_ml
=
_mm_unpacklo_epi16
(
v_dx
,
v_dy
);
__m128i
v_dx_dy_mh
=
_mm_unpackhi_epi16
(
v_dx
,
v_dy
);
__m128i
v_norm_ml
=
_mm_madd_epi16
(
v_dx_dy_ml
,
v_dx_dy_ml
);
__m128i
v_norm_mh
=
_mm_madd_epi16
(
v_dx_dy_mh
,
v_dx_dy_mh
);
_mm_storeu_si128
((
__m128i
*
)(
_norm
+
j
),
v_norm_ml
);
_mm_storeu_si128
((
__m128i
*
)(
_norm
+
j
+
4
),
v_norm_mh
);
}
}
#elif CV_NEON
for
(
;
j
<=
width
-
8
;
j
+=
8
)
{
int16x8_t
v_dx
=
vld1q_s16
(
_dx
+
j
),
v_dy
=
vld1q_s16
(
_dy
+
j
);
int16x4_t
v_dxp
=
vget_low_s16
(
v_dx
),
v_dyp
=
vget_low_s16
(
v_dy
);
int32x4_t
v_dst
=
vmlal_s16
(
vmull_s16
(
v_dxp
,
v_dxp
),
v_dyp
,
v_dyp
);
vst1q_s32
(
_norm
+
j
,
v_dst
);
v_int16x8
v_dx_dy0
,
v_dx_dy1
;
v_zip
(
v_dx
,
v_dy
,
v_dx_dy0
,
v_dx_dy1
);
v_dxp
=
vget_high_s16
(
v_dx
),
v_dyp
=
vget_high_s16
(
v_dy
);
v_dst
=
vmlal_s16
(
vmull_s16
(
v_dxp
,
v_dxp
),
v_dyp
,
v_dyp
);
vst1q_s32
(
_norm
+
j
+
4
,
v_dst
);
v_int32x4
v_dst0
=
v_dotprod
(
v_dx_dy0
,
v_dx_dy0
);
v_int32x4
v_dst1
=
v_dotprod
(
v_dx_dy1
,
v_dx_dy1
);
v_store
(
_norm
+
j
,
v_dst0
);
v_store
(
_norm
+
j
+
4
,
v_dst1
);
}
}
#endif
for
(
;
j
<
width
;
++
j
)
...
...
@@ -1112,30 +1077,31 @@ static void CannyImpl(Mat& dx, Mat& dy, Mat& dst,
const
int
TG22
=
(
int
)(
0.4142135623730950488016887242097
*
(
1
<<
CANNY_SHIFT
)
+
0.5
);
int
prev_flag
=
0
,
j
=
0
;
#if CV_S
SE2
if
(
checkHardwareSupport
(
CPU_SSE2
)
)
#if CV_S
IMD128
if
(
haveSIMD
)
{
__m128i
v_low
=
_mm_set1_epi32
(
low
),
v_one
=
_mm_set1_epi8
(
1
);
v_int32x4
v_low
=
v_setall_s32
(
low
);
v_int8x16
v_one
=
v_setall_s8
(
1
);
for
(;
j
<=
cols
-
16
;
j
+=
16
)
{
__m128i
v_m1
=
_mm_loadu_si128
((
const
__m128i
*
)(
_mag
+
j
));
__m128i
v_m2
=
_mm_loadu_si128
((
const
__m128i
*
)(
_mag
+
j
+
4
));
__m128i
v_m3
=
_mm_loadu_si128
((
const
__m128i
*
)(
_mag
+
j
+
8
));
__m128i
v_m4
=
_mm_loadu_si128
((
const
__m128i
*
)(
_mag
+
j
+
12
));
v_int32x4
v_m1
=
v_load
((
const
int
*
)(
_mag
+
j
));
v_int32x4
v_m2
=
v_load
((
const
int
*
)(
_mag
+
j
+
4
));
v_int32x4
v_m3
=
v_load
((
const
int
*
)(
_mag
+
j
+
8
));
v_int32x4
v_m4
=
v_load
((
const
int
*
)(
_mag
+
j
+
12
));
_mm_storeu_si128
((
__m128i
*
)(
_map
+
j
),
v_one
);
v_store
((
signed
char
*
)(
_map
+
j
),
v_one
);
__m128i
v_cmp1
=
_mm_cmpgt_epi32
(
v_m1
,
v_low
)
;
__m128i
v_cmp2
=
_mm_cmpgt_epi32
(
v_m2
,
v_low
)
;
__m128i
v_cmp3
=
_mm_cmpgt_epi32
(
v_m3
,
v_low
)
;
__m128i
v_cmp4
=
_mm_cmpgt_epi32
(
v_m4
,
v_low
)
;
v_int32x4
v_cmp1
=
v_m1
>
v_low
;
v_int32x4
v_cmp2
=
v_m2
>
v_low
;
v_int32x4
v_cmp3
=
v_m3
>
v_low
;
v_int32x4
v_cmp4
=
v_m4
>
v_low
;
v_
cmp1
=
_mm_packs_epi32
(
v_cmp1
,
v_cmp2
);
v_
cmp2
=
_mm_packs_epi32
(
v_cmp3
,
v_cmp4
);
v_
int16x8
v_cmp80
=
v_pack
(
v_cmp1
,
v_cmp2
);
v_
int16x8
v_cmp81
=
v_pack
(
v_cmp3
,
v_cmp4
);
v_
cmp1
=
_mm_packs_epi16
(
v_cmp1
,
v_cmp2
);
unsigned
int
mask
=
_mm_movemask_epi8
(
v_cmp1
);
v_
int8x16
v_cmp
=
v_pack
(
v_cmp80
,
v_cmp81
);
unsigned
int
mask
=
v_signmask
(
v_cmp
);
if
(
mask
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment