Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
062d2179
Commit
062d2179
authored
Jan 20, 2017
by
Tomoaki Teshima
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
use universal intrinsic in corner detection series
parent
a22f03e7
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
53 additions
and
130 deletions
+53
-130
corner.cpp
modules/imgproc/src/corner.cpp
+53
-130
No files found.
modules/imgproc/src/corner.cpp
View file @
062d2179
...
...
@@ -43,6 +43,7 @@
#include "precomp.hpp"
#include "opencl_kernels_imgproc.hpp"
#include "opencv2/core/hal/intrin.hpp"
namespace
cv
{
...
...
@@ -51,8 +52,8 @@ static void calcMinEigenVal( const Mat& _cov, Mat& _dst )
{
int
i
,
j
;
Size
size
=
_cov
.
size
();
#if CV_S
SE
volatile
bool
simd
=
checkHardwareSupport
(
CV_CPU_SSE
);
#if CV_S
IMD128
bool
simd
=
hasSIMD128
(
);
#endif
if
(
_cov
.
isContinuous
()
&&
_dst
.
isContinuous
()
)
...
...
@@ -66,44 +67,21 @@ static void calcMinEigenVal( const Mat& _cov, Mat& _dst )
const
float
*
cov
=
_cov
.
ptr
<
float
>
(
i
);
float
*
dst
=
_dst
.
ptr
<
float
>
(
i
);
j
=
0
;
#if CV_S
SE
#if CV_S
IMD128
if
(
simd
)
{
__m128
half
=
_mm_set1_ps
(
0.5
f
);
for
(
;
j
<=
size
.
width
-
4
;
j
+=
4
)
v_float32x4
half
=
v_setall_f32
(
0.5
f
);
for
(
;
j
<=
size
.
width
-
v_float32x4
::
nlanes
;
j
+=
v_float32x4
::
nlanes
)
{
__m128
t0
=
_mm_loadu_ps
(
cov
+
j
*
3
);
// a0 b0 c0 x
__m128
t1
=
_mm_loadu_ps
(
cov
+
j
*
3
+
3
);
// a1 b1 c1 x
__m128
t2
=
_mm_loadu_ps
(
cov
+
j
*
3
+
6
);
// a2 b2 c2 x
__m128
t3
=
_mm_loadu_ps
(
cov
+
j
*
3
+
9
);
// a3 b3 c3 x
__m128
a
,
b
,
c
,
t
;
t
=
_mm_unpacklo_ps
(
t0
,
t1
);
// a0 a1 b0 b1
c
=
_mm_unpackhi_ps
(
t0
,
t1
);
// c0 c1 x x
b
=
_mm_unpacklo_ps
(
t2
,
t3
);
// a2 a3 b2 b3
c
=
_mm_movelh_ps
(
c
,
_mm_unpackhi_ps
(
t2
,
t3
));
// c0 c1 c2 c3
a
=
_mm_movelh_ps
(
t
,
b
);
b
=
_mm_movehl_ps
(
b
,
t
);
a
=
_mm_mul_ps
(
a
,
half
);
c
=
_mm_mul_ps
(
c
,
half
);
t
=
_mm_sub_ps
(
a
,
c
);
t
=
_mm_add_ps
(
_mm_mul_ps
(
t
,
t
),
_mm_mul_ps
(
b
,
b
));
a
=
_mm_sub_ps
(
_mm_add_ps
(
a
,
c
),
_mm_sqrt_ps
(
t
));
_mm_storeu_ps
(
dst
+
j
,
a
);
v_float32x4
v_a
,
v_b
,
v_c
,
v_t
;
v_load_deinterleave
(
cov
+
j
*
3
,
v_a
,
v_b
,
v_c
);
v_a
*=
half
;
v_c
*=
half
;
v_t
=
v_a
-
v_c
;
v_t
=
v_muladd
(
v_b
,
v_b
,
(
v_t
*
v_t
));
v_store
(
dst
+
j
,
(
v_a
+
v_c
)
-
v_sqrt
(
v_t
));
}
}
#elif CV_NEON
float32x4_t
v_half
=
vdupq_n_f32
(
0.5
f
);
for
(
;
j
<=
size
.
width
-
4
;
j
+=
4
)
{
float32x4x3_t
v_src
=
vld3q_f32
(
cov
+
j
*
3
);
float32x4_t
v_a
=
vmulq_f32
(
v_src
.
val
[
0
],
v_half
);
float32x4_t
v_b
=
v_src
.
val
[
1
];
float32x4_t
v_c
=
vmulq_f32
(
v_src
.
val
[
2
],
v_half
);
float32x4_t
v_t
=
vsubq_f32
(
v_a
,
v_c
);
v_t
=
vmlaq_f32
(
vmulq_f32
(
v_t
,
v_t
),
v_b
,
v_b
);
vst1q_f32
(
dst
+
j
,
vsubq_f32
(
vaddq_f32
(
v_a
,
v_c
),
cv_vsqrtq_f32
(
v_t
)));
}
#endif
for
(
;
j
<
size
.
width
;
j
++
)
{
...
...
@@ -120,8 +98,8 @@ static void calcHarris( const Mat& _cov, Mat& _dst, double k )
{
int
i
,
j
;
Size
size
=
_cov
.
size
();
#if CV_S
SE
volatile
bool
simd
=
checkHardwareSupport
(
CV_CPU_SSE
);
#if CV_S
IMD128
bool
simd
=
hasSIMD128
(
);
#endif
if
(
_cov
.
isContinuous
()
&&
_dst
.
isContinuous
()
)
...
...
@@ -136,40 +114,21 @@ static void calcHarris( const Mat& _cov, Mat& _dst, double k )
float
*
dst
=
_dst
.
ptr
<
float
>
(
i
);
j
=
0
;
#if CV_S
SE
#if CV_S
IMD128
if
(
simd
)
{
__m128
k4
=
_mm_set1_ps
((
float
)
k
);
for
(
;
j
<=
size
.
width
-
4
;
j
+=
4
)
v_float32x4
v_k
=
v_setall_f32
((
float
)
k
);
for
(
;
j
<=
size
.
width
-
v_float32x4
::
nlanes
;
j
+=
v_float32x4
::
nlanes
)
{
__m128
t0
=
_mm_loadu_ps
(
cov
+
j
*
3
);
// a0 b0 c0 x
__m128
t1
=
_mm_loadu_ps
(
cov
+
j
*
3
+
3
);
// a1 b1 c1 x
__m128
t2
=
_mm_loadu_ps
(
cov
+
j
*
3
+
6
);
// a2 b2 c2 x
__m128
t3
=
_mm_loadu_ps
(
cov
+
j
*
3
+
9
);
// a3 b3 c3 x
__m128
a
,
b
,
c
,
t
;
t
=
_mm_unpacklo_ps
(
t0
,
t1
);
// a0 a1 b0 b1
c
=
_mm_unpackhi_ps
(
t0
,
t1
);
// c0 c1 x x
b
=
_mm_unpacklo_ps
(
t2
,
t3
);
// a2 a3 b2 b3
c
=
_mm_movelh_ps
(
c
,
_mm_unpackhi_ps
(
t2
,
t3
));
// c0 c1 c2 c3
a
=
_mm_movelh_ps
(
t
,
b
);
b
=
_mm_movehl_ps
(
b
,
t
);
t
=
_mm_add_ps
(
a
,
c
);
a
=
_mm_sub_ps
(
_mm_mul_ps
(
a
,
c
),
_mm_mul_ps
(
b
,
b
));
t
=
_mm_mul_ps
(
_mm_mul_ps
(
k4
,
t
),
t
);
a
=
_mm_sub_ps
(
a
,
t
);
_mm_storeu_ps
(
dst
+
j
,
a
);
}
}
#elif CV_NEON
float32x4_t
v_k
=
vdupq_n_f32
((
float
)
k
);
v_float32x4
v_a
,
v_b
,
v_c
;
v_load_deinterleave
(
cov
+
j
*
3
,
v_a
,
v_b
,
v_c
);
for
(
;
j
<=
size
.
width
-
4
;
j
+=
4
)
{
float32x4x3_t
v_src
=
vld3q_f32
(
cov
+
j
*
3
);
float32x4_t
v_a
=
v_src
.
val
[
0
],
v_b
=
v_src
.
val
[
1
],
v_c
=
v_src
.
val
[
2
];
float32x4_t
v_ac_bb
=
vmlsq_f32
(
vmulq_f32
(
v_a
,
v_c
),
v_b
,
v_b
);
float32x4_t
v_ac
=
vaddq_f32
(
v_a
,
v_c
);
vst1q_f32
(
dst
+
j
,
vmlsq_f32
(
v_ac_bb
,
v_k
,
vmulq_f32
(
v_ac
,
v_ac
)));
v_float32x4
v_ac_bb
=
v_a
*
v_c
-
v_b
*
v_b
;
v_float32x4
v_ac
=
v_a
+
v_c
;
v_float32x4
v_dst
=
v_ac_bb
-
v_k
*
v_ac
*
v_ac
;
v_store
(
dst
+
j
,
v_dst
);
}
}
#endif
...
...
@@ -272,8 +231,8 @@ cornerEigenValsVecs( const Mat& src, Mat& eigenv, int block_size,
if
(
tegra
::
useTegra
()
&&
tegra
::
cornerEigenValsVecs
(
src
,
eigenv
,
block_size
,
aperture_size
,
op_type
,
k
,
borderType
))
return
;
#endif
#if CV_S
SE2
bool
haveS
SE2
=
checkHardwareSupport
(
CV_CPU_SSE2
);
#if CV_S
IMD128
bool
haveS
imd
=
hasSIMD128
(
);
#endif
int
depth
=
src
.
depth
();
...
...
@@ -309,44 +268,20 @@ cornerEigenValsVecs( const Mat& src, Mat& eigenv, int block_size,
const
float
*
dydata
=
Dy
.
ptr
<
float
>
(
i
);
j
=
0
;
#if CV_
NEON
for
(
;
j
<=
size
.
width
-
4
;
j
+=
4
)
#if CV_
SIMD128
if
(
haveSimd
)
{
float32x4_t
v_dx
=
vld1q_f32
(
dxdata
+
j
);
float32x4_t
v_dy
=
vld1q_f32
(
dydata
+
j
);
for
(
;
j
<=
size
.
width
-
v_float32x4
::
nlanes
;
j
+=
v_float32x4
::
nlanes
)
{
v_float32x4
v_dx
=
v_load
(
dxdata
+
j
);
v_float32x4
v_dy
=
v_load
(
dydata
+
j
);
float32x4x3_t
v_dst
;
v_dst
.
val
[
0
]
=
vmulq_f32
(
v_dx
,
v_dx
)
;
v_dst
.
val
[
1
]
=
vmulq_f32
(
v_dx
,
v_dy
)
;
v_dst
.
val
[
2
]
=
vmulq_f32
(
v_dy
,
v_dy
)
;
v_float32x4
v_dst0
,
v_dst1
,
v_dst2
;
v_dst0
=
v_dx
*
v_dx
;
v_dst1
=
v_dx
*
v_dy
;
v_dst2
=
v_dy
*
v_dy
;
vst3q_f32
(
cov_data
+
j
*
3
,
v_dst
);
}
#elif CV_SSE2
if
(
haveSSE2
)
{
for
(
;
j
<=
size
.
width
-
8
;
j
+=
8
)
{
__m128
v_dx_0
=
_mm_loadu_ps
(
dxdata
+
j
);
__m128
v_dx_1
=
_mm_loadu_ps
(
dxdata
+
j
+
4
);
__m128
v_dy_0
=
_mm_loadu_ps
(
dydata
+
j
);
__m128
v_dy_1
=
_mm_loadu_ps
(
dydata
+
j
+
4
);
__m128
v_dx2_0
=
_mm_mul_ps
(
v_dx_0
,
v_dx_0
);
__m128
v_dxy_0
=
_mm_mul_ps
(
v_dx_0
,
v_dy_0
);
__m128
v_dy2_0
=
_mm_mul_ps
(
v_dy_0
,
v_dy_0
);
__m128
v_dx2_1
=
_mm_mul_ps
(
v_dx_1
,
v_dx_1
);
__m128
v_dxy_1
=
_mm_mul_ps
(
v_dx_1
,
v_dy_1
);
__m128
v_dy2_1
=
_mm_mul_ps
(
v_dy_1
,
v_dy_1
);
_mm_interleave_ps
(
v_dx2_0
,
v_dx2_1
,
v_dxy_0
,
v_dxy_1
,
v_dy2_0
,
v_dy2_1
);
_mm_storeu_ps
(
cov_data
+
j
*
3
,
v_dx2_0
);
_mm_storeu_ps
(
cov_data
+
j
*
3
+
4
,
v_dx2_1
);
_mm_storeu_ps
(
cov_data
+
j
*
3
+
8
,
v_dxy_0
);
_mm_storeu_ps
(
cov_data
+
j
*
3
+
12
,
v_dxy_1
);
_mm_storeu_ps
(
cov_data
+
j
*
3
+
16
,
v_dy2_0
);
_mm_storeu_ps
(
cov_data
+
j
*
3
+
20
,
v_dy2_1
);
v_store_interleave
(
cov_data
+
j
*
3
,
v_dst0
,
v_dst1
,
v_dst2
);
}
}
#endif
...
...
@@ -751,13 +686,10 @@ void cv::preCornerDetect( InputArray _src, OutputArray _dst, int ksize, int bord
if
(
src
.
depth
()
==
CV_8U
)
factor
*=
255
;
factor
=
1.
/
(
factor
*
factor
*
factor
);
#if CV_
NEON || CV_SSE2
#if CV_
SIMD128
float
factor_f
=
(
float
)
factor
;
#endif
#if CV_SSE2
volatile
bool
haveSSE2
=
cv
::
checkHardwareSupport
(
CV_CPU_SSE2
);
__m128
v_factor
=
_mm_set1_ps
(
factor_f
),
v_m2
=
_mm_set1_ps
(
-
2.0
f
);
bool
haveSimd
=
hasSIMD128
();
v_float32x4
v_factor
=
v_setall_f32
(
factor_f
),
v_m2
=
v_setall_f32
(
-
2.0
f
);
#endif
Size
size
=
src
.
size
();
...
...
@@ -773,30 +705,21 @@ void cv::preCornerDetect( InputArray _src, OutputArray _dst, int ksize, int bord
j
=
0
;
#if CV_S
SE2
if
(
haveS
SE2
)
#if CV_S
IMD128
if
(
haveS
imd
)
{
for
(
;
j
<=
size
.
width
-
4
;
j
+=
4
)
for
(
;
j
<=
size
.
width
-
v_float32x4
::
nlanes
;
j
+=
v_float32x4
::
nlanes
)
{
__m128
v_dx
=
_mm_loadu_ps
((
const
float
*
)(
dxdata
+
j
)
);
__m128
v_dy
=
_mm_loadu_ps
((
const
float
*
)(
dydata
+
j
)
);
__m128
v_s1
=
_mm_mul_ps
(
_mm_mul_ps
(
v_dx
,
v_dx
),
_mm_loadu_ps
((
const
float
*
)(
d2ydata
+
j
))
);
__m128
v_s2
=
_mm_mul_ps
(
_mm_mul_ps
(
v_dy
,
v_dy
),
_mm_loadu_ps
((
const
float
*
)(
d2xdata
+
j
))
);
__m128
v_s3
=
_mm_mul_ps
(
_mm_mul_ps
(
v_dx
,
v_dy
),
_mm_loadu_ps
((
const
float
*
)(
dxydata
+
j
))
);
v_s1
=
_mm_mul_ps
(
v_factor
,
_mm_add_ps
(
v_s1
,
_mm_add_ps
(
v_s2
,
_mm_mul_ps
(
v_s3
,
v_m2
))));
_mm_storeu_ps
(
dstdata
+
j
,
v_s1
);
v_float32x4
v_dx
=
v_load
(
dxdata
+
j
);
v_float32x4
v_dy
=
v_load
(
dydata
+
j
);
v_float32x4
v_s1
=
(
v_dx
*
v_dx
)
*
v_load
(
d2ydata
+
j
);
v_float32x4
v_s2
=
v_muladd
((
v_dy
*
v_dy
),
v_load
(
d2xdata
+
j
),
v_s1
);
v_float32x4
v_s3
=
v_muladd
((
v_dy
*
v_dx
)
*
v_load
(
dxydata
+
j
),
v_m2
,
v_s2
);
v_store
(
dstdata
+
j
,
v_s3
*
v_factor
);
}
}
#elif CV_NEON
for
(
;
j
<=
size
.
width
-
4
;
j
+=
4
)
{
float32x4_t
v_dx
=
vld1q_f32
(
dxdata
+
j
),
v_dy
=
vld1q_f32
(
dydata
+
j
);
float32x4_t
v_s
=
vmulq_f32
(
v_dx
,
vmulq_f32
(
v_dx
,
vld1q_f32
(
d2ydata
+
j
)));
v_s
=
vmlaq_f32
(
v_s
,
vld1q_f32
(
d2xdata
+
j
),
vmulq_f32
(
v_dy
,
v_dy
));
v_s
=
vmlaq_f32
(
v_s
,
vld1q_f32
(
dxydata
+
j
),
vmulq_n_f32
(
vmulq_f32
(
v_dy
,
v_dx
),
-
2
));
vst1q_f32
(
dstdata
+
j
,
vmulq_n_f32
(
v_s
,
factor_f
));
}
#endif
for
(
;
j
<
size
.
width
;
j
++
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment