opencv / Commits / 1ef740fa

Commit 1ef740fa authored Oct 11, 2016 by Tomoaki Teshima
use universal intrinsic implementation for calcSharrDeriv
parent 0f03f692
Showing 2 changed files with 38 additions and 74 deletions

modules/core/include/opencv2/core/hal/intrin_sse.hpp   +9  -0
modules/video/src/lkpyramid.cpp                        +29 -74
modules/core/include/opencv2/core/hal/intrin_sse.hpp
...
@@ -1425,6 +1425,15 @@ inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b
     b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 ab b3
 }
 
+inline void v_store_interleave(short* ptr, const v_int16x8& a, const v_int16x8& b)
+{
+    __m128i t0, t1;
+    t0 = _mm_unpacklo_epi16(a.val, b.val);
+    t1 = _mm_unpackhi_epi16(a.val, b.val);
+    _mm_storeu_si128((__m128i*)(ptr), t0);
+    _mm_storeu_si128((__m128i*)(ptr + 8), t1);
+}
+
 inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                                 const v_uint8x16& c )
 {

...
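For context, the overload added above stores two 8-lane int16 registers element-interleaved (a0 b0 a1 b1 ... a7 b7) into 16 consecutive shorts; lkpyramid.cpp below uses it to write the interleaved x/y derivatives with a single call. A minimal usage sketch of that behaviour (hypothetical test code written for this note, not part of the commit; it assumes a build where this SSE overload is compiled in):

    #include <cstdio>
    #include "opencv2/core/hal/intrin.hpp"

    int main()
    {
        short a[8]  = { 0, 1, 2, 3, 4, 5, 6, 7 };
        short b[8]  = { 10, 11, 12, 13, 14, 15, 16, 17 };
        short out[16];

        cv::v_int16x8 va = cv::v_load(a), vb = cv::v_load(b);
        cv::v_store_interleave(out, va, vb);   // out = 0 10 1 11 2 12 ... 7 17

        for (int i = 0; i < 16; i++)
            printf("%d ", out[i]);
        printf("\n");
        return 0;
    }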
modules/video/src/lkpyramid.cpp
...
@@ -44,6 +44,7 @@
 #include <stdio.h>
 #include "lkpyramid.hpp"
 #include "opencl_kernels_video.hpp"
+#include "opencv2/core/hal/intrin.hpp"
 
 #define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))

...
@@ -66,16 +67,9 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst)
     AutoBuffer<deriv_type> _tempBuf(delta*2 + 64);
     deriv_type *trow0 = alignPtr(_tempBuf + cn, 16), *trow1 = alignPtr(trow0 + delta, 16);
 
-#if CV_SSE2
-    __m128i z = _mm_setzero_si128(), c3 = _mm_set1_epi16(3), c10 = _mm_set1_epi16(10);
-#endif
-#if CV_NEON
-    const uint16x8_t q8 = vdupq_n_u16(3);
-    const uint8x8_t d18 = vdup_n_u8(10);
-    const int16x8_t q8i = vdupq_n_s16(3);
-    const int16x8_t q9 = vdupq_n_s16(10);
+#if CV_SIMD128
+    v_int16x8 c3 = v_setall_s16(3), c10 = v_setall_s16(10);
+    bool haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON);
 #endif
 
     for( y = 0; y < rows; y++ )

...
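The hunk above replaces the SSE2- and NEON-specific constants with universal-intrinsic ones: CV_SIMD128 is the compile-time guard for any 128-bit backend, while haveSIMD adds a runtime CPU check so the scalar path is still taken on hardware without SSE2/NEON. A rough standalone sketch of that guard pattern (a hypothetical example written for this note, not taken from the commit):

    #include "opencv2/core/utility.hpp"
    #include "opencv2/core/hal/intrin.hpp"

    // Multiply every element of src by 3 and write it to dst (n elements).
    static void scaleBy3(const short* src, short* dst, int n)
    {
        int i = 0;
    #if CV_SIMD128
        if (cv::checkHardwareSupport(CV_CPU_SSE2) || cv::checkHardwareSupport(CV_CPU_NEON))
        {
            cv::v_int16x8 k = cv::v_setall_s16(3);
            for (; i <= n - 8; i += 8)              // 8 lanes per iteration
                cv::v_store(dst + i, cv::v_load(src + i) * k);
        }
    #endif
        for (; i < n; i++)                          // scalar tail
            dst[i] = (short)(src[i] * 3);
    }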
@@ -87,33 +81,21 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst)
         // do vertical convolution
         x = 0;
-#if CV_SSE2
-        for( ; x <= colsn - 8; x += 8 )
+#if CV_SIMD128
+        if(haveSIMD)
         {
-            __m128i s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow0 + x)), z);
-            __m128i s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow1 + x)), z);
-            __m128i s2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow2 + x)), z);
-            __m128i t0 = _mm_add_epi16(_mm_mullo_epi16(_mm_add_epi16(s0, s2), c3), _mm_mullo_epi16(s1, c10));
-            __m128i t1 = _mm_sub_epi16(s2, s0);
-            _mm_store_si128((__m128i*)(trow0 + x), t0);
-            _mm_store_si128((__m128i*)(trow1 + x), t1);
-        }
-#endif
+            for( ; x <= colsn - 8; x += 8 )
+            {
+                v_int16x8 s0 = v_reinterpret_as_s16(v_load_expand(srow0 + x));
+                v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(srow1 + x));
+                v_int16x8 s2 = v_reinterpret_as_s16(v_load_expand(srow2 + x));
-#if CV_NEON
-        for( ; x <= colsn - 8; x += 8)
-        {
-            uint8x8_t d0 = vld1_u8((const uint8_t*)&srow0[x]);
-            uint8x8_t d1 = vld1_u8((const uint8_t*)&srow1[x]);
-            uint8x8_t d2 = vld1_u8((const uint8_t*)&srow2[x]);
-            uint16x8_t q4 = vaddl_u8(d0, d2);
-            uint16x8_t q11 = vsubl_u8(d2, d0);
-            uint16x8_t q5 = vmulq_u16(q4, q8);
-            uint16x8_t q6 = vmull_u8(d1, d18);
-            uint16x8_t q10 = vaddq_u16(q6, q5);
-            vst1q_u16((uint16_t*)&trow0[x], q10);
-            vst1q_u16((uint16_t*)&trow1[x], q11);
+                v_int16x8 t1 = s2 - s0;
+                v_int16x8 t0 = (s0 + s2) * c3 + s1 * c10;
+
+                v_store(trow0 + x, t0);
+                v_store(trow1 + x, t1);
+            }
         }
 #endif

...
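For readability, a scalar view of what each SIMD lane in the vertical pass above computes (a sketch derived from the vector code, reusing the same srow/trow names; c3 and c10 are the constants 3 and 10):

    // Vertical pass, one pixel at a time: smooth across rows with (3, 10, 3)
    // and take the vertical difference at the same time.
    for( ; x < colsn; x++ )
    {
        trow0[x] = (deriv_type)((srow0[x] + srow2[x])*3 + srow1[x]*10);
        trow1[x] = (deriv_type)(srow2[x] - srow0[x]);
    }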
@@ -135,49 +117,22 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst)
         // do horizontal convolution, interleave the results and store them to dst
         x = 0;
-#if CV_SSE2
-        for( ; x <= colsn - 8; x += 8 )
-        {
-            __m128i s0 = _mm_loadu_si128((const __m128i*)(trow0 + x - cn));
-            __m128i s1 = _mm_loadu_si128((const __m128i*)(trow0 + x + cn));
-            __m128i s2 = _mm_loadu_si128((const __m128i*)(trow1 + x - cn));
-            __m128i s3 = _mm_load_si128((const __m128i*)(trow1 + x));
-            __m128i s4 = _mm_loadu_si128((const __m128i*)(trow1 + x + cn));
-            __m128i t0 = _mm_sub_epi16(s1, s0);
-            __m128i t1 = _mm_add_epi16(_mm_mullo_epi16(_mm_add_epi16(s2, s4), c3), _mm_mullo_epi16(s3, c10));
-            __m128i t2 = _mm_unpacklo_epi16(t0, t1);
-            t0 = _mm_unpackhi_epi16(t0, t1);
-            // this can probably be replaced with aligned stores if we aligned dst properly.
-            _mm_storeu_si128((__m128i*)(drow + x*2), t2);
-            _mm_storeu_si128((__m128i*)(drow + x*2 + 8), t0);
-        }
-#endif
-#if CV_NEON
-        for( ; x <= colsn - 8; x += 8 )
+#if CV_SIMD128
+        if(haveSIMD)
         {
+            for( ; x <= colsn - 8; x += 8 )
+            {
+                v_int16x8 s0 = v_load(trow0 + x - cn);
+                v_int16x8 s1 = v_load(trow0 + x + cn);
+                v_int16x8 s2 = v_load(trow1 + x - cn);
+                v_int16x8 s3 = v_load(trow1 + x);
+                v_int16x8 s4 = v_load(trow1 + x + cn);
-            int16x8_t q0 = vld1q_s16((const int16_t*)&trow0[x + cn]);
-            int16x8_t q1 = vld1q_s16((const int16_t*)&trow0[x - cn]);
-            int16x8_t q2 = vld1q_s16((const int16_t*)&trow1[x + cn]);
-            int16x8_t q3 = vld1q_s16((const int16_t*)&trow1[x - cn]);
-            int16x8_t q5 = vsubq_s16(q0, q1);
-            int16x8_t q6 = vaddq_s16(q2, q3);
-            int16x8_t q4 = vld1q_s16((const int16_t*)&trow1[x]);
-            int16x8_t q7 = vmulq_s16(q6, q8i);
-            int16x8_t q10 = vmulq_s16(q4, q9);
-            int16x8_t q11 = vaddq_s16(q7, q10);
-            int16x4_t d22 = vget_low_s16(q11);
-            int16x4_t d23 = vget_high_s16(q11);
-            int16x4_t d11 = vget_high_s16(q5);
-            int16x4_t d10 = vget_low_s16(q5);
-            int16x4x2_t q5x2, q11x2;
-            q5x2.val[0] = d10;
-            q5x2.val[1] = d22;
-            q11x2.val[0] = d11;
-            q11x2.val[1] = d23;
-            vst2_s16((int16_t*)&drow[x*2], q5x2);
-            vst2_s16((int16_t*)&drow[(x*2) + 8], q11x2);
+                v_int16x8 t0 = s1 - s0;
+                v_int16x8 t1 = ((s2 + s4) * c3) + (s3 * c10);
+
+                v_store_interleave((drow + x*2), t0, t1);
+            }
         }
 #endif
 
         for( ; x < colsn; x++ )

...
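Again for readability, a scalar view of the horizontal pass above (a sketch derived from the vector code: the x-derivative s1 - s0 lands in the even slots of drow and the y-derivative (s2 + s4)*3 + s3*10 in the odd slots, which is exactly what the interleaved store produces):

    // Horizontal pass, one pixel at a time: difference across columns for dI/dx,
    // (3, 10, 3) smoothing of the vertical difference for dI/dy, stored interleaved.
    for( ; x < colsn; x++ )
    {
        drow[x*2]     = (deriv_type)(trow0[x + cn] - trow0[x - cn]);
        drow[x*2 + 1] = (deriv_type)((trow1[x + cn] + trow1[x - cn])*3 + trow1[x]*10);
    }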