submodule / opencv / Commits

Commit 53785b6a, authored Oct 11, 2018 by Alexander Alekhin
Merge pull request #12784 from terfendail:pyramids_wintr
Parents: 2332fb85, cc10e6b3
Showing 2 changed files with 193 additions and 632 deletions (+193, -632)
intrin_sse.hpp  (modules/core/include/opencv2/core/hal/intrin_sse.hpp)  +20, -0
pyramids.cpp    (modules/imgproc/src/pyramids.cpp)  +173, -632
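For orientation: this merge moves the pyramid kernels from raw SSE2/SSE4.1/NEON intrinsics to OpenCV's wide universal intrinsics (vx_load, v_store, v_int32::nlanes and friends), so a single loop body compiles to SSE2, AVX2 or NEON depending on the build. The sketch below is not part of the commit; it only illustrates the loop pattern the new code uses, assuming the headers are built with CV_SIMD enabled. The function name addRows and its arguments are made up for the example.

    #include "opencv2/core/hal/intrin.hpp"   // universal intrinsics (defines CV_SIMD, v_float32, vx_load, ...)

    using namespace cv;

    // Hypothetical helper, illustration only: dst[i] = a[i] + b[i] over a float row,
    // written in the same style as the converted pyramid kernels.
    static void addRows(const float* a, const float* b, float* dst, int width)
    {
        int x = 0;
    #if CV_SIMD
        // v_float32::nlanes is 4 for SSE/NEON, 8 for AVX2, 16 for AVX-512.
        for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes )
        {
            v_float32 va = vx_load(a + x), vb = vx_load(b + x);  // wide loads
            v_store(dst + x, va + vb);                           // wide store
        }
    #endif
        for( ; x < width; x++ )  // scalar tail for the leftover columns
            dst[x] = a[x] + b[x];
    }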
modules/core/include/opencv2/core/hal/intrin_sse.hpp @ 53785b6a
...
...
@@ -472,6 +472,9 @@ void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
 inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
 {
+#if CV_SSE4_1
+    return v_uint16x8(_mm_packus_epi32(a.val, b.val));
+#else
     __m128i delta32 = _mm_set1_epi32(32768);

     // preliminary saturate negative values to zero
...
...
@@ -480,34 +483,51 @@ inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
     __m128i r = _mm_packs_epi32(_mm_sub_epi32(a1, delta32), _mm_sub_epi32(b1, delta32));
     return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768)));
 #endif
 }

 inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
 {
+#if CV_SSE4_1
+    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi32(a.val, a.val));
+#else
     __m128i delta32 = _mm_set1_epi32(32768);
     __m128i a1 = _mm_sub_epi32(a.val, delta32);
     __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
     _mm_storel_epi64((__m128i*)ptr, r);
+#endif
 }

 template<int n> inline
 v_uint16x8 v_rshr_pack_u(const v_int32x4& a, const v_int32x4& b)
 {
+#if CV_SSE4_1
+    __m128i delta = _mm_set1_epi32(1 << (n - 1));
+    return v_uint16x8(_mm_packus_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n),
+                                       _mm_srai_epi32(_mm_add_epi32(b.val, delta), n)));
+#else
     __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
     __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
     __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
     __m128i b1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(b.val, delta), n), delta32);
     __m128i b2 = _mm_sub_epi16(_mm_packs_epi32(b1, b1), _mm_set1_epi16(-32768));
     return v_uint16x8(_mm_unpacklo_epi64(a2, b2));
+#endif
 }

 template<int n> inline
 void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
 {
+#if CV_SSE4_1
+    __m128i delta = _mm_set1_epi32(1 << (n - 1));
+    __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n);
+    _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi32(a1, a1));
+#else
     __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
     __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
     __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
     _mm_storel_epi64((__m128i*)ptr, a2);
+#endif
 }

 inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b)
...
...
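The added CV_SSE4_1 branches above use _mm_packus_epi32, which packs signed 32-bit lanes to unsigned 16-bit with saturation in a single instruction; the pre-existing code kept under #else continues to emulate that on plain SSE2 with the 32768 bias trick. Per lane, v_rshr_pack_u<n> performs a rounded right shift followed by unsigned saturation. A scalar reference for that behaviour, illustration only and not from the commit:

    #include <opencv2/core.hpp>  // cv::saturate_cast, ushort typedef

    // Scalar model of one lane of v_rshr_pack_u<n>: round by adding 2^(n-1),
    // shift right by n, then saturate to [0, 65535].
    template<int n>
    static inline ushort rshr_pack_u_scalar(int v)
    {
        return cv::saturate_cast<ushort>((v + (1 << (n - 1))) >> n);
    }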
modules/imgproc/src/pyramids.cpp @ 53785b6a
...
...
@@ -43,6 +43,7 @@
#include "precomp.hpp"
#include "opencl_kernels_imgproc.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include "opencv2/core/openvx/ovx_defs.hpp"
...
...
@@ -73,69 +74,55 @@ template<typename T1, typename T2> struct PyrUpNoVec
     int operator()(T1**, T2**, int, int) const { return 0; }
 };

-#if CV_SSE2
+#if CV_SIMD

 struct PyrDownVec_32s8u
 {
     int operator()(int** src, uchar* dst, int, int width) const
     {
-        if( !checkHardwareSupport(CV_CPU_SSE2) )
-            return 0;
-
         int x = 0;
         const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
-        __m128i delta = _mm_set1_epi16(128);

-        for( ; x <= width - 16; x += 16 )
-        {
-            __m128i r0, r1, r2, r3, r4, t0, t1;
-            r0 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row0 + x)),
-                                 _mm_load_si128((const __m128i*)(row0 + x + 4)));
-            r1 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row1 + x)),
-                                 _mm_load_si128((const __m128i*)(row1 + x + 4)));
-            r2 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row2 + x)),
-                                 _mm_load_si128((const __m128i*)(row2 + x + 4)));
-            r3 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row3 + x)),
-                                 _mm_load_si128((const __m128i*)(row3 + x + 4)));
-            r4 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row4 + x)),
-                                 _mm_load_si128((const __m128i*)(row4 + x + 4)));
-            r0 = _mm_add_epi16(r0, r4);
-            r1 = _mm_add_epi16(_mm_add_epi16(r1, r3), r2);
-            r0 = _mm_add_epi16(r0, _mm_add_epi16(r2, r2));
-            t0 = _mm_add_epi16(r0, _mm_slli_epi16(r1, 2));
-            r0 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row0 + x + 8)),
-                                 _mm_load_si128((const __m128i*)(row0 + x + 12)));
-            r1 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row1 + x + 8)),
-                                 _mm_load_si128((const __m128i*)(row1 + x + 12)));
-            r2 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row2 + x + 8)),
-                                 _mm_load_si128((const __m128i*)(row2 + x + 12)));
-            r3 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row3 + x + 8)),
-                                 _mm_load_si128((const __m128i*)(row3 + x + 12)));
-            r4 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row4 + x + 8)),
-                                 _mm_load_si128((const __m128i*)(row4 + x + 12)));
-            r0 = _mm_add_epi16(r0, r4);
-            r1 = _mm_add_epi16(_mm_add_epi16(r1, r3), r2);
-            r0 = _mm_add_epi16(r0, _mm_add_epi16(r2, r2));
-            t1 = _mm_add_epi16(r0, _mm_slli_epi16(r1, 2));
-            t0 = _mm_srli_epi16(_mm_add_epi16(t0, delta), 8);
-            t1 = _mm_srli_epi16(_mm_add_epi16(t1, delta), 8);
-            _mm_storeu_si128((__m128i*)(dst + x), _mm_packus_epi16(t0, t1));
-        }
-
-        for( ; x <= width - 4; x += 4 )
-        {
-            __m128i r0, r1, r2, r3, r4, z = _mm_setzero_si128();
-            r0 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row0 + x)), z);
-            r1 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row1 + x)), z);
-            r2 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row2 + x)), z);
-            r3 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row3 + x)), z);
-            r4 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row4 + x)), z);
-            r0 = _mm_add_epi16(r0, r4);
-            r1 = _mm_add_epi16(_mm_add_epi16(r1, r3), r2);
-            r0 = _mm_add_epi16(r0, _mm_add_epi16(r2, r2));
-            r0 = _mm_add_epi16(r0, _mm_slli_epi16(r1, 2));
-            r0 = _mm_srli_epi16(_mm_add_epi16(r0, delta), 8);
-            *(int*)(dst + x) = _mm_cvtsi128_si32(_mm_packus_epi16(r0, r0));
-        }
+        for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes )
+        {
+            v_uint16 r0, r1, r2, r3, r4, t0, t1;
+            r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes)));
+            r1 = v_reinterpret_as_u16(v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes)));
+            r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes)));
+            r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x), vx_load(row3 + x + v_int32::nlanes)));
+            r4 = v_reinterpret_as_u16(v_pack(vx_load(row4 + x), vx_load(row4 + x + v_int32::nlanes)));
+            t0 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2);
+            r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 + x + 2*v_int32::nlanes), vx_load(row0 + x + 3*v_int32::nlanes)));
+            r1 = v_reinterpret_as_u16(v_pack(vx_load(row1 + x + 2*v_int32::nlanes), vx_load(row1 + x + 3*v_int32::nlanes)));
+            r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x + 2*v_int32::nlanes), vx_load(row2 + x + 3*v_int32::nlanes)));
+            r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x + 2*v_int32::nlanes), vx_load(row3 + x + 3*v_int32::nlanes)));
+            r4 = v_reinterpret_as_u16(v_pack(vx_load(row4 + x + 2*v_int32::nlanes), vx_load(row4 + x + 3*v_int32::nlanes)));
+            t1 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2);
+            v_store(dst + x, v_rshr_pack<8>(t0, t1));
+        }
+        if (x <= width - v_int16::nlanes)
+        {
+            v_uint16 r0, r1, r2, r3, r4, t0;
+            r0 = v_reinterpret_as_u16(v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes)));
+            r1 = v_reinterpret_as_u16(v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes)));
+            r2 = v_reinterpret_as_u16(v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes)));
+            r3 = v_reinterpret_as_u16(v_pack(vx_load(row3 + x), vx_load(row3 + x + v_int32::nlanes)));
+            r4 = v_reinterpret_as_u16(v_pack(vx_load(row4 + x), vx_load(row4 + x + v_int32::nlanes)));
+            t0 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2);
+            v_rshr_pack_store<8>(dst + x, t0);
+            x += v_uint16::nlanes;
+        }
+        for ( ; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes)
+        {
+            v_int32x4 r0, r1, r2, r3, r4, t0;
+            r0 = v_load(row0 + x);
+            r1 = v_load(row1 + x);
+            r2 = v_load(row2 + x);
+            r3 = v_load(row3 + x);
+            r4 = v_load(row4 + x);
+            t0 = r0 + r4 + (r2 + r2) + ((r1 + r3 + r2) << 2);
+            *(int*)(dst + x) = v_reinterpret_as_s32(v_rshr_pack<8>(v_pack_u(t0, t0), v_setzero_u16())).get0();
+        }

         return x;
...
...
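Both the removed SSE2 loops and the new universal-intrinsic loops above evaluate the same vertical 1-4-6-4-1 Gaussian tap: r0 + r4 + 2*r2 + 4*(r1 + r3 + r2) equals r0 + 4*r1 + 6*r2 + 4*r3 + r4, and v_rshr_pack<8> then divides by 256 with rounding (the old code added delta = 128 and shifted right by 8). A scalar sketch of one output pixel, illustration only:

    // Scalar equivalent of one output pixel of PyrDownVec_32s8u
    // (row0..row4 hold the horizontally filtered int sums).
    static inline uchar pyrDownVert8u(const int* row0, const int* row1, const int* row2,
                                      const int* row3, const int* row4, int x)
    {
        int sum = row0[x] + 4*row1[x] + 6*row2[x] + 4*row3[x] + row4[x];
        return (uchar)((sum + 128) >> 8);  // rounded division by 256
    }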
@@ -146,152 +133,105 @@ struct PyrDownVec_32f
 {
     int operator()(float** src, float* dst, int, int width) const
     {
-        if( !checkHardwareSupport(CV_CPU_SSE) )
-            return 0;
-
         int x = 0;
         const float *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
-        __m128 _4 = _mm_set1_ps(4.f), _scale = _mm_set1_ps(1.f/256);
-        for( ; x <= width - 8; x += 8 )
-        {
-            __m128 r0, r1, r2, r3, r4, t0, t1;
-            r0 = _mm_load_ps(row0 + x);
-            r1 = _mm_load_ps(row1 + x);
-            r2 = _mm_load_ps(row2 + x);
-            r3 = _mm_load_ps(row3 + x);
-            r4 = _mm_load_ps(row4 + x);
-            r0 = _mm_add_ps(r0, r4);
-            r1 = _mm_add_ps(_mm_add_ps(r1, r3), r2);
-            r0 = _mm_add_ps(r0, _mm_add_ps(r2, r2));
-            t0 = _mm_add_ps(r0, _mm_mul_ps(r1, _4));
-            r0 = _mm_load_ps(row0 + x + 4);
-            r1 = _mm_load_ps(row1 + x + 4);
-            r2 = _mm_load_ps(row2 + x + 4);
-            r3 = _mm_load_ps(row3 + x + 4);
-            r4 = _mm_load_ps(row4 + x + 4);
-            r0 = _mm_add_ps(r0, r4);
-            r1 = _mm_add_ps(_mm_add_ps(r1, r3), r2);
-            r0 = _mm_add_ps(r0, _mm_add_ps(r2, r2));
-            t1 = _mm_add_ps(r0, _mm_mul_ps(r1, _4));
-            t0 = _mm_mul_ps(t0, _scale);
-            t1 = _mm_mul_ps(t1, _scale);
-            _mm_storeu_ps(dst + x, t0);
-            _mm_storeu_ps(dst + x + 4, t1);
-        }
+        v_float32 _4 = vx_setall_f32(4.f), _scale = vx_setall_f32(1.f/256);
+        for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes )
+        {
+            v_float32 r0, r1, r2, r3, r4;
+            r0 = vx_load(row0 + x);
+            r1 = vx_load(row1 + x);
+            r2 = vx_load(row2 + x);
+            r3 = vx_load(row3 + x);
+            r4 = vx_load(row4 + x);
+            v_store(dst + x, v_muladd(r1 + r3 + r2, _4, r0 + r4 + (r2 + r2)) * _scale);
+        }

         return x;
     }
 };

-#if CV_SSE4_1
+#if CV_SSE4_1 || CV_NEON

 struct PyrDownVec_32s16u
 {
-    PyrDownVec_32s16u()
-    {
-        haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
-    }
-
     int operator()(int** src, ushort* dst, int, int width) const
     {
         int x = 0;
-
-        if (!haveSSE)
-            return x;
-
         const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
-        __m128i v_delta = _mm_set1_epi32(128);
-        for( ; x <= width - 8; x += 8 )
+        for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes )
         {
-            __m128i v_r00 = _mm_loadu_si128((__m128i const *)(row0 + x)),
-                    v_r01 = _mm_loadu_si128((__m128i const *)(row0 + x + 4));
-            __m128i v_r10 = _mm_loadu_si128((__m128i const *)(row1 + x)),
-                    v_r11 = _mm_loadu_si128((__m128i const *)(row1 + x + 4));
-            __m128i v_r20 = _mm_loadu_si128((__m128i const *)(row2 + x)),
-                    v_r21 = _mm_loadu_si128((__m128i const *)(row2 + x + 4));
-            __m128i v_r30 = _mm_loadu_si128((__m128i const *)(row3 + x)),
-                    v_r31 = _mm_loadu_si128((__m128i const *)(row3 + x + 4));
-            __m128i v_r40 = _mm_loadu_si128((__m128i const *)(row4 + x)),
-                    v_r41 = _mm_loadu_si128((__m128i const *)(row4 + x + 4));
-
-            v_r00 = _mm_add_epi32(_mm_add_epi32(v_r00, v_r40), _mm_add_epi32(v_r20, v_r20));
-            v_r10 = _mm_add_epi32(_mm_add_epi32(v_r10, v_r20), v_r30);
-            v_r10 = _mm_slli_epi32(v_r10, 2);
-            __m128i v_dst0 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(v_r00, v_r10), v_delta), 8);
-
-            v_r01 = _mm_add_epi32(_mm_add_epi32(v_r01, v_r41), _mm_add_epi32(v_r21, v_r21));
-            v_r11 = _mm_add_epi32(_mm_add_epi32(v_r11, v_r21), v_r31);
-            v_r11 = _mm_slli_epi32(v_r11, 2);
-            __m128i v_dst1 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(v_r01, v_r11), v_delta), 8);
-
-            _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(v_dst0, v_dst1));
+            v_int32 r00 = vx_load(row0 + x), r01 = vx_load(row0 + x + v_int32::nlanes),
+                    r10 = vx_load(row1 + x), r11 = vx_load(row1 + x + v_int32::nlanes),
+                    r20 = vx_load(row2 + x), r21 = vx_load(row2 + x + v_int32::nlanes),
+                    r30 = vx_load(row3 + x), r31 = vx_load(row3 + x + v_int32::nlanes),
+                    r40 = vx_load(row4 + x), r41 = vx_load(row4 + x + v_int32::nlanes);
+            v_store(dst + x, v_rshr_pack_u<8>(r00 + r40 + (r20 + r20) + ((r10 + r20 + r30) << 2),
+                                              r01 + r41 + (r21 + r21) + ((r11 + r21 + r31) << 2)));
         }
+        if (x <= width - v_int32::nlanes)
+        {
+            v_int32 r00 = vx_load(row0 + x),
+                    r10 = vx_load(row1 + x),
+                    r20 = vx_load(row2 + x),
+                    r30 = vx_load(row3 + x),
+                    r40 = vx_load(row4 + x);
+            v_rshr_pack_u_store<8>(dst + x, r00 + r40 + (r20 + r20) + ((r10 + r20 + r30) << 2));
+            x += v_int32::nlanes;
+        }

         return x;
     }
-
-    bool haveSSE;
 };

 #else

 typedef PyrDownNoVec<int, ushort> PyrDownVec_32s16u;

-#endif // CV_SSE4_1
+#endif

 struct PyrDownVec_32s16s
 {
-    PyrDownVec_32s16s()
-    {
-        haveSSE = checkHardwareSupport(CV_CPU_SSE2);
-    }
-
     int operator()(int** src, short* dst, int, int width) const
     {
         int x = 0;
-
-        if (!haveSSE)
-            return x;
-
         const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
-        __m128i v_delta = _mm_set1_epi32(128);
-        for( ; x <= width - 8; x += 8 )
+        for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes )
        {
-            __m128i v_r00 = _mm_loadu_si128((__m128i const *)(row0 + x)),
-                    v_r01 = _mm_loadu_si128((__m128i const *)(row0 + x + 4));
-            __m128i v_r10 = _mm_loadu_si128((__m128i const *)(row1 + x)),
-                    v_r11 = _mm_loadu_si128((__m128i const *)(row1 + x + 4));
-            __m128i v_r20 = _mm_loadu_si128((__m128i const *)(row2 + x)),
-                    v_r21 = _mm_loadu_si128((__m128i const *)(row2 + x + 4));
-            __m128i v_r30 = _mm_loadu_si128((__m128i const *)(row3 + x)),
-                    v_r31 = _mm_loadu_si128((__m128i const *)(row3 + x + 4));
-            __m128i v_r40 = _mm_loadu_si128((__m128i const *)(row4 + x)),
-                    v_r41 = _mm_loadu_si128((__m128i const *)(row4 + x + 4));
-
-            v_r00 = _mm_add_epi32(_mm_add_epi32(v_r00, v_r40), _mm_add_epi32(v_r20, v_r20));
-            v_r10 = _mm_add_epi32(_mm_add_epi32(v_r10, v_r20), v_r30);
-            v_r10 = _mm_slli_epi32(v_r10, 2);
-            __m128i v_dst0 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(v_r00, v_r10), v_delta), 8);
-
-            v_r01 = _mm_add_epi32(_mm_add_epi32(v_r01, v_r41), _mm_add_epi32(v_r21, v_r21));
-            v_r11 = _mm_add_epi32(_mm_add_epi32(v_r11, v_r21), v_r31);
-            v_r11 = _mm_slli_epi32(v_r11, 2);
-            __m128i v_dst1 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(v_r01, v_r11), v_delta), 8);
-
-            _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(v_dst0, v_dst1));
+            v_int32 r00 = vx_load(row0 + x), r01 = vx_load(row0 + x + v_int32::nlanes),
+                    r10 = vx_load(row1 + x), r11 = vx_load(row1 + x + v_int32::nlanes),
+                    r20 = vx_load(row2 + x), r21 = vx_load(row2 + x + v_int32::nlanes),
+                    r30 = vx_load(row3 + x), r31 = vx_load(row3 + x + v_int32::nlanes),
+                    r40 = vx_load(row4 + x), r41 = vx_load(row4 + x + v_int32::nlanes);
+            v_store(dst + x, v_rshr_pack<8>(r00 + r40 + (r20 + r20) + ((r10 + r20 + r30) << 2),
+                                            r01 + r41 + (r21 + r21) + ((r11 + r21 + r31) << 2)));
         }
+        if (x <= width - v_int32::nlanes)
+        {
+            v_int32 r00 = vx_load(row0 + x),
+                    r10 = vx_load(row1 + x),
+                    r20 = vx_load(row2 + x),
+                    r30 = vx_load(row3 + x),
+                    r40 = vx_load(row4 + x);
+            v_rshr_pack_store<8>(dst + x, r00 + r40 + (r20 + r20) + ((r10 + r20 + r30) << 2));
+            x += v_int32::nlanes;
+        }

         return x;
     }
-
-    bool haveSSE;
 };

 struct PyrUpVec_32s8u
...
...
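Note that PyrDownVec_32s16u stays behind a feature guard: packing int32 to uint16 with unsigned saturation needs SSE4.1's _mm_packus_epi32 (the instruction added to v_pack_u in the intrin_sse.hpp change above) or NEON's vqmovun_s32, which appears to be why the guard widens from #if CV_SSE4_1 to #if CV_SSE4_1 || CV_NEON while other builds keep the PyrDownNoVec fallback.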
@@ -299,59 +239,40 @@ struct PyrUpVec_32s8u
     int operator()(int** src, uchar** dst, int, int width) const
     {
         int x = 0;
-
-        if (!checkHardwareSupport(CV_CPU_SSE2))
-            return x;
-
         uchar *dst0 = dst[0], *dst1 = dst[1];
-        const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2];
-        __m128i v_delta = _mm_set1_epi16(32), v_zero = _mm_setzero_si128();
+        const int *row0 = src[0], *row1 = src[1], *row2 = src[2];

-        for( ; x <= width - 16; x += 16 )
+        for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes )
         {
-            __m128i v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x)),
-                                           _mm_loadu_si128((__m128i const *)(row0 + x + 4)));
-            __m128i v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x)),
-                                           _mm_loadu_si128((__m128i const *)(row1 + x + 4)));
-            __m128i v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x)),
-                                           _mm_loadu_si128((__m128i const *)(row2 + x + 4)));
-
-            __m128i v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1);
-            __m128i v_dst00 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1));
-            __m128i v_dst10 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2);
-
-            v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x + 8)),
-                                   _mm_loadu_si128((__m128i const *)(row0 + x + 12)));
-            v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x + 8)),
-                                   _mm_loadu_si128((__m128i const *)(row1 + x + 12)));
-            v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x + 8)),
-                                   _mm_loadu_si128((__m128i const *)(row2 + x + 12)));
-
-            v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1);
-            __m128i v_dst01 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1));
-            __m128i v_dst11 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2);
-
-            _mm_storeu_si128((__m128i *)(dst0 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst00, v_delta), 6),
-                                                                     _mm_srli_epi16(_mm_adds_epu16(v_dst01, v_delta), 6)));
-            _mm_storeu_si128((__m128i *)(dst1 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst10, v_delta), 6),
-                                                                     _mm_srli_epi16(_mm_adds_epu16(v_dst11, v_delta), 6)));
+            v_int16 v_r00 = v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes)),
+                    v_r01 = v_pack(vx_load(row0 + x + 2*v_int32::nlanes), vx_load(row0 + x + 3*v_int32::nlanes)),
+                    v_r10 = v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes)),
+                    v_r11 = v_pack(vx_load(row1 + x + 2*v_int32::nlanes), vx_load(row1 + x + 3*v_int32::nlanes)),
+                    v_r20 = v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes)),
+                    v_r21 = v_pack(vx_load(row2 + x + 2*v_int32::nlanes), vx_load(row2 + x + 3*v_int32::nlanes));
+            v_int16 v_2r10 = v_r10 + v_r10, v_2r11 = (v_r11 + v_r11);
+            v_store(dst0 + x, v_rshr_pack_u<6>(v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10), v_r01 + v_r21 + (v_2r11 + v_2r11 + v_2r11)));
+            v_store(dst1 + x, v_rshr_pack_u<6>((v_r10 + v_r20) << 2, (v_r11 + v_r21) << 2));
         }

-        for( ; x <= width - 8; x += 8 )
+        if (x <= width - v_uint16::nlanes)
         {
-            __m128i v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x)),
-                                           _mm_loadu_si128((__m128i const *)(row0 + x + 4)));
-            __m128i v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x)),
-                                           _mm_loadu_si128((__m128i const *)(row1 + x + 4)));
-            __m128i v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x)),
-                                           _mm_loadu_si128((__m128i const *)(row2 + x + 4)));
-
-            __m128i v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1);
-            __m128i v_dst0 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1));
-            __m128i v_dst1 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2);
-
-            _mm_storel_epi64((__m128i *)(dst0 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst0, v_delta), 6), v_zero));
-            _mm_storel_epi64((__m128i *)(dst1 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst1, v_delta), 6), v_zero));
+            v_int16 v_r00 = v_pack(vx_load(row0 + x), vx_load(row0 + x + v_int32::nlanes)),
+                    v_r10 = v_pack(vx_load(row1 + x), vx_load(row1 + x + v_int32::nlanes)),
+                    v_r20 = v_pack(vx_load(row2 + x), vx_load(row2 + x + v_int32::nlanes));
+            v_int16 v_2r10 = v_r10 + v_r10;
+            v_rshr_pack_u_store<6>(dst0 + x, v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10));
+            v_rshr_pack_u_store<6>(dst1 + x, (v_r10 + v_r20) << 2);
+            x += v_uint16::nlanes;
         }
+        for (; x <= width - v_int32x4::nlanes; x += v_int32x4::nlanes)
+        {
+            v_int32 v_r00 = vx_load(row0 + x),
+                    v_r10 = vx_load(row1 + x),
+                    v_r20 = vx_load(row2 + x);
+            v_int32 v_2r10 = v_r10 + v_r10;
+            v_int16 d = v_pack(v_r00 + v_r20 + (v_2r10 + v_2r10 + v_2r10), (v_r10 + v_r20) << 2);
+            *(int*)(dst0 + x) = v_reinterpret_as_s32(v_rshr_pack_u<6>(d, vx_setzero_s16())).get0();
+            *(int*)(dst1 + x) = v_reinterpret_as_s32(v_rshr_pack_u<6>(v_combine_high(d, d), vx_setzero_s16())).get0();
+        }

         return x;
...
...
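PyrUpVec_32s8u writes two output rows per iteration: the even row gets r0 + 6*r1 + r2 and the odd row gets 4*(r1 + r2), both divided by 64 with rounding (the << 2 followed by v_rshr_pack_u<6>). A scalar sketch of one column, illustration only, with saturation omitted for brevity:

    // Scalar equivalent of one pyrUp column for 8-bit output.
    static inline void pyrUpVert8u(int r0, int r1, int r2, uchar* d0, uchar* d1)
    {
        *d0 = (uchar)((r0 + 6*r1 + r2 + 32) >> 6);  // even destination row
        *d1 = (uchar)((4*(r1 + r2) + 32) >> 6);     // odd destination row
    }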
@@ -363,113 +284,63 @@ struct PyrUpVec_32s16s
     int operator()(int** src, short** dst, int, int width) const
     {
         int x = 0;
-
-        if (!checkHardwareSupport(CV_CPU_SSE2))
-            return x;
-
         short *dst0 = dst[0], *dst1 = dst[1];
-        const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2];
-        __m128i v_delta = _mm_set1_epi32(32), v_zero = _mm_setzero_si128();
+        const int *row0 = src[0], *row1 = src[1], *row2 = src[2];

-        for( ; x <= width - 8; x += 8 )
+        for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes )
         {
-            __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)),
-                    v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)),
-                    v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x));
-            __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2);
-            __m128i v_dst00 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
-            __m128i v_dst10 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);
-
-            v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x + 4));
-            v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x + 4));
-            v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x + 4));
-            v_2r1 = _mm_slli_epi32(v_r1, 1);
-            v_4r1 = _mm_slli_epi32(v_r1, 2);
-            __m128i v_dst01 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
-            __m128i v_dst11 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);
-
-            _mm_storeu_si128((__m128i *)(dst0 + x), _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst00, v_delta), 6),
-                                                                    _mm_srai_epi32(_mm_add_epi32(v_dst01, v_delta), 6)));
-            _mm_storeu_si128((__m128i *)(dst1 + x), _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst10, v_delta), 6),
-                                                                    _mm_srai_epi32(_mm_add_epi32(v_dst11, v_delta), 6)));
+            v_int32 v_r00 = vx_load(row0 + x), v_r01 = vx_load(row0 + x + v_int32::nlanes),
+                    v_r10 = vx_load(row1 + x), v_r11 = vx_load(row1 + x + v_int32::nlanes),
+                    v_r20 = vx_load(row2 + x), v_r21 = vx_load(row2 + x + v_int32::nlanes);
+            v_store(dst0 + x, v_rshr_pack<6>(v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2)), v_r01 + v_r21 + ((v_r11 << 1) + (v_r11 << 2))));
+            v_store(dst1 + x, v_rshr_pack<6>((v_r10 + v_r20) << 2, (v_r11 + v_r21) << 2));
         }

-        for( ; x <= width - 4; x += 4 )
+        if (x <= width - v_int32::nlanes)
         {
-            __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)),
-                    v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)),
-                    v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x));
-            __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2);
-
-            __m128i v_dst0 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
-            __m128i v_dst1 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);
-
-            _mm_storel_epi64((__m128i *)(dst0 + x), _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst0, v_delta), 6), v_zero));
-            _mm_storel_epi64((__m128i *)(dst1 + x), _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst1, v_delta), 6), v_zero));
+            v_int32 v_r00 = vx_load(row0 + x),
+                    v_r10 = vx_load(row1 + x),
+                    v_r20 = vx_load(row2 + x);
+            v_rshr_pack_store<6>(dst0 + x, v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2)));
+            v_rshr_pack_store<6>(dst1 + x, (v_r10 + v_r20) << 2);
+            x += v_int32::nlanes;
         }

         return x;
     }
 };

-#if CV_SSE4_1
+#if CV_SSE4_1 || CV_NEON

 struct PyrUpVec_32s16u
 {
     int operator()(int** src, ushort** dst, int, int width) const
     {
         int x = 0;
-
-        if (!checkHardwareSupport(CV_CPU_SSE4_1))
-            return x;
-
         ushort *dst0 = dst[0], *dst1 = dst[1];
-        const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2];
-        __m128i v_delta = _mm_set1_epi32(32), v_zero = _mm_setzero_si128();
+        const int *row0 = src[0], *row1 = src[1], *row2 = src[2];

-        for( ; x <= width - 8; x += 8 )
+        for( ; x <= width - v_uint16::nlanes; x += v_uint16::nlanes )
         {
-            __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)),
-                    v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)),
-                    v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x));
-            __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2);
-            __m128i v_dst00 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
-            __m128i v_dst10 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);
-
-            v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x + 4));
-            v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x + 4));
-            v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x + 4));
-            v_2r1 = _mm_slli_epi32(v_r1, 1);
-            v_4r1 = _mm_slli_epi32(v_r1, 2);
-            __m128i v_dst01 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
-            __m128i v_dst11 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);
-
-            _mm_storeu_si128((__m128i *)(dst0 + x), _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst00, v_delta), 6),
-                                                                     _mm_srli_epi32(_mm_add_epi32(v_dst01, v_delta), 6)));
-            _mm_storeu_si128((__m128i *)(dst1 + x), _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst10, v_delta), 6),
-                                                                     _mm_srli_epi32(_mm_add_epi32(v_dst11, v_delta), 6)));
+            v_int32 v_r00 = vx_load(row0 + x), v_r01 = vx_load(row0 + x + v_int32::nlanes),
+                    v_r10 = vx_load(row1 + x), v_r11 = vx_load(row1 + x + v_int32::nlanes),
+                    v_r20 = vx_load(row2 + x), v_r21 = vx_load(row2 + x + v_int32::nlanes);
+            v_store(dst0 + x, v_rshr_pack_u<6>(v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2)), v_r01 + v_r21 + ((v_r11 << 1) + (v_r11 << 2))));
+            v_store(dst1 + x, v_rshr_pack_u<6>((v_r10 + v_r20) << 2, (v_r11 + v_r21) << 2));
         }

-        for( ; x <= width - 4; x += 4 )
+        if (x <= width - v_int32::nlanes)
         {
-            __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)),
-                    v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)),
-                    v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x));
-            __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2);
-
-            __m128i v_dst0 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
-            __m128i v_dst1 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);
-
-            _mm_storel_epi64((__m128i *)(dst0 + x), _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst0, v_delta), 6), v_zero));
-            _mm_storel_epi64((__m128i *)(dst1 + x), _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst1, v_delta), 6), v_zero));
+            v_int32 v_r00 = vx_load(row0 + x),
+                    v_r10 = vx_load(row1 + x),
+                    v_r20 = vx_load(row2 + x);
+            v_rshr_pack_u_store<6>(dst0 + x, v_r00 + v_r20 + ((v_r10 << 1) + (v_r10 << 2)));
+            v_rshr_pack_u_store<6>(dst1 + x, (v_r10 + v_r20) << 2);
+            x += v_int32::nlanes;
         }

         return x;
...
...
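The large deletion that follows removes the SSE-only PyrUpVec_32f together with the entire #elif CV_NEON section; since the universal-intrinsic kernels above compile down to the corresponding NEON instructions, the hand-written NEON copies are presumably redundant after this change.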
@@ -482,328 +353,6 @@ typedef PyrUpNoVec<int, ushort> PyrUpVec_32s16u;
 #endif // CV_SSE4_1

-struct PyrUpVec_32f
-{
-    int operator()(float** src, float** dst, int, int width) const
-    {
-        int x = 0;
-
-        if (!checkHardwareSupport(CV_CPU_SSE2))
-            return x;
-
-        const float *row0 = src[0], *row1 = src[1], *row2 = src[2];
-        float *dst0 = dst[0], *dst1 = dst[1];
-        __m128 v_6 = _mm_set1_ps(6.0f), v_scale = _mm_set1_ps(1.f/64.0f),
-               v_scale4 = _mm_mul_ps(v_scale, _mm_set1_ps(4.0f));
-
-        for( ; x <= width - 8; x += 8 )
-        {
-            __m128 v_r0 = _mm_loadu_ps(row0 + x);
-            __m128 v_r1 = _mm_loadu_ps(row1 + x);
-            __m128 v_r2 = _mm_loadu_ps(row2 + x);
-
-            _mm_storeu_ps(dst1 + x, _mm_mul_ps(v_scale4, _mm_add_ps(v_r1, v_r2)));
-            _mm_storeu_ps(dst0 + x, _mm_mul_ps(v_scale, _mm_add_ps(_mm_add_ps(v_r0, _mm_mul_ps(v_6, v_r1)), v_r2)));
-
-            v_r0 = _mm_loadu_ps(row0 + x + 4);
-            v_r1 = _mm_loadu_ps(row1 + x + 4);
-            v_r2 = _mm_loadu_ps(row2 + x + 4);
-
-            _mm_storeu_ps(dst1 + x + 4, _mm_mul_ps(v_scale4, _mm_add_ps(v_r1, v_r2)));
-            _mm_storeu_ps(dst0 + x + 4, _mm_mul_ps(v_scale, _mm_add_ps(_mm_add_ps(v_r0, _mm_mul_ps(v_6, v_r1)), v_r2)));
-        }
-
-        return x;
-    }
-};
-
-#elif CV_NEON
-
-struct PyrDownVec_32s8u
-{
-    int operator()(int** src, uchar* dst, int, int width) const
-    {
-        int x = 0;
-        const unsigned int *row0 = (unsigned int*)src[0], *row1 = (unsigned int*)src[1],
-                           *row2 = (unsigned int*)src[2], *row3 = (unsigned int*)src[3],
-                           *row4 = (unsigned int*)src[4];
-        uint16x8_t v_delta = vdupq_n_u16(128);
-
-        for( ; x <= width - 16; x += 16 )
-        {
-            uint16x8_t v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x)), vqmovn_u32(vld1q_u32(row0 + x + 4)));
-            uint16x8_t v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x)), vqmovn_u32(vld1q_u32(row1 + x + 4)));
-            uint16x8_t v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x)), vqmovn_u32(vld1q_u32(row2 + x + 4)));
-            uint16x8_t v_r3 = vcombine_u16(vqmovn_u32(vld1q_u32(row3 + x)), vqmovn_u32(vld1q_u32(row3 + x + 4)));
-            uint16x8_t v_r4 = vcombine_u16(vqmovn_u32(vld1q_u32(row4 + x)), vqmovn_u32(vld1q_u32(row4 + x + 4)));
-
-            v_r0 = vaddq_u16(vaddq_u16(v_r0, v_r4), vaddq_u16(v_r2, v_r2));
-            v_r1 = vaddq_u16(vaddq_u16(v_r1, v_r2), v_r3);
-            uint16x8_t v_dst0 = vaddq_u16(v_r0, vshlq_n_u16(v_r1, 2));
-
-            v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x + 8)), vqmovn_u32(vld1q_u32(row0 + x + 12)));
-            v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x + 8)), vqmovn_u32(vld1q_u32(row1 + x + 12)));
-            v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x + 8)), vqmovn_u32(vld1q_u32(row2 + x + 12)));
-            v_r3 = vcombine_u16(vqmovn_u32(vld1q_u32(row3 + x + 8)), vqmovn_u32(vld1q_u32(row3 + x + 12)));
-            v_r4 = vcombine_u16(vqmovn_u32(vld1q_u32(row4 + x + 8)), vqmovn_u32(vld1q_u32(row4 + x + 12)));
-
-            v_r0 = vaddq_u16(vaddq_u16(v_r0, v_r4), vaddq_u16(v_r2, v_r2));
-            v_r1 = vaddq_u16(vaddq_u16(v_r1, v_r2), v_r3);
-            uint16x8_t v_dst1 = vaddq_u16(v_r0, vshlq_n_u16(v_r1, 2));
-
-            vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst0, v_delta), 8)),
-                                          vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst1, v_delta), 8))));
-        }
-
-        return x;
-    }
-};
-
-struct PyrDownVec_32s16u
-{
-    int operator()(int** src, ushort* dst, int, int width) const
-    {
-        int x = 0;
-        const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
-        int32x4_t v_delta = vdupq_n_s32(128);
-
-        for( ; x <= width - 8; x += 8 )
-        {
-            int32x4_t v_r00 = vld1q_s32(row0 + x), v_r01 = vld1q_s32(row0 + x + 4);
-            int32x4_t v_r10 = vld1q_s32(row1 + x), v_r11 = vld1q_s32(row1 + x + 4);
-            int32x4_t v_r20 = vld1q_s32(row2 + x), v_r21 = vld1q_s32(row2 + x + 4);
-            int32x4_t v_r30 = vld1q_s32(row3 + x), v_r31 = vld1q_s32(row3 + x + 4);
-            int32x4_t v_r40 = vld1q_s32(row4 + x), v_r41 = vld1q_s32(row4 + x + 4);
-
-            v_r00 = vaddq_s32(vaddq_s32(v_r00, v_r40), vaddq_s32(v_r20, v_r20));
-            v_r10 = vaddq_s32(vaddq_s32(v_r10, v_r20), v_r30);
-            v_r10 = vshlq_n_s32(v_r10, 2);
-            int32x4_t v_dst0 = vshrq_n_s32(vaddq_s32(vaddq_s32(v_r00, v_r10), v_delta), 8);
-
-            v_r01 = vaddq_s32(vaddq_s32(v_r01, v_r41), vaddq_s32(v_r21, v_r21));
-            v_r11 = vaddq_s32(vaddq_s32(v_r11, v_r21), v_r31);
-            v_r11 = vshlq_n_s32(v_r11, 2);
-            int32x4_t v_dst1 = vshrq_n_s32(vaddq_s32(vaddq_s32(v_r01, v_r11), v_delta), 8);
-
-            vst1q_u16(dst + x, vcombine_u16(vqmovun_s32(v_dst0), vqmovun_s32(v_dst1)));
-        }
-
-        return x;
-    }
-};
-
-struct PyrDownVec_32s16s
-{
-    int operator()(int** src, short* dst, int, int width) const
-    {
-        int x = 0;
-        const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
-        int32x4_t v_delta = vdupq_n_s32(128);
-
-        for( ; x <= width - 8; x += 8 )
-        {
-            int32x4_t v_r00 = vld1q_s32(row0 + x), v_r01 = vld1q_s32(row0 + x + 4);
-            int32x4_t v_r10 = vld1q_s32(row1 + x), v_r11 = vld1q_s32(row1 + x + 4);
-            int32x4_t v_r20 = vld1q_s32(row2 + x), v_r21 = vld1q_s32(row2 + x + 4);
-            int32x4_t v_r30 = vld1q_s32(row3 + x), v_r31 = vld1q_s32(row3 + x + 4);
-            int32x4_t v_r40 = vld1q_s32(row4 + x), v_r41 = vld1q_s32(row4 + x + 4);
-
-            v_r00 = vaddq_s32(vaddq_s32(v_r00, v_r40), vaddq_s32(v_r20, v_r20));
-            v_r10 = vaddq_s32(vaddq_s32(v_r10, v_r20), v_r30);
-            v_r10 = vshlq_n_s32(v_r10, 2);
-            int32x4_t v_dst0 = vshrq_n_s32(vaddq_s32(vaddq_s32(v_r00, v_r10), v_delta), 8);
-
-            v_r01 = vaddq_s32(vaddq_s32(v_r01, v_r41), vaddq_s32(v_r21, v_r21));
-            v_r11 = vaddq_s32(vaddq_s32(v_r11, v_r21), v_r31);
-            v_r11 = vshlq_n_s32(v_r11, 2);
-            int32x4_t v_dst1 = vshrq_n_s32(vaddq_s32(vaddq_s32(v_r01, v_r11), v_delta), 8);
-
-            vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(v_dst0), vqmovn_s32(v_dst1)));
-        }
-
-        return x;
-    }
-};
-
-struct PyrDownVec_32f
-{
-    int operator()(float** src, float* dst, int, int width) const
-    {
-        int x = 0;
-        const float *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
-        float32x4_t v_4 = vdupq_n_f32(4.0f), v_scale = vdupq_n_f32(1.f/256.0f);
-
-        for( ; x <= width - 8; x += 8 )
-        {
-            float32x4_t v_r0 = vld1q_f32(row0 + x);
-            float32x4_t v_r1 = vld1q_f32(row1 + x);
-            float32x4_t v_r2 = vld1q_f32(row2 + x);
-            float32x4_t v_r3 = vld1q_f32(row3 + x);
-            float32x4_t v_r4 = vld1q_f32(row4 + x);
-
-            v_r0 = vaddq_f32(vaddq_f32(v_r0, v_r4), vaddq_f32(v_r2, v_r2));
-            v_r1 = vaddq_f32(vaddq_f32(v_r1, v_r2), v_r3);
-            vst1q_f32(dst + x, vmulq_f32(vmlaq_f32(v_r0, v_4, v_r1), v_scale));
-
-            v_r0 = vld1q_f32(row0 + x + 4);
-            v_r1 = vld1q_f32(row1 + x + 4);
-            v_r2 = vld1q_f32(row2 + x + 4);
-            v_r3 = vld1q_f32(row3 + x + 4);
-            v_r4 = vld1q_f32(row4 + x + 4);
-
-            v_r0 = vaddq_f32(vaddq_f32(v_r0, v_r4), vaddq_f32(v_r2, v_r2));
-            v_r1 = vaddq_f32(vaddq_f32(v_r1, v_r2), v_r3);
-            vst1q_f32(dst + x + 4, vmulq_f32(vmlaq_f32(v_r0, v_4, v_r1), v_scale));
-        }
-
-        return x;
-    }
-};
-
-struct PyrUpVec_32s8u
-{
-    int operator()(int** src, uchar** dst, int, int width) const
-    {
-        int x = 0;
-        uchar *dst0 = dst[0], *dst1 = dst[1];
-        const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2];
-        uint16x8_t v_delta = vdupq_n_u16(32);
-
-        for( ; x <= width - 16; x += 16 )
-        {
-            uint16x8_t v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x)), vqmovn_u32(vld1q_u32(row0 + x + 4)));
-            uint16x8_t v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x)), vqmovn_u32(vld1q_u32(row1 + x + 4)));
-            uint16x8_t v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x)), vqmovn_u32(vld1q_u32(row2 + x + 4)));
-
-            uint16x8_t v_2r1 = vaddq_u16(v_r1, v_r1), v_4r1 = vaddq_u16(v_2r1, v_2r1);
-            uint16x8_t v_dst00 = vaddq_u16(vaddq_u16(v_r0, v_r2), vaddq_u16(v_2r1, v_4r1));
-            uint16x8_t v_dst10 = vshlq_n_u16(vaddq_u16(v_r1, v_r2), 2);
-
-            v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x + 8)), vqmovn_u32(vld1q_u32(row0 + x + 12)));
-            v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x + 8)), vqmovn_u32(vld1q_u32(row1 + x + 12)));
-            v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x + 8)), vqmovn_u32(vld1q_u32(row2 + x + 12)));
-
-            v_2r1 = vaddq_u16(v_r1, v_r1), v_4r1 = vaddq_u16(v_2r1, v_2r1);
-            uint16x8_t v_dst01 = vaddq_u16(vaddq_u16(v_r0, v_r2), vaddq_u16(v_2r1, v_4r1));
-            uint16x8_t v_dst11 = vshlq_n_u16(vaddq_u16(v_r1, v_r2), 2);
-
-            vst1q_u8(dst0 + x, vcombine_u8(vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst00, v_delta), 6)),
-                                           vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst01, v_delta), 6))));
-            vst1q_u8(dst1 + x, vcombine_u8(vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst10, v_delta), 6)),
-                                           vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst11, v_delta), 6))));
-        }
-
-        for( ; x <= width - 8; x += 8 )
-        {
-            uint16x8_t v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x)), vqmovn_u32(vld1q_u32(row0 + x + 4)));
-            uint16x8_t v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x)), vqmovn_u32(vld1q_u32(row1 + x + 4)));
-            uint16x8_t v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x)), vqmovn_u32(vld1q_u32(row2 + x + 4)));
-
-            uint16x8_t v_2r1 = vaddq_u16(v_r1, v_r1), v_4r1 = vaddq_u16(v_2r1, v_2r1);
-            uint16x8_t v_dst0 = vaddq_u16(vaddq_u16(v_r0, v_r2), vaddq_u16(v_2r1, v_4r1));
-            uint16x8_t v_dst1 = vshlq_n_u16(vaddq_u16(v_r1, v_r2), 2);
-
-            vst1_u8(dst0 + x, vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst0, v_delta), 6)));
-            vst1_u8(dst1 + x, vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst1, v_delta), 6)));
-        }
-
-        return x;
-    }
-};
-
-struct PyrUpVec_32s16u
-{
-    int operator()(int** src, ushort** dst, int, int width) const
-    {
-        int x = 0;
-        ushort *dst0 = dst[0], *dst1 = dst[1];
-        const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2];
-        uint32x4_t v_delta = vdupq_n_u32(32);
-
-        for( ; x <= width - 8; x += 8 )
-        {
-            uint32x4_t v_r0 = vld1q_u32(row0 + x), v_r1 = vld1q_u32(row1 + x), v_r2 = vld1q_u32(row2 + x);
-            uint32x4_t v_2r1 = vshlq_n_u32(v_r1, 1), v_4r1 = vshlq_n_u32(v_r1, 2);
-            uint32x4_t v_dst00 = vaddq_u32(vaddq_u32(v_r0, v_r2), vaddq_u32(v_2r1, v_4r1));
-            uint32x4_t v_dst10 = vshlq_n_u32(vaddq_u32(v_r1, v_r2), 2);
-
-            v_r0 = vld1q_u32(row0 + x + 4);
-            v_r1 = vld1q_u32(row1 + x + 4);
-            v_r2 = vld1q_u32(row2 + x + 4);
-            v_2r1 = vshlq_n_u32(v_r1, 1);
-            v_4r1 = vshlq_n_u32(v_r1, 2);
-            uint32x4_t v_dst01 = vaddq_u32(vaddq_u32(v_r0, v_r2), vaddq_u32(v_2r1, v_4r1));
-            uint32x4_t v_dst11 = vshlq_n_u32(vaddq_u32(v_r1, v_r2), 2);
-
-            vst1q_u16(dst0 + x, vcombine_u16(vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst00, v_delta), 6)),
-                                             vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst01, v_delta), 6))));
-            vst1q_u16(dst1 + x, vcombine_u16(vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst10, v_delta), 6)),
-                                             vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst11, v_delta), 6))));
-        }
-
-        for( ; x <= width - 4; x += 4 )
-        {
-            uint32x4_t v_r0 = vld1q_u32(row0 + x), v_r1 = vld1q_u32(row1 + x), v_r2 = vld1q_u32(row2 + x);
-            uint32x4_t v_2r1 = vshlq_n_u32(v_r1, 1), v_4r1 = vshlq_n_u32(v_r1, 2);
-            uint32x4_t v_dst0 = vaddq_u32(vaddq_u32(v_r0, v_r2), vaddq_u32(v_2r1, v_4r1));
-            uint32x4_t v_dst1 = vshlq_n_u32(vaddq_u32(v_r1, v_r2), 2);
-
-            vst1_u16(dst0 + x, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst0, v_delta), 6)));
-            vst1_u16(dst1 + x, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst1, v_delta), 6)));
-        }
-
-        return x;
-    }
-};
-
-struct PyrUpVec_32s16s
-{
-    int operator()(int** src, short** dst, int, int width) const
-    {
-        int x = 0;
-        short *dst0 = dst[0], *dst1 = dst[1];
-        const int *row0 = src[0], *row1 = src[1], *row2 = src[2];
-        int32x4_t v_delta = vdupq_n_s32(32);
-
-        for( ; x <= width - 8; x += 8 )
-        {
-            int32x4_t v_r0 = vld1q_s32(row0 + x), v_r1 = vld1q_s32(row1 + x), v_r2 = vld1q_s32(row2 + x);
-            int32x4_t v_2r1 = vshlq_n_s32(v_r1, 1), v_4r1 = vshlq_n_s32(v_r1, 2);
-            int32x4_t v_dst00 = vaddq_s32(vaddq_s32(v_r0, v_r2), vaddq_s32(v_2r1, v_4r1));
-            int32x4_t v_dst10 = vshlq_n_s32(vaddq_s32(v_r1, v_r2), 2);
-
-            v_r0 = vld1q_s32(row0 + x + 4);
-            v_r1 = vld1q_s32(row1 + x + 4);
-            v_r2 = vld1q_s32(row2 + x + 4);
-            v_2r1 = vshlq_n_s32(v_r1, 1);
-            v_4r1 = vshlq_n_s32(v_r1, 2);
-            int32x4_t v_dst01 = vaddq_s32(vaddq_s32(v_r0, v_r2), vaddq_s32(v_2r1, v_4r1));
-            int32x4_t v_dst11 = vshlq_n_s32(vaddq_s32(v_r1, v_r2), 2);
-
-            vst1q_s16(dst0 + x, vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst00, v_delta), 6)),
-                                             vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst01, v_delta), 6))));
-            vst1q_s16(dst1 + x, vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst10, v_delta), 6)),
-                                             vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst11, v_delta), 6))));
-        }
-
-        for( ; x <= width - 4; x += 4 )
-        {
-            int32x4_t v_r0 = vld1q_s32(row0 + x), v_r1 = vld1q_s32(row1 + x), v_r2 = vld1q_s32(row2 + x);
-            int32x4_t v_2r1 = vshlq_n_s32(v_r1, 1), v_4r1 = vshlq_n_s32(v_r1, 2);
-            int32x4_t v_dst0 = vaddq_s32(vaddq_s32(v_r0, v_r2), vaddq_s32(v_2r1, v_4r1));
-            int32x4_t v_dst1 = vshlq_n_s32(vaddq_s32(v_r1, v_r2), 2);
-
-            vst1_s16(dst0 + x, vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst0, v_delta), 6)));
-            vst1_s16(dst1 + x, vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst1, v_delta), 6)));
-        }
-
-        return x;
-    }
-};
-
 struct PyrUpVec_32f
 {
     int operator()(float** src, float** dst, int, int width) const
...
...
@@ -811,23 +360,15 @@ struct PyrUpVec_32f
         int x = 0;
         const float *row0 = src[0], *row1 = src[1], *row2 = src[2];
         float *dst0 = dst[0], *dst1 = dst[1];
-        float32x4_t v_6 = vdupq_n_f32(6.0f), v_scale = vdupq_n_f32(1.f/64.0f),
-                    v_scale4 = vmulq_n_f32(v_scale, 4.0f);
-
-        for( ; x <= width - 8; x += 8 )
+        v_float32 v_6 = vx_setall_f32(6.0f), v_scale = vx_setall_f32(1.f/64.f), v_scale4 = vx_setall_f32(1.f/16.f);
+        for( ; x <= width - v_float32::nlanes; x += v_float32::nlanes )
         {
-            float32x4_t v_r0 = vld1q_f32(row0 + x);
-            float32x4_t v_r1 = vld1q_f32(row1 + x);
-            float32x4_t v_r2 = vld1q_f32(row2 + x);
-
-            vst1q_f32(dst1 + x, vmulq_f32(v_scale4, vaddq_f32(v_r1, v_r2)));
-            vst1q_f32(dst0 + x, vmulq_f32(v_scale, vaddq_f32(vmlaq_f32(v_r0, v_6, v_r1), v_r2)));
-
-            v_r0 = vld1q_f32(row0 + x + 4);
-            v_r1 = vld1q_f32(row1 + x + 4);
-            v_r2 = vld1q_f32(row2 + x + 4);
-
-            vst1q_f32(dst1 + x + 4, vmulq_f32(v_scale4, vaddq_f32(v_r1, v_r2)));
-            vst1q_f32(dst0 + x + 4, vmulq_f32(v_scale, vaddq_f32(vmlaq_f32(v_r0, v_6, v_r1), v_r2)));
+            v_float32 v_r0 = vx_load(row0 + x),
+                      v_r1 = vx_load(row1 + x),
+                      v_r2 = vx_load(row2 + x);
+            v_store(dst1 + x, v_scale4 * (v_r1 + v_r2));
+            v_store(dst0 + x, v_scale * (v_muladd(v_6, v_r1, v_r0) + v_r2));
         }

         return x;
...
...
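In the float pyrUp kernel above the old code built v_scale4 as v_scale multiplied by 4; with v_scale = 1/64 that is 4/64 = 1/16, which the new code sets directly with vx_setall_f32(1.f/16.f), so the numeric result is unchanged.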