submodule / opencv / Commits / 9d602f27

Commit 9d602f27, authored Sep 25, 2018 by Vitaly Tuzov

Replaced SSE2 area resize implementation with wide universal intrinsic implementation

parent ceca139f

Showing 1 changed file with 458 additions and 898 deletions:

modules/imgproc/src/resize.cpp   +458  -898
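The change replaces hand-written SSE2 (and NEON) kernels in resize.cpp with OpenCV's wide universal intrinsics (v_float32, vx_load, v_muladd, ...), so one source path compiles to SSE2, AVX2 or NEON depending on CV_SIMD_WIDTH. The snippet below is not part of the commit; it is a minimal sketch of the pattern used throughout the diff, a vertical linear blend of two float rows, assuming OpenCV's core/hal/intrin.hpp API and a hypothetical blend_rows helper.

```cpp
// Minimal sketch (not from the commit): blend two float rows with weights
// beta0/beta1 using wide universal intrinsics, plus a scalar tail loop.
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>

static void blend_rows(const float* S0, const float* S1, float* dst,
                       int width, float beta0, float beta1)
{
    int x = 0;
#if CV_SIMD
    cv::v_float32 b0 = cv::vx_setall_f32(beta0), b1 = cv::vx_setall_f32(beta1);
    // One iteration handles v_float32::nlanes pixels: 4 on SSE2/NEON, 8 on AVX2.
    for (; x <= width - cv::v_float32::nlanes; x += cv::v_float32::nlanes)
        cv::v_store(dst + x, cv::v_muladd(cv::vx_load(S0 + x), b0,
                                          cv::vx_load(S1 + x) * b1));
#endif
    for (; x < width; x++)   // scalar tail for the remaining pixels
        dst[x] = S0[x] * beta0 + S1[x] * beta1;
}
```

On builds without CV_SIMD only the scalar tail runs; the real kernels in the diff instead return the number of pixels they processed so the generic code can finish the row.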
@@ -1181,760 +1181,247 @@ struct HResizeNoVec
                   const uchar*, int, int, int, int, int) const { return 0; }
};

#if CV_SSE2
#if CV_SIMD

struct VResizeLinearVec_32s8u
{
    int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        const int** src = (const int**)_src;
        const short* beta = (const short*)_beta;
        const int *S0 = src[0], *S1 = src[1];
        int x = 0;
        __m128i b0 = _mm_set1_epi16(beta[0]), b1 = _mm_set1_epi16(beta[1]);
        __m128i delta = _mm_set1_epi16(2);

        if( (((size_t)S0 | (size_t)S1) & 15) == 0 )
            for( ; x <= width - 16; x += 16 )
            {
                __m128i x0, x1, x2, y0, y1, y2;
                x0 = _mm_load_si128((const __m128i*)(S0 + x));
                x1 = _mm_load_si128((const __m128i*)(S0 + x + 4));
                y0 = _mm_load_si128((const __m128i*)(S1 + x));
                y1 = _mm_load_si128((const __m128i*)(S1 + x + 4));
                x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4));
                y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4));

                x1 = _mm_load_si128((const __m128i*)(S0 + x + 8));
                x2 = _mm_load_si128((const __m128i*)(S0 + x + 12));
                y1 = _mm_load_si128((const __m128i*)(S1 + x + 8));
                y2 = _mm_load_si128((const __m128i*)(S1 + x + 12));
                x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4));
                y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4));

                x0 = _mm_adds_epi16(_mm_mulhi_epi16(x0, b0), _mm_mulhi_epi16(y0, b1));
                x1 = _mm_adds_epi16(_mm_mulhi_epi16(x1, b0), _mm_mulhi_epi16(y1, b1));

                x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
                x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2);
                _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1));
            }
        v_int16 b0 = vx_setall_s16(beta[0]), b1 = vx_setall_s16(beta[1]);

        if( (((size_t)S0 | (size_t)S1) & (CV_SIMD_WIDTH - 1)) == 0 )
            for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes)
                v_store(dst + x, v_rshr_pack_u<2>(v_mul_hi(v_pack(vx_load_aligned(S0 + x) >> 4, vx_load_aligned(S0 + x + v_int32::nlanes) >> 4), b0) +
                                                  v_mul_hi(v_pack(vx_load_aligned(S1 + x) >> 4, vx_load_aligned(S1 + x + v_int32::nlanes) >> 4), b1),
                                                  v_mul_hi(v_pack(vx_load_aligned(S0 + x + 2*v_int32::nlanes) >> 4, vx_load_aligned(S0 + x + 3*v_int32::nlanes) >> 4), b0) +
                                                  v_mul_hi(v_pack(vx_load_aligned(S1 + x + 2*v_int32::nlanes) >> 4, vx_load_aligned(S1 + x + 3*v_int32::nlanes) >> 4), b1)));
        else
            for( ; x <= width - 16; x += 16 )
            {
                __m128i x0, x1, x2, y0, y1, y2;
                x0 = _mm_loadu_si128((const __m128i*)(S0 + x));
                x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4));
                y0 = _mm_loadu_si128((const __m128i*)(S1 + x));
                y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4));
                x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4));
                y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4));

                x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 8));
                x2 = _mm_loadu_si128((const __m128i*)(S0 + x + 12));
                y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 8));
                y2 = _mm_loadu_si128((const __m128i*)(S1 + x + 12));
                x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4));
                y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4));

                x0 = _mm_adds_epi16(_mm_mulhi_epi16(x0, b0), _mm_mulhi_epi16(y0, b1));
                x1 = _mm_adds_epi16(_mm_mulhi_epi16(x1, b0), _mm_mulhi_epi16(y1, b1));

                x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
                x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2);
                _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1));
            }
            for( ; x <= width - v_uint8::nlanes; x += v_uint8::nlanes)
                v_store(dst + x, v_rshr_pack_u<2>(v_mul_hi(v_pack(vx_load(S0 + x) >> 4, vx_load(S0 + x + v_int32::nlanes) >> 4), b0) +
                                                  v_mul_hi(v_pack(vx_load(S1 + x) >> 4, vx_load(S1 + x + v_int32::nlanes) >> 4), b1),
                                                  v_mul_hi(v_pack(vx_load(S0 + x + 2*v_int32::nlanes) >> 4, vx_load(S0 + x + 3*v_int32::nlanes) >> 4), b0) +
                                                  v_mul_hi(v_pack(vx_load(S1 + x + 2*v_int32::nlanes) >> 4, vx_load(S1 + x + 3*v_int32::nlanes) >> 4), b1)));

        for( ; x < width - 4; x += 4 )
        {
            __m128i x0, y0;
            x0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S0 + x)), 4);
            y0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S1 + x)), 4);
            x0 = _mm_packs_epi32(x0, x0);
            y0 = _mm_packs_epi32(y0, y0);
            x0 = _mm_adds_epi16(_mm_mulhi_epi16(x0, b0), _mm_mulhi_epi16(y0, b1));
            x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
            x0 = _mm_packus_epi16(x0, x0);
            *(int*)(dst + x) = _mm_cvtsi128_si32(x0);
        }
        for( ; x < width - v_int16::nlanes; x += v_int16::nlanes)
            v_rshr_pack_u_store<2>(dst + x, v_mul_hi(v_pack(vx_load(S0 + x) >> 4, vx_load(S0 + x + v_int32::nlanes) >> 4), b0) +
                                            v_mul_hi(v_pack(vx_load(S1 + x) >> 4, vx_load(S1 + x + v_int32::nlanes) >> 4), b1));

        return x;
    }
};
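Both the old and the new code above split each row into an aligned fast path and an unaligned fallback, but the new version tests alignment against CV_SIMD_WIDTH - 1 instead of a hard-coded 15, so the same check works for 16-, 32- or 64-byte registers. Below is a minimal sketch of that dispatch, assuming the intrin.hpp API and a hypothetical scale_row helper.

```cpp
// Minimal sketch (not from the commit): aligned vs unaligned load dispatch,
// the same pattern the resize kernels use.
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>

static void scale_row(const float* src, float* dst, int width, float k)
{
    int x = 0;
#if CV_SIMD
    cv::v_float32 vk = cv::vx_setall_f32(k);
    if( ((size_t)src & (CV_SIMD_WIDTH - 1)) == 0 )   // row start is register-aligned
        for( ; x <= width - cv::v_float32::nlanes; x += cv::v_float32::nlanes )
            cv::v_store(dst + x, cv::vx_load_aligned(src + x) * vk);
    else                                             // fall back to unaligned loads
        for( ; x <= width - cv::v_float32::nlanes; x += cv::v_float32::nlanes )
            cv::v_store(dst + x, cv::vx_load(src + x) * vk);
#endif
    for( ; x < width; x++ )
        dst[x] = src[x] * k;
}
```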
template
<
int
shiftval
>
struct
VResizeLinearVec_32f16
struct
VResizeLinearVec_32f16u
{
int
operator
()(
const
uchar
**
_src
,
uchar
*
_dst
,
const
uchar
*
_beta
,
int
width
)
const
{
if
(
!
checkHardwareSupport
(
CV_CPU_SSE2
)
)
return
0
;
const
float
**
src
=
(
const
float
**
)
_src
;
const
float
*
beta
=
(
const
float
*
)
_beta
;
const
float
*
S0
=
src
[
0
],
*
S1
=
src
[
1
];
ushort
*
dst
=
(
ushort
*
)
_dst
;
int
x
=
0
;
__m128
b0
=
_mm_set1_ps
(
beta
[
0
]),
b1
=
_mm_set1_ps
(
beta
[
1
]);
__m128i
preshift
=
_mm_set1_epi32
(
shiftval
);
__m128i
postshift
=
_mm_set1_epi16
((
short
)
shiftval
);
v_float32
b0
=
vx_setall_f32
(
beta
[
0
]),
b1
=
vx_setall_f32
(
beta
[
1
]);
if
(
(((
size_t
)
S0
|
(
size_t
)
S1
)
&
15
)
==
0
)
for
(
;
x
<=
width
-
16
;
x
+=
16
)
{
__m128
x0
,
x1
,
y0
,
y1
;
__m128i
t0
,
t1
,
t2
;
x0
=
_mm_load_ps
(
S0
+
x
);
x1
=
_mm_load_ps
(
S0
+
x
+
4
);
y0
=
_mm_load_ps
(
S1
+
x
);
y1
=
_mm_load_ps
(
S1
+
x
+
4
);
x0
=
_mm_add_ps
(
_mm_mul_ps
(
x0
,
b0
),
_mm_mul_ps
(
y0
,
b1
));
x1
=
_mm_add_ps
(
_mm_mul_ps
(
x1
,
b0
),
_mm_mul_ps
(
y1
,
b1
));
t0
=
_mm_add_epi32
(
_mm_cvtps_epi32
(
x0
),
preshift
);
t2
=
_mm_add_epi32
(
_mm_cvtps_epi32
(
x1
),
preshift
);
t0
=
_mm_add_epi16
(
_mm_packs_epi32
(
t0
,
t2
),
postshift
);
x0
=
_mm_load_ps
(
S0
+
x
+
8
);
x1
=
_mm_load_ps
(
S0
+
x
+
12
);
y0
=
_mm_load_ps
(
S1
+
x
+
8
);
y1
=
_mm_load_ps
(
S1
+
x
+
12
);
x0
=
_mm_add_ps
(
_mm_mul_ps
(
x0
,
b0
),
_mm_mul_ps
(
y0
,
b1
));
x1
=
_mm_add_ps
(
_mm_mul_ps
(
x1
,
b0
),
_mm_mul_ps
(
y1
,
b1
));
t1
=
_mm_add_epi32
(
_mm_cvtps_epi32
(
x0
),
preshift
);
t2
=
_mm_add_epi32
(
_mm_cvtps_epi32
(
x1
),
preshift
);
t1
=
_mm_add_epi16
(
_mm_packs_epi32
(
t1
,
t2
),
postshift
);
_mm_storeu_si128
(
(
__m128i
*
)(
dst
+
x
),
t0
);
_mm_storeu_si128
(
(
__m128i
*
)(
dst
+
x
+
8
),
t1
);
}
if
(
(((
size_t
)
S0
|
(
size_t
)
S1
)
&
(
CV_SIMD_WIDTH
-
1
))
==
0
)
for
(
;
x
<=
width
-
v_uint16
::
nlanes
;
x
+=
v_uint16
::
nlanes
)
v_store
(
dst
+
x
,
v_pack_u
(
v_round
(
v_muladd
(
vx_load_aligned
(
S0
+
x
),
b0
,
vx_load_aligned
(
S1
+
x
)
*
b1
)),
v_round
(
v_muladd
(
vx_load_aligned
(
S0
+
x
+
v_float32
::
nlanes
),
b0
,
vx_load_aligned
(
S1
+
x
+
v_float32
::
nlanes
)
*
b1
))));
else
for
(
;
x
<=
width
-
16
;
x
+=
16
)
{
__m128
x0
,
x1
,
y0
,
y1
;
__m128i
t0
,
t1
,
t2
;
x0
=
_mm_loadu_ps
(
S0
+
x
);
x1
=
_mm_loadu_ps
(
S0
+
x
+
4
);
y0
=
_mm_loadu_ps
(
S1
+
x
);
y1
=
_mm_loadu_ps
(
S1
+
x
+
4
);
x0
=
_mm_add_ps
(
_mm_mul_ps
(
x0
,
b0
),
_mm_mul_ps
(
y0
,
b1
));
x1
=
_mm_add_ps
(
_mm_mul_ps
(
x1
,
b0
),
_mm_mul_ps
(
y1
,
b1
));
t0
=
_mm_add_epi32
(
_mm_cvtps_epi32
(
x0
),
preshift
);
t2
=
_mm_add_epi32
(
_mm_cvtps_epi32
(
x1
),
preshift
);
t0
=
_mm_add_epi16
(
_mm_packs_epi32
(
t0
,
t2
),
postshift
);
x0
=
_mm_loadu_ps
(
S0
+
x
+
8
);
x1
=
_mm_loadu_ps
(
S0
+
x
+
12
);
y0
=
_mm_loadu_ps
(
S1
+
x
+
8
);
y1
=
_mm_loadu_ps
(
S1
+
x
+
12
);
x0
=
_mm_add_ps
(
_mm_mul_ps
(
x0
,
b0
),
_mm_mul_ps
(
y0
,
b1
));
x1
=
_mm_add_ps
(
_mm_mul_ps
(
x1
,
b0
),
_mm_mul_ps
(
y1
,
b1
));
t1
=
_mm_add_epi32
(
_mm_cvtps_epi32
(
x0
),
preshift
);
t2
=
_mm_add_epi32
(
_mm_cvtps_epi32
(
x1
),
preshift
);
t1
=
_mm_add_epi16
(
_mm_packs_epi32
(
t1
,
t2
),
postshift
);
_mm_storeu_si128
(
(
__m128i
*
)(
dst
+
x
),
t0
);
_mm_storeu_si128
(
(
__m128i
*
)(
dst
+
x
+
8
),
t1
);
}
for
(
;
x
<
width
-
4
;
x
+=
4
)
for
(;
x
<=
width
-
v_uint16
::
nlanes
;
x
+=
v_uint16
::
nlanes
)
v_store
(
dst
+
x
,
v_pack_u
(
v_round
(
v_muladd
(
vx_load
(
S0
+
x
),
b0
,
vx_load
(
S1
+
x
)
*
b1
)),
v_round
(
v_muladd
(
vx_load
(
S0
+
x
+
v_float32
::
nlanes
),
b0
,
vx_load
(
S1
+
x
+
v_float32
::
nlanes
)
*
b1
))));
for
(
;
x
<
width
-
v_float32
::
nlanes
;
x
+=
v_float32
::
nlanes
)
{
__m128
x0
,
y0
;
__m128i
t0
;
x0
=
_mm_loadu_ps
(
S0
+
x
);
y0
=
_mm_loadu_ps
(
S1
+
x
);
x0
=
_mm_add_ps
(
_mm_mul_ps
(
x0
,
b0
),
_mm_mul_ps
(
y0
,
b1
));
t0
=
_mm_add_epi32
(
_mm_cvtps_epi32
(
x0
),
preshift
);
t0
=
_mm_add_epi16
(
_mm_packs_epi32
(
t0
,
t0
),
postshift
);
_mm_storel_epi64
(
(
__m128i
*
)(
dst
+
x
),
t0
);
v_int32
t0
=
v_round
(
v_muladd
(
vx_load
(
S0
+
x
),
b0
,
vx_load
(
S1
+
x
)
*
b1
));
v_store_low
(
dst
+
x
,
v_pack_u
(
t0
,
t0
));
}
return
x
;
}
};
typedef
VResizeLinearVec_32f16
<
SHRT_MIN
>
VResizeLinearVec_32f16u
;
typedef
VResizeLinearVec_32f16
<
0
>
VResizeLinearVec_32f16s
;
struct
VResizeLinearVec_32f
struct
VResizeLinearVec_32f16s
{
int
operator
()(
const
uchar
**
_src
,
uchar
*
_dst
,
const
uchar
*
_beta
,
int
width
)
const
{
if
(
!
checkHardwareSupport
(
CV_CPU_SSE
)
)
return
0
;
const
float
**
src
=
(
const
float
**
)
_src
;
const
float
*
beta
=
(
const
float
*
)
_beta
;
const
float
*
S0
=
src
[
0
],
*
S1
=
src
[
1
];
float
*
dst
=
(
floa
t
*
)
_dst
;
short
*
dst
=
(
shor
t
*
)
_dst
;
int
x
=
0
;
__m128
b0
=
_mm_set1_ps
(
beta
[
0
]),
b1
=
_mm_set1_ps
(
beta
[
1
]);
if
(
(((
size_t
)
S0
|
(
size_t
)
S1
)
&
15
)
==
0
)
for
(
;
x
<=
width
-
8
;
x
+=
8
)
{
__m128
x0
,
x1
,
y0
,
y1
;
x0
=
_mm_load_ps
(
S0
+
x
);
x1
=
_mm_load_ps
(
S0
+
x
+
4
);
y0
=
_mm_load_ps
(
S1
+
x
);
y1
=
_mm_load_ps
(
S1
+
x
+
4
);
x0
=
_mm_add_ps
(
_mm_mul_ps
(
x0
,
b0
),
_mm_mul_ps
(
y0
,
b1
));
x1
=
_mm_add_ps
(
_mm_mul_ps
(
x1
,
b0
),
_mm_mul_ps
(
y1
,
b1
));
_mm_storeu_ps
(
dst
+
x
,
x0
);
_mm_storeu_ps
(
dst
+
x
+
4
,
x1
);
}
else
for
(
;
x
<=
width
-
8
;
x
+=
8
)
{
__m128
x0
,
x1
,
y0
,
y1
;
x0
=
_mm_loadu_ps
(
S0
+
x
);
x1
=
_mm_loadu_ps
(
S0
+
x
+
4
);
y0
=
_mm_loadu_ps
(
S1
+
x
);
y1
=
_mm_loadu_ps
(
S1
+
x
+
4
);
x0
=
_mm_add_ps
(
_mm_mul_ps
(
x0
,
b0
),
_mm_mul_ps
(
y0
,
b1
));
x1
=
_mm_add_ps
(
_mm_mul_ps
(
x1
,
b0
),
_mm_mul_ps
(
y1
,
b1
));
_mm_storeu_ps
(
dst
+
x
,
x0
);
_mm_storeu_ps
(
dst
+
x
+
4
,
x1
);
}
return
x
;
}
};
struct
VResizeCubicVec_32s8u
{
int
operator
()(
const
uchar
**
_src
,
uchar
*
dst
,
const
uchar
*
_beta
,
int
width
)
const
{
if
(
!
checkHardwareSupport
(
CV_CPU_SSE2
)
)
return
0
;
const
int
**
src
=
(
const
int
**
)
_src
;
const
short
*
beta
=
(
const
short
*
)
_beta
;
const
int
*
S0
=
src
[
0
],
*
S1
=
src
[
1
],
*
S2
=
src
[
2
],
*
S3
=
src
[
3
];
int
x
=
0
;
float
scale
=
1.
f
/
(
INTER_RESIZE_COEF_SCALE
*
INTER_RESIZE_COEF_SCALE
);
__m128
b0
=
_mm_set1_ps
(
beta
[
0
]
*
scale
),
b1
=
_mm_set1_ps
(
beta
[
1
]
*
scale
),
b2
=
_mm_set1_ps
(
beta
[
2
]
*
scale
),
b3
=
_mm_set1_ps
(
beta
[
3
]
*
scale
);
v_float32
b0
=
vx_setall_f32
(
beta
[
0
]),
b1
=
vx_setall_f32
(
beta
[
1
]);
if
(
(((
size_t
)
S0
|
(
size_t
)
S1
|
(
size_t
)
S2
|
(
size_t
)
S3
)
&
15
)
==
0
)
for
(
;
x
<=
width
-
8
;
x
+=
8
)
{
__m128i
x0
,
x1
,
y0
,
y1
;
__m128
s0
,
s1
,
f0
,
f1
;
x0
=
_mm_load_si128
((
const
__m128i
*
)(
S0
+
x
));
x1
=
_mm_load_si128
((
const
__m128i
*
)(
S0
+
x
+
4
));
y0
=
_mm_load_si128
((
const
__m128i
*
)(
S1
+
x
));
y1
=
_mm_load_si128
((
const
__m128i
*
)(
S1
+
x
+
4
));
s0
=
_mm_mul_ps
(
_mm_cvtepi32_ps
(
x0
),
b0
);
s1
=
_mm_mul_ps
(
_mm_cvtepi32_ps
(
x1
),
b0
);
f0
=
_mm_mul_ps
(
_mm_cvtepi32_ps
(
y0
),
b1
);
f1
=
_mm_mul_ps
(
_mm_cvtepi32_ps
(
y1
),
b1
);
s0
=
_mm_add_ps
(
s0
,
f0
);
s1
=
_mm_add_ps
(
s1
,
f1
);
x0
=
_mm_load_si128
((
const
__m128i
*
)(
S2
+
x
));
x1
=
_mm_load_si128
((
const
__m128i
*
)(
S2
+
x
+
4
));
y0
=
_mm_load_si128
((
const
__m128i
*
)(
S3
+
x
));
y1
=
_mm_load_si128
((
const
__m128i
*
)(
S3
+
x
+
4
));
f0
=
_mm_mul_ps
(
_mm_cvtepi32_ps
(
x0
),
b2
);
f1
=
_mm_mul_ps
(
_mm_cvtepi32_ps
(
x1
),
b2
);
s0
=
_mm_add_ps
(
s0
,
f0
);
s1
=
_mm_add_ps
(
s1
,
f1
);
f0
=
_mm_mul_ps
(
_mm_cvtepi32_ps
(
y0
),
b3
);
f1
=
_mm_mul_ps
(
_mm_cvtepi32_ps
(
y1
),
b3
);
s0
=
_mm_add_ps
(
s0
,
f0
);
s1
=
_mm_add_ps
(
s1
,
f1
);
x0
=
_mm_cvtps_epi32
(
s0
);
x1
=
_mm_cvtps_epi32
(
s1
);
x0
=
_mm_packs_epi32
(
x0
,
x1
);
_mm_storel_epi64
(
(
__m128i
*
)(
dst
+
x
),
_mm_packus_epi16
(
x0
,
x0
));
}
if
(
(((
size_t
)
S0
|
(
size_t
)
S1
)
&
(
CV_SIMD_WIDTH
-
1
))
==
0
)
for
(
;
x
<=
width
-
v_int16
::
nlanes
;
x
+=
v_int16
::
nlanes
)
v_store
(
dst
+
x
,
v_pack
(
v_round
(
v_muladd
(
vx_load_aligned
(
S0
+
x
),
b0
,
vx_load_aligned
(
S1
+
x
)
*
b1
)),
v_round
(
v_muladd
(
vx_load_aligned
(
S0
+
x
+
v_float32
::
nlanes
),
b0
,
vx_load_aligned
(
S1
+
x
+
v_float32
::
nlanes
)
*
b1
))));
else
for
(
;
x
<=
width
-
8
;
x
+=
8
)
{
__m128i
x0
,
x1
,
y0
,
y1
;
__m128
s0
,
s1
,
f0
,
f1
;
x0
=
_mm_loadu_si128
((
const
__m128i
*
)(
S0
+
x
));
x1
=
_mm_loadu_si128
((
const
__m128i
*
)(
S0
+
x
+
4
));
y0
=
_mm_loadu_si128
((
const
__m128i
*
)(
S1
+
x
));
y1
=
_mm_loadu_si128
((
const
__m128i
*
)(
S1
+
x
+
4
));
s0
=
_mm_mul_ps
(
_mm_cvtepi32_ps
(
x0
),
b0
);
s1
=
_mm_mul_ps
(
_mm_cvtepi32_ps
(
x1
),
b0
);
f0
=
_mm_mul_ps
(
_mm_cvtepi32_ps
(
y0
),
b1
);
f1
=
_mm_mul_ps
(
_mm_cvtepi32_ps
(
y1
),
b1
);
s0
=
_mm_add_ps
(
s0
,
f0
);
s1
=
_mm_add_ps
(
s1
,
f1
);
x0
=
_mm_loadu_si128
((
const
__m128i
*
)(
S2
+
x
));
x1
=
_mm_loadu_si128
((
const
__m128i
*
)(
S2
+
x
+
4
));
y0
=
_mm_loadu_si128
((
const
__m128i
*
)(
S3
+
x
));
y1
=
_mm_loadu_si128
((
const
__m128i
*
)(
S3
+
x
+
4
));
f0
=
_mm_mul_ps
(
_mm_cvtepi32_ps
(
x0
),
b2
);
f1
=
_mm_mul_ps
(
_mm_cvtepi32_ps
(
x1
),
b2
);
s0
=
_mm_add_ps
(
s0
,
f0
);
s1
=
_mm_add_ps
(
s1
,
f1
);
f0
=
_mm_mul_ps
(
_mm_cvtepi32_ps
(
y0
),
b3
);
f1
=
_mm_mul_ps
(
_mm_cvtepi32_ps
(
y1
),
b3
);
s0
=
_mm_add_ps
(
s0
,
f0
);
s1
=
_mm_add_ps
(
s1
,
f1
);
x0
=
_mm_cvtps_epi32
(
s0
);
x1
=
_mm_cvtps_epi32
(
s1
);
x0
=
_mm_packs_epi32
(
x0
,
x1
);
_mm_storel_epi64
(
(
__m128i
*
)(
dst
+
x
),
_mm_packus_epi16
(
x0
,
x0
));
}
return
x
;
}
};
template
<
int
shiftval
>
struct
VResizeCubicVec_32f16
{
int
operator
()(
const
uchar
**
_src
,
uchar
*
_dst
,
const
uchar
*
_beta
,
int
width
)
const
{
if
(
!
checkHardwareSupport
(
CV_CPU_SSE2
)
)
return
0
;
const
float
**
src
=
(
const
float
**
)
_src
;
const
float
*
beta
=
(
const
float
*
)
_beta
;
const
float
*
S0
=
src
[
0
],
*
S1
=
src
[
1
],
*
S2
=
src
[
2
],
*
S3
=
src
[
3
];
ushort
*
dst
=
(
ushort
*
)
_dst
;
int
x
=
0
;
__m128
b0
=
_mm_set1_ps
(
beta
[
0
]),
b1
=
_mm_set1_ps
(
beta
[
1
]),
b2
=
_mm_set1_ps
(
beta
[
2
]),
b3
=
_mm_set1_ps
(
beta
[
3
]);
__m128i
preshift
=
_mm_set1_epi32
(
shiftval
);
__m128i
postshift
=
_mm_set1_epi16
((
short
)
shiftval
);
for
(
;
x
<=
width
-
8
;
x
+=
8
)
for
(;
x
<=
width
-
v_int16
::
nlanes
;
x
+=
v_int16
::
nlanes
)
v_store
(
dst
+
x
,
v_pack
(
v_round
(
v_muladd
(
vx_load
(
S0
+
x
),
b0
,
vx_load
(
S1
+
x
)
*
b1
)),
v_round
(
v_muladd
(
vx_load
(
S0
+
x
+
v_float32
::
nlanes
),
b0
,
vx_load
(
S1
+
x
+
v_float32
::
nlanes
)
*
b1
))));
for
(
;
x
<
width
-
v_float32
::
nlanes
;
x
+=
v_float32
::
nlanes
)
{
__m128
x0
,
x1
,
y0
,
y1
,
s0
,
s1
;
__m128i
t0
,
t1
;
x0
=
_mm_loadu_ps
(
S0
+
x
);
x1
=
_mm_loadu_ps
(
S0
+
x
+
4
);
y0
=
_mm_loadu_ps
(
S1
+
x
);
y1
=
_mm_loadu_ps
(
S1
+
x
+
4
);
s0
=
_mm_mul_ps
(
x0
,
b0
);
s1
=
_mm_mul_ps
(
x1
,
b0
);
y0
=
_mm_mul_ps
(
y0
,
b1
);
y1
=
_mm_mul_ps
(
y1
,
b1
);
s0
=
_mm_add_ps
(
s0
,
y0
);
s1
=
_mm_add_ps
(
s1
,
y1
);
x0
=
_mm_loadu_ps
(
S2
+
x
);
x1
=
_mm_loadu_ps
(
S2
+
x
+
4
);
y0
=
_mm_loadu_ps
(
S3
+
x
);
y1
=
_mm_loadu_ps
(
S3
+
x
+
4
);
x0
=
_mm_mul_ps
(
x0
,
b2
);
x1
=
_mm_mul_ps
(
x1
,
b2
);
y0
=
_mm_mul_ps
(
y0
,
b3
);
y1
=
_mm_mul_ps
(
y1
,
b3
);
s0
=
_mm_add_ps
(
s0
,
x0
);
s1
=
_mm_add_ps
(
s1
,
x1
);
s0
=
_mm_add_ps
(
s0
,
y0
);
s1
=
_mm_add_ps
(
s1
,
y1
);
t0
=
_mm_add_epi32
(
_mm_cvtps_epi32
(
s0
),
preshift
);
t1
=
_mm_add_epi32
(
_mm_cvtps_epi32
(
s1
),
preshift
);
t0
=
_mm_add_epi16
(
_mm_packs_epi32
(
t0
,
t1
),
postshift
);
_mm_storeu_si128
(
(
__m128i
*
)(
dst
+
x
),
t0
);
v_int32
t0
=
v_round
(
v_muladd
(
vx_load
(
S0
+
x
),
b0
,
vx_load
(
S1
+
x
)
*
b1
));
v_store_low
(
dst
+
x
,
v_pack
(
t0
,
t0
));
}
return
x
;
}
};
typedef
VResizeCubicVec_32f16
<
SHRT_MIN
>
VResizeCubicVec_32f16u
;
typedef
VResizeCubicVec_32f16
<
0
>
VResizeCubicVec_32f16s
;
struct
VResizeCubicVec_32f
{
int
operator
()(
const
uchar
**
_src
,
uchar
*
_dst
,
const
uchar
*
_beta
,
int
width
)
const
{
if
(
!
checkHardwareSupport
(
CV_CPU_SSE
)
)
return
0
;
const
float
**
src
=
(
const
float
**
)
_src
;
const
float
*
beta
=
(
const
float
*
)
_beta
;
const
float
*
S0
=
src
[
0
],
*
S1
=
src
[
1
],
*
S2
=
src
[
2
],
*
S3
=
src
[
3
];
float
*
dst
=
(
float
*
)
_dst
;
int
x
=
0
;
__m128
b0
=
_mm_set1_ps
(
beta
[
0
]),
b1
=
_mm_set1_ps
(
beta
[
1
]),
b2
=
_mm_set1_ps
(
beta
[
2
]),
b3
=
_mm_set1_ps
(
beta
[
3
]);
for
(
;
x
<=
width
-
8
;
x
+=
8
)
{
__m128
x0
,
x1
,
y0
,
y1
,
s0
,
s1
;
x0
=
_mm_loadu_ps
(
S0
+
x
);
x1
=
_mm_loadu_ps
(
S0
+
x
+
4
);
y0
=
_mm_loadu_ps
(
S1
+
x
);
y1
=
_mm_loadu_ps
(
S1
+
x
+
4
);
s0
=
_mm_mul_ps
(
x0
,
b0
);
s1
=
_mm_mul_ps
(
x1
,
b0
);
y0
=
_mm_mul_ps
(
y0
,
b1
);
y1
=
_mm_mul_ps
(
y1
,
b1
);
s0
=
_mm_add_ps
(
s0
,
y0
);
s1
=
_mm_add_ps
(
s1
,
y1
);
x0
=
_mm_loadu_ps
(
S2
+
x
);
x1
=
_mm_loadu_ps
(
S2
+
x
+
4
);
y0
=
_mm_loadu_ps
(
S3
+
x
);
y1
=
_mm_loadu_ps
(
S3
+
x
+
4
);
x0
=
_mm_mul_ps
(
x0
,
b2
);
x1
=
_mm_mul_ps
(
x1
,
b2
);
y0
=
_mm_mul_ps
(
y0
,
b3
);
y1
=
_mm_mul_ps
(
y1
,
b3
);
s0
=
_mm_add_ps
(
s0
,
x0
);
s1
=
_mm_add_ps
(
s1
,
x1
);
s0
=
_mm_add_ps
(
s0
,
y0
);
s1
=
_mm_add_ps
(
s1
,
y1
);
_mm_storeu_ps
(
dst
+
x
,
s0
);
_mm_storeu_ps
(
dst
+
x
+
4
,
s1
);
}
return
x
;
}
};
#if CV_TRY_SSE4_1
struct
VResizeLanczos4Vec_32f16u
{
int
operator
()(
const
uchar
**
_src
,
uchar
*
_dst
,
const
uchar
*
_beta
,
int
width
)
const
{
if
(
CV_CPU_HAS_SUPPORT_SSE4_1
)
return
opt_SSE4_1
::
VResizeLanczos4Vec_32f16u_SSE41
(
_src
,
_dst
,
_beta
,
width
);
else
return
0
;
}
};
#else
typedef
VResizeNoVec
VResizeLanczos4Vec_32f16u
;
#endif
struct
VResizeLanczos4Vec_32f16s
{
int
operator
()(
const
uchar
**
_src
,
uchar
*
_dst
,
const
uchar
*
_beta
,
int
width
)
const
{
const
float
**
src
=
(
const
float
**
)
_src
;
const
float
*
beta
=
(
const
float
*
)
_beta
;
const
float
*
S0
=
src
[
0
],
*
S1
=
src
[
1
],
*
S2
=
src
[
2
],
*
S3
=
src
[
3
],
*
S4
=
src
[
4
],
*
S5
=
src
[
5
],
*
S6
=
src
[
6
],
*
S7
=
src
[
7
];
short
*
dst
=
(
short
*
)
_dst
;
int
x
=
0
;
__m128
v_b0
=
_mm_set1_ps
(
beta
[
0
]),
v_b1
=
_mm_set1_ps
(
beta
[
1
]),
v_b2
=
_mm_set1_ps
(
beta
[
2
]),
v_b3
=
_mm_set1_ps
(
beta
[
3
]),
v_b4
=
_mm_set1_ps
(
beta
[
4
]),
v_b5
=
_mm_set1_ps
(
beta
[
5
]),
v_b6
=
_mm_set1_ps
(
beta
[
6
]),
v_b7
=
_mm_set1_ps
(
beta
[
7
]);
for
(
;
x
<=
width
-
8
;
x
+=
8
)
{
__m128
v_dst0
=
_mm_mul_ps
(
v_b0
,
_mm_loadu_ps
(
S0
+
x
));
v_dst0
=
_mm_add_ps
(
v_dst0
,
_mm_mul_ps
(
v_b1
,
_mm_loadu_ps
(
S1
+
x
)));
v_dst0
=
_mm_add_ps
(
v_dst0
,
_mm_mul_ps
(
v_b2
,
_mm_loadu_ps
(
S2
+
x
)));
v_dst0
=
_mm_add_ps
(
v_dst0
,
_mm_mul_ps
(
v_b3
,
_mm_loadu_ps
(
S3
+
x
)));
v_dst0
=
_mm_add_ps
(
v_dst0
,
_mm_mul_ps
(
v_b4
,
_mm_loadu_ps
(
S4
+
x
)));
v_dst0
=
_mm_add_ps
(
v_dst0
,
_mm_mul_ps
(
v_b5
,
_mm_loadu_ps
(
S5
+
x
)));
v_dst0
=
_mm_add_ps
(
v_dst0
,
_mm_mul_ps
(
v_b6
,
_mm_loadu_ps
(
S6
+
x
)));
v_dst0
=
_mm_add_ps
(
v_dst0
,
_mm_mul_ps
(
v_b7
,
_mm_loadu_ps
(
S7
+
x
)));
__m128
v_dst1
=
_mm_mul_ps
(
v_b0
,
_mm_loadu_ps
(
S0
+
x
+
4
));
v_dst1
=
_mm_add_ps
(
v_dst1
,
_mm_mul_ps
(
v_b1
,
_mm_loadu_ps
(
S1
+
x
+
4
)));
v_dst1
=
_mm_add_ps
(
v_dst1
,
_mm_mul_ps
(
v_b2
,
_mm_loadu_ps
(
S2
+
x
+
4
)));
v_dst1
=
_mm_add_ps
(
v_dst1
,
_mm_mul_ps
(
v_b3
,
_mm_loadu_ps
(
S3
+
x
+
4
)));
v_dst1
=
_mm_add_ps
(
v_dst1
,
_mm_mul_ps
(
v_b4
,
_mm_loadu_ps
(
S4
+
x
+
4
)));
v_dst1
=
_mm_add_ps
(
v_dst1
,
_mm_mul_ps
(
v_b5
,
_mm_loadu_ps
(
S5
+
x
+
4
)));
v_dst1
=
_mm_add_ps
(
v_dst1
,
_mm_mul_ps
(
v_b6
,
_mm_loadu_ps
(
S6
+
x
+
4
)));
v_dst1
=
_mm_add_ps
(
v_dst1
,
_mm_mul_ps
(
v_b7
,
_mm_loadu_ps
(
S7
+
x
+
4
)));
__m128i
v_dsti0
=
_mm_cvtps_epi32
(
v_dst0
);
__m128i
v_dsti1
=
_mm_cvtps_epi32
(
v_dst1
);
_mm_storeu_si128
((
__m128i
*
)(
dst
+
x
),
_mm_packs_epi32
(
v_dsti0
,
v_dsti1
));
}
return
x
;
}
};
struct
VResizeLanczos4Vec_32f
struct
VResizeLinearVec_32f
{
int
operator
()(
const
uchar
**
_src
,
uchar
*
_dst
,
const
uchar
*
_beta
,
int
width
)
const
{
const
float
**
src
=
(
const
float
**
)
_src
;
const
float
*
beta
=
(
const
float
*
)
_beta
;
const
float
*
S0
=
src
[
0
],
*
S1
=
src
[
1
],
*
S2
=
src
[
2
],
*
S3
=
src
[
3
],
*
S4
=
src
[
4
],
*
S5
=
src
[
5
],
*
S6
=
src
[
6
],
*
S7
=
src
[
7
];
const
float
*
S0
=
src
[
0
],
*
S1
=
src
[
1
];
float
*
dst
=
(
float
*
)
_dst
;
int
x
=
0
;
__m128
v_b0
=
_mm_set1_ps
(
beta
[
0
]),
v_b1
=
_mm_set1_ps
(
beta
[
1
]),
v_b2
=
_mm_set1_ps
(
beta
[
2
]),
v_b3
=
_mm_set1_ps
(
beta
[
3
]),
v_b4
=
_mm_set1_ps
(
beta
[
4
]),
v_b5
=
_mm_set1_ps
(
beta
[
5
]),
v_b6
=
_mm_set1_ps
(
beta
[
6
]),
v_b7
=
_mm_set1_ps
(
beta
[
7
]);
v_float32
b0
=
vx_setall_f32
(
beta
[
0
]),
b1
=
vx_setall_f32
(
beta
[
1
]);
for
(
;
x
<=
width
-
4
;
x
+=
4
)
{
__m128
v_dst
=
_mm_mul_ps
(
v_b0
,
_mm_loadu_ps
(
S0
+
x
));
v_dst
=
_mm_add_ps
(
v_dst
,
_mm_mul_ps
(
v_b1
,
_mm_loadu_ps
(
S1
+
x
)));
v_dst
=
_mm_add_ps
(
v_dst
,
_mm_mul_ps
(
v_b2
,
_mm_loadu_ps
(
S2
+
x
)));
v_dst
=
_mm_add_ps
(
v_dst
,
_mm_mul_ps
(
v_b3
,
_mm_loadu_ps
(
S3
+
x
)));
v_dst
=
_mm_add_ps
(
v_dst
,
_mm_mul_ps
(
v_b4
,
_mm_loadu_ps
(
S4
+
x
)));
v_dst
=
_mm_add_ps
(
v_dst
,
_mm_mul_ps
(
v_b5
,
_mm_loadu_ps
(
S5
+
x
)));
v_dst
=
_mm_add_ps
(
v_dst
,
_mm_mul_ps
(
v_b6
,
_mm_loadu_ps
(
S6
+
x
)));
v_dst
=
_mm_add_ps
(
v_dst
,
_mm_mul_ps
(
v_b7
,
_mm_loadu_ps
(
S7
+
x
)));
_mm_storeu_ps
(
dst
+
x
,
v_dst
);
}
if
(
(((
size_t
)
S0
|
(
size_t
)
S1
)
&
(
CV_SIMD_WIDTH
-
1
))
==
0
)
for
(
;
x
<=
width
-
v_float32
::
nlanes
;
x
+=
v_float32
::
nlanes
)
v_store
(
dst
+
x
,
v_muladd
(
vx_load_aligned
(
S0
+
x
),
b0
,
vx_load_aligned
(
S1
+
x
)
*
b1
));
else
for
(
;
x
<=
width
-
v_float32
::
nlanes
;
x
+=
v_float32
::
nlanes
)
v_store
(
dst
+
x
,
v_muladd
(
vx_load
(
S0
+
x
),
b0
,
vx_load
(
S1
+
x
)
*
b1
));
return
x
;
}
};
#elif CV_NEON
struct
VResizeLinearVec_32s8u
struct
VResizeCubicVec_32s8u
{
int
operator
()(
const
uchar
**
_src
,
uchar
*
dst
,
const
uchar
*
_beta
,
int
width
)
const
{
const
int
**
src
=
(
const
int
**
)
_src
,
*
S0
=
src
[
0
],
*
S1
=
src
[
1
]
;
const
int
**
src
=
(
const
int
**
)
_src
;
const
short
*
beta
=
(
const
short
*
)
_beta
;
const
int
*
S0
=
src
[
0
],
*
S1
=
src
[
1
],
*
S2
=
src
[
2
],
*
S3
=
src
[
3
];
int
x
=
0
;
int16x8_t
v_b0
=
vdupq_n_s16
(
beta
[
0
]),
v_b1
=
vdupq_n_s16
(
beta
[
1
]),
v_delta
=
vdupq_n_s16
(
2
);
for
(
;
x
<=
width
-
16
;
x
+=
16
)
{
int32x4_t
v_src00
=
vshrq_n_s32
(
vld1q_s32
(
S0
+
x
),
4
),
v_src10
=
vshrq_n_s32
(
vld1q_s32
(
S1
+
x
),
4
);
int32x4_t
v_src01
=
vshrq_n_s32
(
vld1q_s32
(
S0
+
x
+
4
),
4
),
v_src11
=
vshrq_n_s32
(
vld1q_s32
(
S1
+
x
+
4
),
4
);
int16x8_t
v_src0
=
vcombine_s16
(
vmovn_s32
(
v_src00
),
vmovn_s32
(
v_src01
));
int16x8_t
v_src1
=
vcombine_s16
(
vmovn_s32
(
v_src10
),
vmovn_s32
(
v_src11
));
int16x8_t
v_dst0
=
vaddq_s16
(
vshrq_n_s16
(
vqdmulhq_s16
(
v_src0
,
v_b0
),
1
),
vshrq_n_s16
(
vqdmulhq_s16
(
v_src1
,
v_b1
),
1
));
v_dst0
=
vshrq_n_s16
(
vaddq_s16
(
v_dst0
,
v_delta
),
2
);
v_src00
=
vshrq_n_s32
(
vld1q_s32
(
S0
+
x
+
8
),
4
);
v_src10
=
vshrq_n_s32
(
vld1q_s32
(
S1
+
x
+
8
),
4
);
v_src01
=
vshrq_n_s32
(
vld1q_s32
(
S0
+
x
+
12
),
4
);
v_src11
=
vshrq_n_s32
(
vld1q_s32
(
S1
+
x
+
12
),
4
);
v_src0
=
vcombine_s16
(
vmovn_s32
(
v_src00
),
vmovn_s32
(
v_src01
));
v_src1
=
vcombine_s16
(
vmovn_s32
(
v_src10
),
vmovn_s32
(
v_src11
));
int16x8_t
v_dst1
=
vaddq_s16
(
vshrq_n_s16
(
vqdmulhq_s16
(
v_src0
,
v_b0
),
1
),
vshrq_n_s16
(
vqdmulhq_s16
(
v_src1
,
v_b1
),
1
));
v_dst1
=
vshrq_n_s16
(
vaddq_s16
(
v_dst1
,
v_delta
),
2
);
vst1q_u8
(
dst
+
x
,
vcombine_u8
(
vqmovun_s16
(
v_dst0
),
vqmovun_s16
(
v_dst1
)));
}
float
scale
=
1.
f
/
(
INTER_RESIZE_COEF_SCALE
*
INTER_RESIZE_COEF_SCALE
);
v_float32
b0
=
vx_setall_f32
(
beta
[
0
]
*
scale
),
b1
=
vx_setall_f32
(
beta
[
1
]
*
scale
),
b2
=
vx_setall_f32
(
beta
[
2
]
*
scale
),
b3
=
vx_setall_f32
(
beta
[
3
]
*
scale
);
if
(
(((
size_t
)
S0
|
(
size_t
)
S1
|
(
size_t
)
S2
|
(
size_t
)
S3
)
&
(
CV_SIMD_WIDTH
-
1
))
==
0
)
for
(
;
x
<=
width
-
v_int16
::
nlanes
;
x
+=
v_int16
::
nlanes
)
v_pack_u_store
(
dst
+
x
,
v_pack
(
v_round
(
v_muladd
(
v_cvt_f32
(
vx_load_aligned
(
S0
+
x
)),
b0
,
v_muladd
(
v_cvt_f32
(
vx_load_aligned
(
S1
+
x
)),
b1
,
v_muladd
(
v_cvt_f32
(
vx_load_aligned
(
S2
+
x
)),
b2
,
v_cvt_f32
(
vx_load_aligned
(
S3
+
x
))
*
b3
)))),
v_round
(
v_muladd
(
v_cvt_f32
(
vx_load_aligned
(
S0
+
x
+
v_float32
::
nlanes
)),
b0
,
v_muladd
(
v_cvt_f32
(
vx_load_aligned
(
S1
+
x
+
v_float32
::
nlanes
)),
b1
,
v_muladd
(
v_cvt_f32
(
vx_load_aligned
(
S2
+
x
+
v_float32
::
nlanes
)),
b2
,
v_cvt_f32
(
vx_load_aligned
(
S3
+
x
+
v_float32
::
nlanes
))
*
b3
))))));
else
for
(
;
x
<=
width
-
v_int16
::
nlanes
;
x
+=
v_int16
::
nlanes
)
v_pack_u_store
(
dst
+
x
,
v_pack
(
v_round
(
v_muladd
(
v_cvt_f32
(
vx_load
(
S0
+
x
)),
b0
,
v_muladd
(
v_cvt_f32
(
vx_load
(
S1
+
x
)),
b1
,
v_muladd
(
v_cvt_f32
(
vx_load
(
S2
+
x
)),
b2
,
v_cvt_f32
(
vx_load
(
S3
+
x
))
*
b3
)))),
v_round
(
v_muladd
(
v_cvt_f32
(
vx_load
(
S0
+
x
+
v_float32
::
nlanes
)),
b0
,
v_muladd
(
v_cvt_f32
(
vx_load
(
S1
+
x
+
v_float32
::
nlanes
)),
b1
,
v_muladd
(
v_cvt_f32
(
vx_load
(
S2
+
x
+
v_float32
::
nlanes
)),
b2
,
v_cvt_f32
(
vx_load
(
S3
+
x
+
v_float32
::
nlanes
))
*
b3
))))));
return
x
;
}
};
struct
VResize
Linear
Vec_32f16u
struct
VResize
Cubic
Vec_32f16u
{
int
operator
()(
const
uchar
**
_src
,
uchar
*
_dst
,
const
uchar
*
_beta
,
int
width
)
const
{
const
float
**
src
=
(
const
float
**
)
_src
;
const
float
*
beta
=
(
const
float
*
)
_beta
;
const
float
*
S0
=
src
[
0
],
*
S1
=
src
[
1
];
const
float
*
S0
=
src
[
0
],
*
S1
=
src
[
1
]
,
*
S2
=
src
[
2
],
*
S3
=
src
[
3
]
;
ushort
*
dst
=
(
ushort
*
)
_dst
;
int
x
=
0
;
float32x4_t
v_b0
=
vdupq_n_f32
(
beta
[
0
]),
v_b1
=
vdupq_n_f32
(
beta
[
1
]);
for
(
;
x
<=
width
-
8
;
x
+=
8
)
{
float32x4_t
v_src00
=
vld1q_f32
(
S0
+
x
),
v_src01
=
vld1q_f32
(
S0
+
x
+
4
);
float32x4_t
v_src10
=
vld1q_f32
(
S1
+
x
),
v_src11
=
vld1q_f32
(
S1
+
x
+
4
);
float32x4_t
v_dst0
=
vmlaq_f32
(
vmulq_f32
(
v_src00
,
v_b0
),
v_src10
,
v_b1
);
float32x4_t
v_dst1
=
vmlaq_f32
(
vmulq_f32
(
v_src01
,
v_b0
),
v_src11
,
v_b1
);
vst1q_u16
(
dst
+
x
,
vcombine_u16
(
vqmovn_u32
(
cv_vrndq_u32_f32
(
v_dst0
)),
vqmovn_u32
(
cv_vrndq_u32_f32
(
v_dst1
))));
}
v_float32
b0
=
vx_setall_f32
(
beta
[
0
]),
b1
=
vx_setall_f32
(
beta
[
1
]),
b2
=
vx_setall_f32
(
beta
[
2
]),
b3
=
vx_setall_f32
(
beta
[
3
]);
for
(;
x
<=
width
-
v_uint16
::
nlanes
;
x
+=
v_uint16
::
nlanes
)
v_store
(
dst
+
x
,
v_pack_u
(
v_round
(
v_muladd
(
vx_load
(
S0
+
x
),
b0
,
v_muladd
(
vx_load
(
S1
+
x
),
b1
,
v_muladd
(
vx_load
(
S2
+
x
),
b2
,
vx_load
(
S3
+
x
)
*
b3
)))),
v_round
(
v_muladd
(
vx_load
(
S0
+
x
+
v_float32
::
nlanes
),
b0
,
v_muladd
(
vx_load
(
S1
+
x
+
v_float32
::
nlanes
),
b1
,
v_muladd
(
vx_load
(
S2
+
x
+
v_float32
::
nlanes
),
b2
,
vx_load
(
S3
+
x
+
v_float32
::
nlanes
)
*
b3
))))));
return
x
;
}
};
struct
VResize
Linear
Vec_32f16s
struct
VResize
Cubic
Vec_32f16s
{
int
operator
()(
const
uchar
**
_src
,
uchar
*
_dst
,
const
uchar
*
_beta
,
int
width
)
const
{
const
float
**
src
=
(
const
float
**
)
_src
;
const
float
*
beta
=
(
const
float
*
)
_beta
;
const
float
*
S0
=
src
[
0
],
*
S1
=
src
[
1
];
const
float
*
S0
=
src
[
0
],
*
S1
=
src
[
1
]
,
*
S2
=
src
[
2
],
*
S3
=
src
[
3
]
;
short
*
dst
=
(
short
*
)
_dst
;
int
x
=
0
;
float32x4_t
v_b0
=
vdupq_n_f32
(
beta
[
0
]),
v_b1
=
vdupq_n_f32
(
beta
[
1
]);
for
(
;
x
<=
width
-
8
;
x
+=
8
)
{
float32x4_t
v_src00
=
vld1q_f32
(
S0
+
x
),
v_src01
=
vld1q_f32
(
S0
+
x
+
4
);
float32x4_t
v_src10
=
vld1q_f32
(
S1
+
x
),
v_src11
=
vld1q_f32
(
S1
+
x
+
4
);
float32x4_t
v_dst0
=
vmlaq_f32
(
vmulq_f32
(
v_src00
,
v_b0
),
v_src10
,
v_b1
);
float32x4_t
v_dst1
=
vmlaq_f32
(
vmulq_f32
(
v_src01
,
v_b0
),
v_src11
,
v_b1
);
vst1q_s16
(
dst
+
x
,
vcombine_s16
(
vqmovn_s32
(
cv_vrndq_s32_f32
(
v_dst0
)),
vqmovn_s32
(
cv_vrndq_s32_f32
(
v_dst1
))));
}
v_float32
b0
=
vx_setall_f32
(
beta
[
0
]),
b1
=
vx_setall_f32
(
beta
[
1
]),
b2
=
vx_setall_f32
(
beta
[
2
]),
b3
=
vx_setall_f32
(
beta
[
3
]);
for
(;
x
<=
width
-
v_int16
::
nlanes
;
x
+=
v_int16
::
nlanes
)
v_store
(
dst
+
x
,
v_pack
(
v_round
(
v_muladd
(
vx_load
(
S0
+
x
),
b0
,
v_muladd
(
vx_load
(
S1
+
x
),
b1
,
v_muladd
(
vx_load
(
S2
+
x
),
b2
,
vx_load
(
S3
+
x
)
*
b3
)))),
v_round
(
v_muladd
(
vx_load
(
S0
+
x
+
v_float32
::
nlanes
),
b0
,
v_muladd
(
vx_load
(
S1
+
x
+
v_float32
::
nlanes
),
b1
,
v_muladd
(
vx_load
(
S2
+
x
+
v_float32
::
nlanes
),
b2
,
vx_load
(
S3
+
x
+
v_float32
::
nlanes
)
*
b3
))))));
return
x
;
}
};
struct
VResize
Linear
Vec_32f
struct
VResize
Cubic
Vec_32f
{
int
operator
()(
const
uchar
**
_src
,
uchar
*
_dst
,
const
uchar
*
_beta
,
int
width
)
const
{
const
float
**
src
=
(
const
float
**
)
_src
;
const
float
*
beta
=
(
const
float
*
)
_beta
;
const
float
*
S0
=
src
[
0
],
*
S1
=
src
[
1
];
const
float
*
S0
=
src
[
0
],
*
S1
=
src
[
1
]
,
*
S2
=
src
[
2
],
*
S3
=
src
[
3
]
;
float
*
dst
=
(
float
*
)
_dst
;
int
x
=
0
;
v_float32
b0
=
vx_setall_f32
(
beta
[
0
]),
b1
=
vx_setall_f32
(
beta
[
1
]),
b2
=
vx_setall_f32
(
beta
[
2
]),
b3
=
vx_setall_f32
(
beta
[
3
]);
float32x4_t
v_b0
=
vdupq_n_f32
(
beta
[
0
]),
v_b1
=
vdupq_n_f32
(
beta
[
1
]);
for
(
;
x
<=
width
-
8
;
x
+=
8
)
{
float32x4_t
v_src00
=
vld1q_f32
(
S0
+
x
),
v_src01
=
vld1q_f32
(
S0
+
x
+
4
);
float32x4_t
v_src10
=
vld1q_f32
(
S1
+
x
),
v_src11
=
vld1q_f32
(
S1
+
x
+
4
);
vst1q_f32
(
dst
+
x
,
vmlaq_f32
(
vmulq_f32
(
v_src00
,
v_b0
),
v_src10
,
v_b1
));
vst1q_f32
(
dst
+
x
+
4
,
vmlaq_f32
(
vmulq_f32
(
v_src01
,
v_b0
),
v_src11
,
v_b1
));
}
for
(
;
x
<=
width
-
v_float32
::
nlanes
;
x
+=
v_float32
::
nlanes
)
v_store
(
dst
+
x
,
v_muladd
(
vx_load
(
S0
+
x
),
b0
,
v_muladd
(
vx_load
(
S1
+
x
),
b1
,
v_muladd
(
vx_load
(
S2
+
x
),
b2
,
vx_load
(
S3
+
x
)
*
b3
))));
return
x
;
}
};
typedef
VResizeNoVec
VResizeCubicVec_32s8u
;
struct
VResizeCubicVec_32f16u
{
int
operator
()(
const
uchar
**
_src
,
uchar
*
_dst
,
const
uchar
*
_beta
,
int
width
)
const
{
const
float
**
src
=
(
const
float
**
)
_src
;
const
float
*
beta
=
(
const
float
*
)
_beta
;
const
float
*
S0
=
src
[
0
],
*
S1
=
src
[
1
],
*
S2
=
src
[
2
],
*
S3
=
src
[
3
];
ushort
*
dst
=
(
ushort
*
)
_dst
;
int
x
=
0
;
float32x4_t
v_b0
=
vdupq_n_f32
(
beta
[
0
]),
v_b1
=
vdupq_n_f32
(
beta
[
1
]),
v_b2
=
vdupq_n_f32
(
beta
[
2
]),
v_b3
=
vdupq_n_f32
(
beta
[
3
]);
for
(
;
x
<=
width
-
8
;
x
+=
8
)
{
float32x4_t
v_dst0
=
vmlaq_f32
(
vmlaq_f32
(
vmlaq_f32
(
vmulq_f32
(
v_b0
,
vld1q_f32
(
S0
+
x
)),
v_b1
,
vld1q_f32
(
S1
+
x
)),
v_b2
,
vld1q_f32
(
S2
+
x
)),
v_b3
,
vld1q_f32
(
S3
+
x
));
float32x4_t
v_dst1
=
vmlaq_f32
(
vmlaq_f32
(
vmlaq_f32
(
vmulq_f32
(
v_b0
,
vld1q_f32
(
S0
+
x
+
4
)),
v_b1
,
vld1q_f32
(
S1
+
x
+
4
)),
v_b2
,
vld1q_f32
(
S2
+
x
+
4
)),
v_b3
,
vld1q_f32
(
S3
+
x
+
4
));
vst1q_u16
(
dst
+
x
,
vcombine_u16
(
vqmovn_u32
(
cv_vrndq_u32_f32
(
v_dst0
)),
vqmovn_u32
(
cv_vrndq_u32_f32
(
v_dst1
))));
}
return
x
;
}
};
#if CV_TRY_SSE4_1
struct
VResize
CubicVec_32f16s
struct
VResize
Lanczos4Vec_32f16u
{
int
operator
()(
const
uchar
**
_src
,
uchar
*
_dst
,
const
uchar
*
_beta
,
int
width
)
const
{
const
float
**
src
=
(
const
float
**
)
_src
;
const
float
*
beta
=
(
const
float
*
)
_beta
;
const
float
*
S0
=
src
[
0
],
*
S1
=
src
[
1
],
*
S2
=
src
[
2
],
*
S3
=
src
[
3
];
short
*
dst
=
(
short
*
)
_dst
;
int
x
=
0
;
float32x4_t
v_b0
=
vdupq_n_f32
(
beta
[
0
]),
v_b1
=
vdupq_n_f32
(
beta
[
1
]),
v_b2
=
vdupq_n_f32
(
beta
[
2
]),
v_b3
=
vdupq_n_f32
(
beta
[
3
]);
for
(
;
x
<=
width
-
8
;
x
+=
8
)
{
float32x4_t
v_dst0
=
vmlaq_f32
(
vmlaq_f32
(
vmlaq_f32
(
vmulq_f32
(
v_b0
,
vld1q_f32
(
S0
+
x
)),
v_b1
,
vld1q_f32
(
S1
+
x
)),
v_b2
,
vld1q_f32
(
S2
+
x
)),
v_b3
,
vld1q_f32
(
S3
+
x
));
float32x4_t
v_dst1
=
vmlaq_f32
(
vmlaq_f32
(
vmlaq_f32
(
vmulq_f32
(
v_b0
,
vld1q_f32
(
S0
+
x
+
4
)),
v_b1
,
vld1q_f32
(
S1
+
x
+
4
)),
v_b2
,
vld1q_f32
(
S2
+
x
+
4
)),
v_b3
,
vld1q_f32
(
S3
+
x
+
4
));
vst1q_s16
(
dst
+
x
,
vcombine_s16
(
vqmovn_s32
(
cv_vrndq_s32_f32
(
v_dst0
)),
vqmovn_s32
(
cv_vrndq_s32_f32
(
v_dst1
))));
}
return
x
;
if
(
CV_CPU_HAS_SUPPORT_SSE4_1
)
return
opt_SSE4_1
::
VResizeLanczos4Vec_32f16u_SSE41
(
_src
,
_dst
,
_beta
,
width
);
else
return
0
;
}
};
struct
VResizeCubicVec_32f
{
int
operator
()(
const
uchar
**
_src
,
uchar
*
_dst
,
const
uchar
*
_beta
,
int
width
)
const
{
const
float
**
src
=
(
const
float
**
)
_src
;
const
float
*
beta
=
(
const
float
*
)
_beta
;
const
float
*
S0
=
src
[
0
],
*
S1
=
src
[
1
],
*
S2
=
src
[
2
],
*
S3
=
src
[
3
];
float
*
dst
=
(
float
*
)
_dst
;
int
x
=
0
;
float32x4_t
v_b0
=
vdupq_n_f32
(
beta
[
0
]),
v_b1
=
vdupq_n_f32
(
beta
[
1
]),
v_b2
=
vdupq_n_f32
(
beta
[
2
]),
v_b3
=
vdupq_n_f32
(
beta
[
3
]);
for
(
;
x
<=
width
-
8
;
x
+=
8
)
{
vst1q_f32
(
dst
+
x
,
vmlaq_f32
(
vmlaq_f32
(
vmlaq_f32
(
vmulq_f32
(
v_b0
,
vld1q_f32
(
S0
+
x
)),
v_b1
,
vld1q_f32
(
S1
+
x
)),
v_b2
,
vld1q_f32
(
S2
+
x
)),
v_b3
,
vld1q_f32
(
S3
+
x
)));
vst1q_f32
(
dst
+
x
+
4
,
vmlaq_f32
(
vmlaq_f32
(
vmlaq_f32
(
vmulq_f32
(
v_b0
,
vld1q_f32
(
S0
+
x
+
4
)),
v_b1
,
vld1q_f32
(
S1
+
x
+
4
)),
v_b2
,
vld1q_f32
(
S2
+
x
+
4
)),
v_b3
,
vld1q_f32
(
S3
+
x
+
4
)));
}
return
x
;
}
};
#else
struct
VResizeLanczos4Vec_32f16u
{
...
...
@@ -1946,41 +1433,35 @@ struct VResizeLanczos4Vec_32f16u
*
S4
=
src
[
4
],
*
S5
=
src
[
5
],
*
S6
=
src
[
6
],
*
S7
=
src
[
7
];
ushort
*
dst
=
(
ushort
*
)
_dst
;
int
x
=
0
;
float32x4_t
v_b0
=
vdupq_n_f32
(
beta
[
0
]),
v_b1
=
vdupq_n_f32
(
beta
[
1
]),
v_b2
=
vdupq_n_f32
(
beta
[
2
]),
v_b3
=
vdupq_n_f32
(
beta
[
3
]),
v_b4
=
vdupq_n_f32
(
beta
[
4
]),
v_b5
=
vdupq_n_f32
(
beta
[
5
]),
v_b6
=
vdupq_n_f32
(
beta
[
6
]),
v_b7
=
vdupq_n_f32
(
beta
[
7
]);
for
(
;
x
<=
width
-
8
;
x
+=
8
)
{
float32x4_t
v_dst0
=
vmlaq_f32
(
vmlaq_f32
(
vmlaq_f32
(
vmulq_f32
(
v_b0
,
vld1q_f32
(
S0
+
x
)),
v_b1
,
vld1q_f32
(
S1
+
x
)),
v_b2
,
vld1q_f32
(
S2
+
x
)),
v_b3
,
vld1q_f32
(
S3
+
x
));
float32x4_t
v_dst1
=
vmlaq_f32
(
vmlaq_f32
(
vmlaq_f32
(
vmulq_f32
(
v_b4
,
vld1q_f32
(
S4
+
x
)),
v_b5
,
vld1q_f32
(
S5
+
x
)),
v_b6
,
vld1q_f32
(
S6
+
x
)),
v_b7
,
vld1q_f32
(
S7
+
x
));
float32x4_t
v_dst
=
vaddq_f32
(
v_dst0
,
v_dst1
);
v_dst0
=
vmlaq_f32
(
vmlaq_f32
(
vmlaq_f32
(
vmulq_f32
(
v_b0
,
vld1q_f32
(
S0
+
x
+
4
)),
v_b1
,
vld1q_f32
(
S1
+
x
+
4
)),
v_b2
,
vld1q_f32
(
S2
+
x
+
4
)),
v_b3
,
vld1q_f32
(
S3
+
x
+
4
));
v_dst1
=
vmlaq_f32
(
vmlaq_f32
(
vmlaq_f32
(
vmulq_f32
(
v_b4
,
vld1q_f32
(
S4
+
x
+
4
)),
v_b5
,
vld1q_f32
(
S5
+
x
+
4
)),
v_b6
,
vld1q_f32
(
S6
+
x
+
4
)),
v_b7
,
vld1q_f32
(
S7
+
x
+
4
));
v_dst1
=
vaddq_f32
(
v_dst0
,
v_dst1
);
vst1q_u16
(
dst
+
x
,
vcombine_u16
(
vqmovn_u32
(
cv_vrndq_u32_f32
(
v_dst
)),
vqmovn_u32
(
cv_vrndq_u32_f32
(
v_dst1
))));
}
v_float32
b0
=
vx_setall_f32
(
beta
[
0
]),
b1
=
vx_setall_f32
(
beta
[
1
]),
b2
=
vx_setall_f32
(
beta
[
2
]),
b3
=
vx_setall_f32
(
beta
[
3
]),
b4
=
vx_setall_f32
(
beta
[
4
]),
b5
=
vx_setall_f32
(
beta
[
5
]),
b6
=
vx_setall_f32
(
beta
[
6
]),
b7
=
vx_setall_f32
(
beta
[
7
]);
for
(
;
x
<=
width
-
v_uint16
::
nlanes
;
x
+=
v_uint16
::
nlanes
)
v_store
(
dst
+
x
,
v_pack_u
(
v_round
(
v_muladd
(
vx_load
(
S0
+
x
),
b0
,
v_muladd
(
vx_load
(
S1
+
x
),
b1
,
v_muladd
(
vx_load
(
S2
+
x
),
b2
,
v_muladd
(
vx_load
(
S3
+
x
),
b3
,
v_muladd
(
vx_load
(
S4
+
x
),
b4
,
v_muladd
(
vx_load
(
S5
+
x
),
b5
,
v_muladd
(
vx_load
(
S6
+
x
),
b6
,
vx_load
(
S7
+
x
)
*
b7
)))))))),
v_round
(
v_muladd
(
vx_load
(
S0
+
x
+
v_float32
::
nlanes
),
b0
,
v_muladd
(
vx_load
(
S1
+
x
+
v_float32
::
nlanes
),
b1
,
v_muladd
(
vx_load
(
S2
+
x
+
v_float32
::
nlanes
),
b2
,
v_muladd
(
vx_load
(
S3
+
x
+
v_float32
::
nlanes
),
b3
,
v_muladd
(
vx_load
(
S4
+
x
+
v_float32
::
nlanes
),
b4
,
v_muladd
(
vx_load
(
S5
+
x
+
v_float32
::
nlanes
),
b5
,
v_muladd
(
vx_load
(
S6
+
x
+
v_float32
::
nlanes
),
b6
,
vx_load
(
S7
+
x
+
v_float32
::
nlanes
)
*
b7
))))))))));
return
x
;
}
};
#endif
struct
VResizeLanczos4Vec_32f16s
{
int
operator
()(
const
uchar
**
_src
,
uchar
*
_dst
,
const
uchar
*
_beta
,
int
width
)
const
...
...
@@ -1991,36 +1472,28 @@ struct VResizeLanczos4Vec_32f16s
*
S4
=
src
[
4
],
*
S5
=
src
[
5
],
*
S6
=
src
[
6
],
*
S7
=
src
[
7
];
short
*
dst
=
(
short
*
)
_dst
;
int
x
=
0
;
float32x4_t
v_b0
=
vdupq_n_f32
(
beta
[
0
]),
v_b1
=
vdupq_n_f32
(
beta
[
1
]),
v_b2
=
vdupq_n_f32
(
beta
[
2
]),
v_b3
=
vdupq_n_f32
(
beta
[
3
]),
v_b4
=
vdupq_n_f32
(
beta
[
4
]),
v_b5
=
vdupq_n_f32
(
beta
[
5
]),
v_b6
=
vdupq_n_f32
(
beta
[
6
]),
v_b7
=
vdupq_n_f32
(
beta
[
7
]);
for
(
;
x
<=
width
-
8
;
x
+=
8
)
{
float32x4_t
v_dst0
=
vmlaq_f32
(
vmlaq_f32
(
vmlaq_f32
(
vmulq_f32
(
v_b0
,
vld1q_f32
(
S0
+
x
)),
v_b1
,
vld1q_f32
(
S1
+
x
)),
v_b2
,
vld1q_f32
(
S2
+
x
)),
v_b3
,
vld1q_f32
(
S3
+
x
));
float32x4_t
v_dst1
=
vmlaq_f32
(
vmlaq_f32
(
vmlaq_f32
(
vmulq_f32
(
v_b4
,
vld1q_f32
(
S4
+
x
)),
v_b5
,
vld1q_f32
(
S5
+
x
)),
v_b6
,
vld1q_f32
(
S6
+
x
)),
v_b7
,
vld1q_f32
(
S7
+
x
));
float32x4_t
v_dst
=
vaddq_f32
(
v_dst0
,
v_dst1
);
v_dst0
=
vmlaq_f32
(
vmlaq_f32
(
vmlaq_f32
(
vmulq_f32
(
v_b0
,
vld1q_f32
(
S0
+
x
+
4
)),
v_b1
,
vld1q_f32
(
S1
+
x
+
4
)),
v_b2
,
vld1q_f32
(
S2
+
x
+
4
)),
v_b3
,
vld1q_f32
(
S3
+
x
+
4
));
v_dst1
=
vmlaq_f32
(
vmlaq_f32
(
vmlaq_f32
(
vmulq_f32
(
v_b4
,
vld1q_f32
(
S4
+
x
+
4
)),
v_b5
,
vld1q_f32
(
S5
+
x
+
4
)),
v_b6
,
vld1q_f32
(
S6
+
x
+
4
)),
v_b7
,
vld1q_f32
(
S7
+
x
+
4
));
v_dst1
=
vaddq_f32
(
v_dst0
,
v_dst1
);
vst1q_s16
(
dst
+
x
,
vcombine_s16
(
vqmovn_s32
(
cv_vrndq_s32_f32
(
v_dst
)),
vqmovn_s32
(
cv_vrndq_s32_f32
(
v_dst1
))));
}
v_float32
b0
=
vx_setall_f32
(
beta
[
0
]),
b1
=
vx_setall_f32
(
beta
[
1
]),
b2
=
vx_setall_f32
(
beta
[
2
]),
b3
=
vx_setall_f32
(
beta
[
3
]),
b4
=
vx_setall_f32
(
beta
[
4
]),
b5
=
vx_setall_f32
(
beta
[
5
]),
b6
=
vx_setall_f32
(
beta
[
6
]),
b7
=
vx_setall_f32
(
beta
[
7
]);
for
(
;
x
<=
width
-
v_int16
::
nlanes
;
x
+=
v_int16
::
nlanes
)
v_store
(
dst
+
x
,
v_pack
(
v_round
(
v_muladd
(
vx_load
(
S0
+
x
),
b0
,
v_muladd
(
vx_load
(
S1
+
x
),
b1
,
v_muladd
(
vx_load
(
S2
+
x
),
b2
,
v_muladd
(
vx_load
(
S3
+
x
),
b3
,
v_muladd
(
vx_load
(
S4
+
x
),
b4
,
v_muladd
(
vx_load
(
S5
+
x
),
b5
,
v_muladd
(
vx_load
(
S6
+
x
),
b6
,
vx_load
(
S7
+
x
)
*
b7
)))))))),
v_round
(
v_muladd
(
vx_load
(
S0
+
x
+
v_float32
::
nlanes
),
b0
,
v_muladd
(
vx_load
(
S1
+
x
+
v_float32
::
nlanes
),
b1
,
v_muladd
(
vx_load
(
S2
+
x
+
v_float32
::
nlanes
),
b2
,
v_muladd
(
vx_load
(
S3
+
x
+
v_float32
::
nlanes
),
b3
,
v_muladd
(
vx_load
(
S4
+
x
+
v_float32
::
nlanes
),
b4
,
v_muladd
(
vx_load
(
S5
+
x
+
v_float32
::
nlanes
),
b5
,
v_muladd
(
vx_load
(
S6
+
x
+
v_float32
::
nlanes
),
b6
,
vx_load
(
S7
+
x
+
v_float32
::
nlanes
)
*
b7
))))))))));
return
x
;
}
...
...
@@ -2036,23 +1509,21 @@ struct VResizeLanczos4Vec_32f
*
S4
=
src
[
4
],
*
S5
=
src
[
5
],
*
S6
=
src
[
6
],
*
S7
=
src
[
7
];
float
*
dst
=
(
float
*
)
_dst
;
int
x
=
0
;
float32x4_t
v_b0
=
vdupq_n_f32
(
beta
[
0
]),
v_b1
=
vdupq_n_f32
(
beta
[
1
]),
v_b2
=
vdupq_n_f32
(
beta
[
2
]),
v_b3
=
vdupq_n_f32
(
beta
[
3
]),
v_b4
=
vdupq_n_f32
(
beta
[
4
]),
v_b5
=
vdupq_n_f32
(
beta
[
5
]),
v_b6
=
vdupq_n_f32
(
beta
[
6
]),
v_b7
=
vdupq_n_f32
(
beta
[
7
]);
for
(
;
x
<=
width
-
4
;
x
+=
4
)
{
float32x4_t
v_dst0
=
vmlaq_f32
(
vmlaq_f32
(
vmlaq_f32
(
vmulq_f32
(
v_b0
,
vld1q_f32
(
S0
+
x
)),
v_b1
,
vld1q_f32
(
S1
+
x
)),
v_b2
,
vld1q_f32
(
S2
+
x
)),
v_b3
,
vld1q_f32
(
S3
+
x
));
float32x4_t
v_dst1
=
vmlaq_f32
(
vmlaq_f32
(
vmlaq_f32
(
vmulq_f32
(
v_b4
,
vld1q_f32
(
S4
+
x
)),
v_b5
,
vld1q_f32
(
S5
+
x
)),
v_b6
,
vld1q_f32
(
S6
+
x
)),
v_b7
,
vld1q_f32
(
S7
+
x
));
vst1q_f32
(
dst
+
x
,
vaddq_f32
(
v_dst0
,
v_dst1
));
}
v_float32
b0
=
vx_setall_f32
(
beta
[
0
]),
b1
=
vx_setall_f32
(
beta
[
1
]),
b2
=
vx_setall_f32
(
beta
[
2
]),
b3
=
vx_setall_f32
(
beta
[
3
]),
b4
=
vx_setall_f32
(
beta
[
4
]),
b5
=
vx_setall_f32
(
beta
[
5
]),
b6
=
vx_setall_f32
(
beta
[
6
]),
b7
=
vx_setall_f32
(
beta
[
7
]);
for
(
;
x
<=
width
-
v_float32
::
nlanes
;
x
+=
v_float32
::
nlanes
)
v_store
(
dst
+
x
,
v_muladd
(
vx_load
(
S0
+
x
),
b0
,
v_muladd
(
vx_load
(
S1
+
x
),
b1
,
v_muladd
(
vx_load
(
S2
+
x
),
b2
,
v_muladd
(
vx_load
(
S3
+
x
),
b3
,
v_muladd
(
vx_load
(
S4
+
x
),
b4
,
v_muladd
(
vx_load
(
S5
+
x
),
b5
,
v_muladd
(
vx_load
(
S6
+
x
),
b6
,
vx_load
(
S7
+
x
)
*
b7
))))))));
return
x
;
}
...
...
@@ -2695,95 +2166,94 @@ private:
    int step;
};

#elif CV_SSE2
#elif CV_SIMD

class ResizeAreaFastVec_SIMD_8u
{
public:
    ResizeAreaFastVec_SIMD_8u(int _cn, int _step) :
        cn(_cn), step(_step)
    {
        use_simd = checkHardwareSupport(CV_CPU_SSE2);
    }
        cn(_cn), step(_step) {}

    int operator() (const uchar* S, uchar* D, int w) const
    {
        if( !use_simd )
            return 0;

        int dx = 0;
        const uchar* S0 = S;
        const uchar* S1 = S0 + step;
        __m128i zero = _mm_setzero_si128();
        __m128i delta2 = _mm_set1_epi16(2);

        if (cn == 1)
        {
            __m128i masklow = _mm_set1_epi16(0x00ff);
            for( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8 )
            v_uint16 masklow = vx_setall_u16(0x00ff);
            for ( ; dx <= w - v_uint16::nlanes; dx += v_uint16::nlanes, S0 += v_uint8::nlanes, S1 += v_uint8::nlanes, D += v_uint16::nlanes)
            {
                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);

                __m128i s0 = _mm_add_epi16(_mm_srli_epi16(r0, 8), _mm_and_si128(r0, masklow));
                __m128i s1 = _mm_add_epi16(_mm_srli_epi16(r1, 8), _mm_and_si128(r1, masklow));
                s0 = _mm_add_epi16(_mm_add_epi16(s0, s1), delta2);
                s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);

                _mm_storel_epi64((__m128i*)D, s0);
                v_uint16 r0 = v_reinterpret_as_u16(vx_load(S0));
                v_uint16 r1 = v_reinterpret_as_u16(vx_load(S1));
                v_rshr_pack_store<2>(D, (r0 >> 8) + (r0 & masklow) + (r1 >> 8) + (r1 & masklow));
            }
        }
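The cn == 1 branch above averages 2x2 blocks of 8-bit pixels: each register of bytes is reinterpreted as 16-bit pairs, the high and low bytes of a pair give the horizontal sum of two neighbours, the two rows are added, and v_rshr_pack_store<2> performs the rounding division by 4 while narrowing back to 8 bits. A self-contained sketch of that reduction on plain arrays (hypothetical halve_rows helper, assuming the same intrin.hpp calls):

```cpp
// Minimal sketch (not from the commit): 2x2 box downsampling of a pair of
// 8-bit rows, mirroring the cn == 1 branch of ResizeAreaFastVec_SIMD_8u.
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>

static void halve_rows(const uchar* S0, const uchar* S1, uchar* D, int w_out)
{
    int dx = 0;
#if CV_SIMD
    cv::v_uint16 masklow = cv::vx_setall_u16(0x00ff);
    for( ; dx <= w_out - cv::v_uint16::nlanes; dx += cv::v_uint16::nlanes,
           S0 += cv::v_uint8::nlanes, S1 += cv::v_uint8::nlanes, D += cv::v_uint16::nlanes )
    {
        // View each register of bytes as 16-bit pairs of horizontal neighbours.
        cv::v_uint16 r0 = cv::v_reinterpret_as_u16(cv::vx_load(S0));
        cv::v_uint16 r1 = cv::v_reinterpret_as_u16(cv::vx_load(S1));
        // high byte + low byte = horizontal sum; adding both rows gives the 2x2
        // sum, and the rounding shift-pack divides by 4 and stores 8-bit results.
        cv::v_rshr_pack_store<2>(D, (r0 >> 8) + (r0 & masklow) + (r1 >> 8) + (r1 & masklow));
    }
#endif
    for( ; dx < w_out; dx++, S0 += 2, S1 += 2, D++ )
        *D = (uchar)((S0[0] + S0[1] + S1[0] + S1[1] + 2) >> 2);
}
```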
else
if
(
cn
==
3
)
for
(
;
dx
<=
w
-
11
;
dx
+=
6
,
S0
+=
12
,
S1
+=
12
,
D
+=
6
)
{
if
(
CV_SIMD_WIDTH
>
64
)
return
0
;
for
(
;
dx
<=
w
-
3
*
v_uint8
::
nlanes
;
dx
+=
3
*
v_uint8
::
nlanes
,
S0
+=
6
*
v_uint8
::
nlanes
,
S1
+=
6
*
v_uint8
::
nlanes
,
D
+=
3
*
v_uint8
::
nlanes
)
{
__m128i
r0
=
_mm_loadu_si128
((
const
__m128i
*
)
S0
);
__m128i
r1
=
_mm_loadu_si128
((
const
__m128i
*
)
S1
);
__m128i
r0_16l
=
_mm_unpacklo_epi8
(
r0
,
zero
);
__m128i
r0_16h
=
_mm_unpacklo_epi8
(
_mm_srli_si128
(
r0
,
6
),
zero
);
__m128i
r1_16l
=
_mm_unpacklo_epi8
(
r1
,
zero
);
__m128i
r1_16h
=
_mm_unpacklo_epi8
(
_mm_srli_si128
(
r1
,
6
),
zero
);
__m128i
s0
=
_mm_add_epi16
(
r0_16l
,
_mm_srli_si128
(
r0_16l
,
6
));
__m128i
s1
=
_mm_add_epi16
(
r1_16l
,
_mm_srli_si128
(
r1_16l
,
6
));
s0
=
_mm_add_epi16
(
s1
,
_mm_add_epi16
(
s0
,
delta2
));
s0
=
_mm_packus_epi16
(
_mm_srli_epi16
(
s0
,
2
),
zero
);
_mm_storel_epi64
((
__m128i
*
)
D
,
s0
);
s0
=
_mm_add_epi16
(
r0_16h
,
_mm_srli_si128
(
r0_16h
,
6
));
s1
=
_mm_add_epi16
(
r1_16h
,
_mm_srli_si128
(
r1_16h
,
6
));
s0
=
_mm_add_epi16
(
s1
,
_mm_add_epi16
(
s0
,
delta2
));
s0
=
_mm_packus_epi16
(
_mm_srli_epi16
(
s0
,
2
),
zero
);
_mm_storel_epi64
((
__m128i
*
)(
D
+
3
),
s0
);
v_uint16
t0
,
t1
,
t2
,
t3
,
t4
,
t5
;
v_uint16
s0
,
s1
,
s2
,
s3
,
s4
,
s5
;
s0
=
vx_load_expand
(
S0
)
+
vx_load_expand
(
S1
);
s1
=
vx_load_expand
(
S0
+
v_uint16
::
nlanes
)
+
vx_load_expand
(
S1
+
v_uint16
::
nlanes
);
s2
=
vx_load_expand
(
S0
+
2
*
v_uint16
::
nlanes
)
+
vx_load_expand
(
S1
+
2
*
v_uint16
::
nlanes
);
s3
=
vx_load_expand
(
S0
+
3
*
v_uint16
::
nlanes
)
+
vx_load_expand
(
S1
+
3
*
v_uint16
::
nlanes
);
s4
=
vx_load_expand
(
S0
+
4
*
v_uint16
::
nlanes
)
+
vx_load_expand
(
S1
+
4
*
v_uint16
::
nlanes
);
s5
=
vx_load_expand
(
S0
+
5
*
v_uint16
::
nlanes
)
+
vx_load_expand
(
S1
+
5
*
v_uint16
::
nlanes
);
v_zip
(
s0
,
s3
,
t0
,
t1
);
v_zip
(
s1
,
s4
,
t2
,
t3
);
v_zip
(
s2
,
s5
,
t4
,
t5
);
v_zip
(
t0
,
t3
,
s0
,
s1
);
v_zip
(
t1
,
t4
,
s2
,
s3
);
v_zip
(
t2
,
t5
,
s4
,
s5
);
v_zip
(
s0
,
s3
,
t0
,
t1
);
v_zip
(
s1
,
s4
,
t2
,
t3
);
v_zip
(
s2
,
s5
,
t4
,
t5
);
v_uint16
bl
,
gl
,
rl
;
#if CV_SIMD_WIDTH == 16
bl
=
t0
+
t3
;
gl
=
t1
+
t4
;
rl
=
t2
+
t5
;
#elif CV_SIMD_WIDTH == 32
v_zip
(
t0
,
t3
,
s0
,
s1
);
v_zip
(
t1
,
t4
,
s2
,
s3
);
v_zip
(
t2
,
t5
,
s4
,
s5
);
bl
=
s0
+
s3
;
gl
=
s1
+
s4
;
rl
=
s2
+
s5
;
#elif CV_SIMD_WIDTH == 64
v_zip
(
s0
,
s3
,
t0
,
t1
);
v_zip
(
s1
,
s4
,
t2
,
t3
);
v_zip
(
s2
,
s5
,
t4
,
t5
);
bl
=
t0
+
t3
;
gl
=
t1
+
t4
;
rl
=
t2
+
t5
;
#endif
s0
=
vx_load_expand
(
S0
+
6
*
v_uint16
::
nlanes
)
+
vx_load_expand
(
S1
+
6
*
v_uint16
::
nlanes
);
s1
=
vx_load_expand
(
S0
+
7
*
v_uint16
::
nlanes
)
+
vx_load_expand
(
S1
+
7
*
v_uint16
::
nlanes
);
s2
=
vx_load_expand
(
S0
+
8
*
v_uint16
::
nlanes
)
+
vx_load_expand
(
S1
+
8
*
v_uint16
::
nlanes
);
s3
=
vx_load_expand
(
S0
+
9
*
v_uint16
::
nlanes
)
+
vx_load_expand
(
S1
+
9
*
v_uint16
::
nlanes
);
s4
=
vx_load_expand
(
S0
+
10
*
v_uint16
::
nlanes
)
+
vx_load_expand
(
S1
+
10
*
v_uint16
::
nlanes
);
s5
=
vx_load_expand
(
S0
+
11
*
v_uint16
::
nlanes
)
+
vx_load_expand
(
S1
+
11
*
v_uint16
::
nlanes
);
v_zip
(
s0
,
s3
,
t0
,
t1
);
v_zip
(
s1
,
s4
,
t2
,
t3
);
v_zip
(
s2
,
s5
,
t4
,
t5
);
v_zip
(
t0
,
t3
,
s0
,
s1
);
v_zip
(
t1
,
t4
,
s2
,
s3
);
v_zip
(
t2
,
t5
,
s4
,
s5
);
v_zip
(
s0
,
s3
,
t0
,
t1
);
v_zip
(
s1
,
s4
,
t2
,
t3
);
v_zip
(
s2
,
s5
,
t4
,
t5
);
v_uint16
bh
,
gh
,
rh
;
#if CV_SIMD_WIDTH == 16
bh
=
t0
+
t3
;
gh
=
t1
+
t4
;
rh
=
t2
+
t5
;
#elif CV_SIMD_WIDTH == 32
v_zip
(
t0
,
t3
,
s0
,
s1
);
v_zip
(
t1
,
t4
,
s2
,
s3
);
v_zip
(
t2
,
t5
,
s4
,
s5
);
bh
=
s0
+
s3
;
gh
=
s1
+
s4
;
rh
=
s2
+
s5
;
#elif CV_SIMD_WIDTH == 64
v_zip
(
s0
,
s3
,
t0
,
t1
);
v_zip
(
s1
,
s4
,
t2
,
t3
);
v_zip
(
s2
,
s5
,
t4
,
t5
);
bh
=
t0
+
t3
;
gh
=
t1
+
t4
;
rh
=
t2
+
t5
;
#endif
v_store_interleave
(
D
,
v_rshr_pack
<
2
>
(
bl
,
bh
),
v_rshr_pack
<
2
>
(
gl
,
gh
),
v_rshr_pack
<
2
>
(
rl
,
rh
));
}
}
else
{
CV_Assert
(
cn
==
4
);
int
v
[]
=
{
0
,
0
,
-
1
,
-
1
};
__m128i
mask
=
_mm_loadu_si128
((
const
__m128i
*
)
v
);
for
(
;
dx
<=
w
-
8
;
dx
+=
8
,
S0
+=
16
,
S1
+=
16
,
D
+=
8
)
for
(
;
dx
<=
w
-
v_uint8
::
nlanes
;
dx
+=
v_uint8
::
nlanes
,
S0
+=
2
*
v_uint8
::
nlanes
,
S1
+=
2
*
v_uint8
::
nlanes
,
D
+=
v_uint8
::
nlanes
)
{
__m128i
r0
=
_mm_loadu_si128
((
const
__m128i
*
)
S0
);
__m128i
r1
=
_mm_loadu_si128
((
const
__m128i
*
)
S1
);
__m128i
r0_16l
=
_mm_unpacklo_epi8
(
r0
,
zero
);
__m128i
r0_16h
=
_mm_unpackhi_epi8
(
r0
,
zero
);
__m128i
r1_16l
=
_mm_unpacklo_epi8
(
r1
,
zero
);
__m128i
r1_16h
=
_mm_unpackhi_epi8
(
r1
,
zero
);
__m128i
s0
=
_mm_add_epi16
(
r0_16l
,
_mm_srli_si128
(
r0_16l
,
8
));
__m128i
s1
=
_mm_add_epi16
(
r1_16l
,
_mm_srli_si128
(
r1_16l
,
8
));
s0
=
_mm_add_epi16
(
s1
,
_mm_add_epi16
(
s0
,
delta2
));
__m128i
res0
=
_mm_srli_epi16
(
s0
,
2
);
s0
=
_mm_add_epi16
(
r0_16h
,
_mm_srli_si128
(
r0_16h
,
8
));
s1
=
_mm_add_epi16
(
r1_16h
,
_mm_srli_si128
(
r1_16h
,
8
));
s0
=
_mm_add_epi16
(
s1
,
_mm_add_epi16
(
s0
,
delta2
));
__m128i
res1
=
_mm_srli_epi16
(
s0
,
2
);
s0
=
_mm_packus_epi16
(
_mm_or_si128
(
_mm_andnot_si128
(
mask
,
res0
),
_mm_and_si128
(
mask
,
_mm_slli_si128
(
res1
,
8
))),
zero
);
_mm_storel_epi64
((
__m128i
*
)(
D
),
s0
);
v_uint32
r00
,
r01
,
r10
,
r11
;
v_load_deinterleave
((
uint32_t
*
)
S0
,
r00
,
r01
);
v_load_deinterleave
((
uint32_t
*
)
S1
,
r10
,
r11
);
v_uint16
r00l
,
r01l
,
r10l
,
r11l
,
r00h
,
r01h
,
r10h
,
r11h
;
v_expand
(
v_reinterpret_as_u8
(
r00
),
r00l
,
r00h
);
v_expand
(
v_reinterpret_as_u8
(
r01
),
r01l
,
r01h
);
v_expand
(
v_reinterpret_as_u8
(
r10
),
r10l
,
r10h
);
v_expand
(
v_reinterpret_as_u8
(
r11
),
r11l
,
r11h
);
v_store
(
D
,
v_rshr_pack
<
2
>
(
r00l
+
r01l
+
r10l
+
r11l
,
r00h
+
r01h
+
r10h
+
r11h
));
}
}
...
...
@@ -2792,7 +2262,6 @@ public:
private:
    int cn;
    bool use_simd;
    int step;
};
...
...
@@ -2800,164 +2269,258 @@ class ResizeAreaFastVec_SIMD_16u
{
public
:
ResizeAreaFastVec_SIMD_16u
(
int
_cn
,
int
_step
)
:
cn
(
_cn
),
step
(
_step
)
{
use_simd
=
checkHardwareSupport
(
CV_CPU_SSE2
);
}
cn
(
_cn
),
step
(
_step
)
{}
int
operator
()
(
const
ushort
*
S
,
ushort
*
D
,
int
w
)
const
{
if
(
!
use_simd
)
return
0
;
int
dx
=
0
;
const
ushort
*
S0
=
(
const
ushort
*
)
S
;
const
ushort
*
S1
=
(
const
ushort
*
)((
const
uchar
*
)(
S
)
+
step
);
__m128i
masklow
=
_mm_set1_epi32
(
0x0000ffff
);
__m128i
zero
=
_mm_setzero_si128
();
__m128i
delta2
=
_mm_set1_epi32
(
2
);
#define _mm_packus_epi32(a, zero) _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(a, 16), 16), zero)
if
(
cn
==
1
)
{
for
(
;
dx
<=
w
-
4
;
dx
+=
4
,
S0
+=
8
,
S1
+=
8
,
D
+=
4
)
v_uint32
masklow
=
vx_setall_u32
(
0x0000ffff
);
for
(;
dx
<=
w
-
v_uint32
::
nlanes
;
dx
+=
v_uint32
::
nlanes
,
S0
+=
v_uint16
::
nlanes
,
S1
+=
v_uint16
::
nlanes
,
D
+=
v_uint32
::
nlanes
)
{
__m128i
r0
=
_mm_loadu_si128
((
const
__m128i
*
)
S0
);
__m128i
r1
=
_mm_loadu_si128
((
const
__m128i
*
)
S1
);
__m128i
s0
=
_mm_add_epi32
(
_mm_srli_epi32
(
r0
,
16
),
_mm_and_si128
(
r0
,
masklow
));
__m128i
s1
=
_mm_add_epi32
(
_mm_srli_epi32
(
r1
,
16
),
_mm_and_si128
(
r1
,
masklow
));
s0
=
_mm_add_epi32
(
_mm_add_epi32
(
s0
,
s1
),
delta2
);
s0
=
_mm_srli_epi32
(
s0
,
2
);
s0
=
_mm_packus_epi32
(
s0
,
zero
);
_mm_storel_epi64
((
__m128i
*
)
D
,
s0
);
v_uint32
r0
=
v_reinterpret_as_u32
(
vx_load
(
S0
));
v_uint32
r1
=
v_reinterpret_as_u32
(
vx_load
(
S1
));
v_rshr_pack_store
<
2
>
(
D
,
(
r0
>>
16
)
+
(
r0
&
masklow
)
+
(
r1
>>
16
)
+
(
r1
&
masklow
));
}
}
else
if
(
cn
==
3
)
{
#if CV_SIMD_WIDTH == 16
for
(
;
dx
<=
w
-
4
;
dx
+=
3
,
S0
+=
6
,
S1
+=
6
,
D
+=
3
)
#if CV_SSE4_1
{
__m128i
r0
=
_mm_loadu_si128
((
const
__m128i
*
)
S0
);
__m128i
r1
=
_mm_loadu_si128
((
const
__m128i
*
)
S1
);
__m128i
r0_16l
=
_mm_unpacklo_epi16
(
r0
,
zero
);
__m128i
r0_16h
=
_mm_unpacklo_epi16
(
_mm_srli_si128
(
r0
,
6
),
zero
);
__m128i
r1_16l
=
_mm_unpacklo_epi16
(
r1
,
zero
);
__m128i
r1_16h
=
_mm_unpacklo_epi16
(
_mm_srli_si128
(
r1
,
6
),
zero
);
__m128i
s0
=
_mm_add_epi32
(
r0_16l
,
r0_16h
);
__m128i
s1
=
_mm_add_epi32
(
r1_16l
,
r1_16h
);
s0
=
_mm_add_epi32
(
delta2
,
_mm_add_epi32
(
s0
,
s1
));
s0
=
_mm_packus_epi32
(
_mm_srli_epi32
(
s0
,
2
),
zero
);
_mm_storel_epi64
((
__m128i
*
)
D
,
s0
);
v_uint32
r0
,
r1
,
r2
,
r3
;
v_expand
(
vx_load
(
S0
),
r0
,
r1
);
v_expand
(
vx_load
(
S1
),
r2
,
r3
);
r0
+=
r2
;
r1
+=
r3
;
v_rshr_pack_store
<
2
>
(
D
,
r0
+
v_rotate_left
<
1
>
(
r1
,
r0
));
}
#else
v_rshr_pack_store
<
2
>
(
D
,
v_load_expand
(
S0
)
+
v_load_expand
(
S0
+
3
)
+
v_load_expand
(
S1
)
+
v_load_expand
(
S1
+
3
));
#endif
#elif CV_SIMD_WIDTH == 32 || CV_SIMD_WIDTH == 64
for
(
;
dx
<=
w
-
3
*
v_uint16
::
nlanes
;
dx
+=
3
*
v_uint16
::
nlanes
,
S0
+=
6
*
v_uint16
::
nlanes
,
S1
+=
6
*
v_uint16
::
nlanes
,
D
+=
3
*
v_uint16
::
nlanes
)
{
v_uint32
t0
,
t1
,
t2
,
t3
,
t4
,
t5
;
v_uint32
s0
,
s1
,
s2
,
s3
,
s4
,
s5
;
s0
=
vx_load_expand
(
S0
)
+
vx_load_expand
(
S1
);
s1
=
vx_load_expand
(
S0
+
v_uint32
::
nlanes
)
+
vx_load_expand
(
S1
+
v_uint32
::
nlanes
);
s2
=
vx_load_expand
(
S0
+
2
*
v_uint32
::
nlanes
)
+
vx_load_expand
(
S1
+
2
*
v_uint32
::
nlanes
);
s3
=
vx_load_expand
(
S0
+
3
*
v_uint32
::
nlanes
)
+
vx_load_expand
(
S1
+
3
*
v_uint32
::
nlanes
);
s4
=
vx_load_expand
(
S0
+
4
*
v_uint32
::
nlanes
)
+
vx_load_expand
(
S1
+
4
*
v_uint32
::
nlanes
);
s5
=
vx_load_expand
(
S0
+
5
*
v_uint32
::
nlanes
)
+
vx_load_expand
(
S1
+
5
*
v_uint32
::
nlanes
);
v_zip
(
s0
,
s3
,
t0
,
t1
);
v_zip
(
s1
,
s4
,
t2
,
t3
);
v_zip
(
s2
,
s5
,
t4
,
t5
);
v_zip
(
t0
,
t3
,
s0
,
s1
);
v_zip
(
t1
,
t4
,
s2
,
s3
);
v_zip
(
t2
,
t5
,
s4
,
s5
);
v_uint32
bl
,
gl
,
rl
;
v_zip
(
s0
,
s3
,
t0
,
t1
);
v_zip
(
s1
,
s4
,
t2
,
t3
);
v_zip
(
s2
,
s5
,
t4
,
t5
);
#if CV_SIMD_WIDTH == 32
bl
=
t0
+
t3
;
gl
=
t1
+
t4
;
rl
=
t2
+
t5
;
#else //CV_SIMD_WIDTH == 64
v_zip
(
t0
,
t3
,
s0
,
s1
);
v_zip
(
t1
,
t4
,
s2
,
s3
);
v_zip
(
t2
,
t5
,
s4
,
s5
);
bl
=
s0
+
s3
;
gl
=
s1
+
s4
;
rl
=
s2
+
s5
;
#endif
s0
=
vx_load_expand
(
S0
+
6
*
v_uint32
::
nlanes
)
+
vx_load_expand
(
S1
+
6
*
v_uint32
::
nlanes
);
s1
=
vx_load_expand
(
S0
+
7
*
v_uint32
::
nlanes
)
+
vx_load_expand
(
S1
+
7
*
v_uint32
::
nlanes
);
s2
=
vx_load_expand
(
S0
+
8
*
v_uint32
::
nlanes
)
+
vx_load_expand
(
S1
+
8
*
v_uint32
::
nlanes
);
s3
=
vx_load_expand
(
S0
+
9
*
v_uint32
::
nlanes
)
+
vx_load_expand
(
S1
+
9
*
v_uint32
::
nlanes
);
s4
=
vx_load_expand
(
S0
+
10
*
v_uint32
::
nlanes
)
+
vx_load_expand
(
S1
+
10
*
v_uint32
::
nlanes
);
s5
=
vx_load_expand
(
S0
+
11
*
v_uint32
::
nlanes
)
+
vx_load_expand
(
S1
+
11
*
v_uint32
::
nlanes
);
v_zip
(
s0
,
s3
,
t0
,
t1
);
v_zip
(
s1
,
s4
,
t2
,
t3
);
v_zip
(
s2
,
s5
,
t4
,
t5
);
v_zip
(
t0
,
t3
,
s0
,
s1
);
v_zip
(
t1
,
t4
,
s2
,
s3
);
v_zip
(
t2
,
t5
,
s4
,
s5
);
v_uint32
bh
,
gh
,
rh
;
v_zip
(
s0
,
s3
,
t0
,
t1
);
v_zip
(
s1
,
s4
,
t2
,
t3
);
v_zip
(
s2
,
s5
,
t4
,
t5
);
#if CV_SIMD_WIDTH == 32
bh
=
t0
+
t3
;
gh
=
t1
+
t4
;
rh
=
t2
+
t5
;
#else //CV_SIMD_WIDTH == 64
v_zip
(
t0
,
t3
,
s0
,
s1
);
v_zip
(
t1
,
t4
,
s2
,
s3
);
v_zip
(
t2
,
t5
,
s4
,
s5
);
bh
=
s0
+
s3
;
gh
=
s1
+
s4
;
rh
=
s2
+
s5
;
#endif
v_store_interleave
(
D
,
v_rshr_pack
<
2
>
(
bl
,
bh
),
v_rshr_pack
<
2
>
(
gl
,
gh
),
v_rshr_pack
<
2
>
(
rl
,
rh
));
}
#elif CV_SIMD_WIDTH >= 64
v_uint32
masklow
=
vx_setall_u32
(
0x0000ffff
);
for
(
;
dx
<=
w
-
3
*
v_uint16
::
nlanes
;
dx
+=
3
*
v_uint16
::
nlanes
,
S0
+=
6
*
v_uint16
::
nlanes
,
S1
+=
6
*
v_uint16
::
nlanes
,
D
+=
3
*
v_uint16
::
nlanes
)
{
v_uint16
b0
,
g0
,
r0
,
b1
,
g1
,
r1
;
v_load_deinterleave
(
S0
,
b0
,
g0
,
r0
);
v_load_deinterleave
(
S1
,
b1
,
g1
,
r1
);
v_uint32
bl
=
(
v_reinterpret_as_u32
(
b0
)
>>
16
)
+
(
v_reinterpret_as_u32
(
b0
)
&
masklow
)
+
(
v_reinterpret_as_u32
(
b1
)
>>
16
)
+
(
v_reinterpret_as_u32
(
b1
)
&
masklow
);
v_uint32
gl
=
(
v_reinterpret_as_u32
(
g0
)
>>
16
)
+
(
v_reinterpret_as_u32
(
g0
)
&
masklow
)
+
(
v_reinterpret_as_u32
(
g1
)
>>
16
)
+
(
v_reinterpret_as_u32
(
g1
)
&
masklow
);
v_uint32
rl
=
(
v_reinterpret_as_u32
(
r0
)
>>
16
)
+
(
v_reinterpret_as_u32
(
r0
)
&
masklow
)
+
(
v_reinterpret_as_u32
(
r1
)
>>
16
)
+
(
v_reinterpret_as_u32
(
r1
)
&
masklow
);
v_load_deinterleave
(
S0
+
3
*
v_uint16
::
nlanes
,
b0
,
g0
,
r0
);
v_load_deinterleave
(
S1
+
3
*
v_uint16
::
nlanes
,
b1
,
g1
,
r1
);
v_uint32
bh
=
(
v_reinterpret_as_u32
(
b0
)
>>
16
)
+
(
v_reinterpret_as_u32
(
b0
)
&
masklow
)
+
(
v_reinterpret_as_u32
(
b1
)
>>
16
)
+
(
v_reinterpret_as_u32
(
b1
)
&
masklow
);
v_uint32
gh
=
(
v_reinterpret_as_u32
(
g0
)
>>
16
)
+
(
v_reinterpret_as_u32
(
g0
)
&
masklow
)
+
(
v_reinterpret_as_u32
(
g1
)
>>
16
)
+
(
v_reinterpret_as_u32
(
g1
)
&
masklow
);
v_uint32
rh
=
(
v_reinterpret_as_u32
(
r0
)
>>
16
)
+
(
v_reinterpret_as_u32
(
r0
)
&
masklow
)
+
(
v_reinterpret_as_u32
(
r1
)
>>
16
)
+
(
v_reinterpret_as_u32
(
r1
)
&
masklow
);
v_store_interleave
(
D
,
v_rshr_pack
<
2
>
(
bl
,
bh
),
v_rshr_pack
<
2
>
(
gl
,
gh
),
v_rshr_pack
<
2
>
(
rl
,
rh
));
}
#endif
}
    else
    {
        CV_Assert(cn == 4);
        for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
#if CV_SIMD_WIDTH >= 64
        for ( ; dx <= w - v_uint16::nlanes; dx += v_uint16::nlanes, S0 += 2*v_uint16::nlanes, S1 += 2*v_uint16::nlanes, D += v_uint16::nlanes)
        {
            __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
            __m128i r1 = _mm_loadu_si128((const __m128i*)S1);

            __m128i r0_32l = _mm_unpacklo_epi16(r0, zero);
            __m128i r0_32h = _mm_unpackhi_epi16(r0, zero);
            __m128i r1_32l = _mm_unpacklo_epi16(r1, zero);
            __m128i r1_32h = _mm_unpackhi_epi16(r1, zero);

            __m128i s0 = _mm_add_epi32(r0_32l, r0_32h);
            __m128i s1 = _mm_add_epi32(r1_32l, r1_32h);
            s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2));
            s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero);

            _mm_storel_epi64((__m128i*)D, s0);
            v_uint64 r00, r01, r10, r11;
            v_load_deinterleave((uint64_t*)S0, r00, r01);
            v_load_deinterleave((uint64_t*)S1, r10, r11);

            v_uint32 r00l, r01l, r10l, r11l, r00h, r01h, r10h, r11h;
            v_expand(v_reinterpret_as_u16(r00), r00l, r00h);
            v_expand(v_reinterpret_as_u16(r01), r01l, r01h);
            v_expand(v_reinterpret_as_u16(r10), r10l, r10h);
            v_expand(v_reinterpret_as_u16(r11), r11l, r11h);
            v_store(D, v_rshr_pack<2>(r00l + r01l + r10l + r11l, r00h + r01h + r10h + r11h));
        }
#else
        for ( ; dx <= w - v_uint32::nlanes; dx += v_uint32::nlanes, S0 += v_uint16::nlanes, S1 += v_uint16::nlanes, D += v_uint32::nlanes)
        {
            v_uint32 r0, r1, r2, r3;
            v_expand(vx_load(S0), r0, r1);
            v_expand(vx_load(S1), r2, r3);
            r0 += r2; r1 += r3;
            v_uint32 v_d;
#if CV_SIMD_WIDTH == 16
            v_d = r0 + r1;
#elif CV_SIMD_WIDTH == 32
            v_uint32 t0, t1;
            v_recombine(r0, r1, t0, t1);
            v_d = t0 + t1;
#endif
            v_rshr_pack_store<2>(D, v_d);
        }
#endif
    }
#undef _mm_packus_epi32

    return dx;
}

private:
    int cn;
    int step;
    bool use_simd;
};
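Both the SSE2 code being removed above and the wide universal intrinsic code that replaces it compute the same rounded 2x2 box average: each destination pixel is (sum of the four source pixels + 2) >> 2, which is what v_rshr_pack<2> and v_rshr_pack_store<2> express in vector form. A minimal scalar sketch of that computation for a single-channel pair of source rows (the standalone helper below is illustrative only and assumes OpenCV's ushort typedef; it is not part of this patch):

// Illustrative scalar reference for the cn == 1 area-fast 2x downscale.
// S0 and S1 point to two consecutive source rows; D receives w output pixels.
static void areaFast2x2Row_scalar(const ushort* S0, const ushort* S1, ushort* D, int w)
{
    for (int dx = 0; dx < w; dx++, S0 += 2, S1 += 2)
        D[dx] = (ushort)((S0[0] + S0[1] + S1[0] + S1[1] + 2) >> 2); // rounded average of a 2x2 block
}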
class ResizeAreaFastVec_SIMD_16s
{
public:
    ResizeAreaFastVec_SIMD_16s(int _cn, int _step) :
        cn(_cn), step(_step)
    {
        use_simd = checkHardwareSupport(CV_CPU_SSE2);
    }
        cn(_cn), step(_step)
    {}

    int operator() (const short* S, short* D, int w) const
    {
        if ( !use_simd )
            return 0;

        int dx = 0;
        const short* S0 = (const short*)S;
        const short* S1 = (const short*)((const uchar*)(S) + step);
        __m128i masklow = _mm_set1_epi32(0x0000ffff);
        __m128i zero = _mm_setzero_si128();
        __m128i delta2 = _mm_set1_epi32(2);

        if (cn == 1)
        {
            for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
            v_int32 masklow = vx_setall_s32(0x0000ffff);
            for (; dx <= w - v_int32::nlanes; dx += v_int32::nlanes, S0 += v_int16::nlanes, S1 += v_int16::nlanes, D += v_int32::nlanes)
            {
                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);

                __m128i s0 = _mm_add_epi32(_mm_srai_epi32(r0, 16),
                    _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r0, masklow), 16), 16));
                __m128i s1 = _mm_add_epi32(_mm_srai_epi32(r1, 16),
                    _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r1, masklow), 16), 16));
                s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2);
                s0 = _mm_srai_epi32(s0, 2);
                s0 = _mm_packs_epi32(s0, zero);

                _mm_storel_epi64((__m128i*)D, s0);
                v_int32 r0 = v_reinterpret_as_s32(vx_load(S0));
                v_int32 r1 = v_reinterpret_as_s32(vx_load(S1));
                v_rshr_pack_store<2>(D, (r0 >> 16) + (((r0 & masklow) << 16) >> 16) + (r1 >> 16) + (((r1 & masklow) << 16) >> 16));
            }
        }
        else if (cn == 3)
        {
#if CV_SIMD_WIDTH == 16
            for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3)
                v_rshr_pack_store<2>(D, v_load_expand(S0) + v_load_expand(S0 + 3) + v_load_expand(S1) + v_load_expand(S1 + 3));
#elif CV_SIMD_WIDTH == 32 || CV_SIMD_WIDTH == 64
            for ( ; dx <= w - 3*v_int16::nlanes; dx += 3*v_int16::nlanes, S0 += 6*v_int16::nlanes, S1 += 6*v_int16::nlanes, D += 3*v_int16::nlanes)
            {
                v_int32 t0, t1, t2, t3, t4, t5;
                v_int32 s0, s1, s2, s3, s4, s5;
                s0 = vx_load_expand(S0) + vx_load_expand(S1);
                s1 = vx_load_expand(S0 + v_int32::nlanes) + vx_load_expand(S1 + v_int32::nlanes);
                s2 = vx_load_expand(S0 + 2*v_int32::nlanes) + vx_load_expand(S1 + 2*v_int32::nlanes);
                s3 = vx_load_expand(S0 + 3*v_int32::nlanes) + vx_load_expand(S1 + 3*v_int32::nlanes);
                s4 = vx_load_expand(S0 + 4*v_int32::nlanes) + vx_load_expand(S1 + 4*v_int32::nlanes);
                s5 = vx_load_expand(S0 + 5*v_int32::nlanes) + vx_load_expand(S1 + 5*v_int32::nlanes);
                v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
                v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
                v_int32 bl, gl, rl;
                v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
#if CV_SIMD_WIDTH == 32
                bl = t0 + t3; gl = t1 + t4; rl = t2 + t5;
#else //CV_SIMD_WIDTH == 64
                v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
                bl = s0 + s3; gl = s1 + s4; rl = s2 + s5;
#endif
                s0 = vx_load_expand(S0 + 6*v_int32::nlanes) + vx_load_expand(S1 + 6*v_int32::nlanes);
                s1 = vx_load_expand(S0 + 7*v_int32::nlanes) + vx_load_expand(S1 + 7*v_int32::nlanes);
                s2 = vx_load_expand(S0 + 8*v_int32::nlanes) + vx_load_expand(S1 + 8*v_int32::nlanes);
                s3 = vx_load_expand(S0 + 9*v_int32::nlanes) + vx_load_expand(S1 + 9*v_int32::nlanes);
                s4 = vx_load_expand(S0 + 10*v_int32::nlanes) + vx_load_expand(S1 + 10*v_int32::nlanes);
                s5 = vx_load_expand(S0 + 11*v_int32::nlanes) + vx_load_expand(S1 + 11*v_int32::nlanes);
                v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
                v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
                v_int32 bh, gh, rh;
                v_zip(s0, s3, t0, t1); v_zip(s1, s4, t2, t3); v_zip(s2, s5, t4, t5);
#if CV_SIMD_WIDTH == 32
                bh = t0 + t3; gh = t1 + t4; rh = t2 + t5;
#else //CV_SIMD_WIDTH == 64
                v_zip(t0, t3, s0, s1); v_zip(t1, t4, s2, s3); v_zip(t2, t5, s4, s5);
                bh = s0 + s3; gh = s1 + s4; rh = s2 + s5;
#endif
                v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh));
            }
#elif CV_SIMD_WIDTH >= 64
            for ( ; dx <= w - 3*v_int16::nlanes; dx += 3*v_int16::nlanes, S0 += 6*v_int16::nlanes, S1 += 6*v_int16::nlanes, D += 3*v_int16::nlanes)
            {
                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);

                __m128i r0_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16);
                __m128i r0_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r0, 6)), 16);
                __m128i r1_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16);
                __m128i r1_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r1, 6)), 16);

                __m128i s0 = _mm_add_epi32(r0_16l, r0_16h);
                __m128i s1 = _mm_add_epi32(r1_16l, r1_16h);
                s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1));
                s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero);

                _mm_storel_epi64((__m128i*)D, s0);
                v_int16 b0, g0, r0, b1, g1, r1;
                v_load_deinterleave(S0, b0, g0, r0);
                v_load_deinterleave(S1, b1, g1, r1);
                v_int32 bl = (v_reinterpret_as_s32(b0) >> 16) + ((v_reinterpret_as_s32(b0) << 16) >> 16) + (v_reinterpret_as_s32(b1) >> 16) + ((v_reinterpret_as_s32(b1) << 16) >> 16);
                v_int32 gl = (v_reinterpret_as_s32(g0) >> 16) + ((v_reinterpret_as_s32(g0) << 16) >> 16) + (v_reinterpret_as_s32(g1) >> 16) + ((v_reinterpret_as_s32(g1) << 16) >> 16);
                v_int32 rl = (v_reinterpret_as_s32(r0) >> 16) + ((v_reinterpret_as_s32(r0) << 16) >> 16) + (v_reinterpret_as_s32(r1) >> 16) + ((v_reinterpret_as_s32(r1) << 16) >> 16);
                v_load_deinterleave(S0 + 3*v_int16::nlanes, b0, g0, r0);
                v_load_deinterleave(S1 + 3*v_int16::nlanes, b1, g1, r1);
                v_int32 bh = (v_reinterpret_as_s32(b0) >> 16) + ((v_reinterpret_as_s32(b0) << 16) >> 16) + (v_reinterpret_as_s32(b1) >> 16) + ((v_reinterpret_as_s32(b1) << 16) >> 16);
                v_int32 gh = (v_reinterpret_as_s32(g0) >> 16) + ((v_reinterpret_as_s32(g0) << 16) >> 16) + (v_reinterpret_as_s32(g1) >> 16) + ((v_reinterpret_as_s32(g1) << 16) >> 16);
                v_int32 rh = (v_reinterpret_as_s32(r0) >> 16) + ((v_reinterpret_as_s32(r0) << 16) >> 16) + (v_reinterpret_as_s32(r1) >> 16) + ((v_reinterpret_as_s32(r1) << 16) >> 16);
                v_store_interleave(D, v_rshr_pack<2>(bl, bh), v_rshr_pack<2>(gl, gh), v_rshr_pack<2>(rl, rh));
            }
#endif
        }
        else
        {
            CV_Assert(cn == 4);
            for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
            for ( ; dx <= w - v_int16::nlanes; dx += v_int16::nlanes, S0 += 2*v_int16::nlanes, S1 += 2*v_int16::nlanes, D += v_int16::nlanes)
            {
                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);

                __m128i r0_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16);
                __m128i r0_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r0), 16);
                __m128i r1_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16);
                __m128i r1_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r1), 16);

                __m128i s0 = _mm_add_epi32(r0_32l, r0_32h);
                __m128i s1 = _mm_add_epi32(r1_32l, r1_32h);
                s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2));
                s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero);

                _mm_storel_epi64((__m128i*)D, s0);
#if CV_SIMD_WIDTH >= 64
                v_int64 r00, r01, r10, r11;
                v_load_deinterleave((int64_t*)S0, r00, r01);
                v_load_deinterleave((int64_t*)S1, r10, r11);

                v_int32 r00l, r01l, r10l, r11l, r00h, r01h, r10h, r11h;
                v_expand(v_reinterpret_as_s16(r00), r00l, r00h);
                v_expand(v_reinterpret_as_s16(r01), r01l, r01h);
                v_expand(v_reinterpret_as_s16(r10), r10l, r10h);
                v_expand(v_reinterpret_as_s16(r11), r11l, r11h);
                v_store(D, v_rshr_pack<2>(r00l + r01l + r10l + r11l, r00h + r01h + r10h + r11h));
#else
                v_int32 r0, r1, r2, r3;
                r0 = vx_load_expand(S0) + vx_load_expand(S1);
                r1 = vx_load_expand(S0 + v_int32::nlanes) + vx_load_expand(S1 + v_int32::nlanes);
                r2 = vx_load_expand(S0 + 2*v_int32::nlanes) + vx_load_expand(S1 + 2*v_int32::nlanes);
                r3 = vx_load_expand(S0 + 3*v_int32::nlanes) + vx_load_expand(S1 + 3*v_int32::nlanes);
                v_int32 dl, dh;
#if CV_SIMD_WIDTH == 16
                dl = r0 + r1; dh = r2 + r3;
#elif CV_SIMD_WIDTH == 32
                v_int32 t0, t1, t2, t3;
                v_recombine(r0, r1, t0, t1);
                v_recombine(r2, r3, t2, t3);
                dl = t0 + t1; dh = t2 + t3;
#endif
                v_store(D, v_rshr_pack<2>(dl, dh));
#endif
            }
        }
...
...
@@ -2967,7 +2530,6 @@ public:
private:
    int cn;
    int step;
    bool use_simd;
};
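These ResizeAreaFastVec_SIMD_* functors are internal helpers of imgproc; they are reached through the public resize API when an image is downscaled by exactly 2x in both directions with area interpolation. A caller-side usage sketch (the wrapper function below is illustrative; cv::resize and cv::INTER_AREA are the real public entry points):

#include <opencv2/imgproc.hpp>

// Halve an image with area interpolation; for an exact 2x downscale
// cv::resize can dispatch to the vectorized "area fast" kernels in resize.cpp.
void halveWithAreaFast(const cv::Mat& src, cv::Mat& dst)
{
    cv::resize(src, dst, cv::Size(), 0.5, 0.5, cv::INTER_AREA);
}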
struct ResizeAreaFastVec_SIMD_32f
...
...
@@ -2976,7 +2538,6 @@ struct ResizeAreaFastVec_SIMD_32f
        cn(_cn), step(_step)
    {
        fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4);
        fast_mode = fast_mode && checkHardwareSupport(CV_CPU_SSE2);
    }

    int operator() (const float* S, float* D, int w) const
...
...
@@ -2987,33 +2548,32 @@ struct ResizeAreaFastVec_SIMD_32f
        const float* S0 = S, *S1 = (const float*)((const uchar*)(S0) + step);
        int dx = 0;

        __m128 v_025 = _mm_set1_ps(0.25f);

        if (cn == 1)
        {
            const int shuffle_lo = _MM_SHUFFLE(2, 0, 2, 0), shuffle_hi = _MM_SHUFFLE(3, 1, 3, 1);
            for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
            v_float32 v_025 = vx_setall_f32(0.25f);
            for ( ; dx <= w - v_float32::nlanes; dx += v_float32::nlanes, S0 += 2*v_float32::nlanes, S1 += 2*v_float32::nlanes, D += v_float32::nlanes)
            {
                __m128 v_row00 = _mm_loadu_ps(S0), v_row01 = _mm_loadu_ps(S0 + 4),
                       v_row10 = _mm_loadu_ps(S1), v_row11 = _mm_loadu_ps(S1 + 4);

                __m128 v_dst0 = _mm_add_ps(_mm_shuffle_ps(v_row00, v_row01, shuffle_lo),
                                           _mm_shuffle_ps(v_row00, v_row01, shuffle_hi));
                __m128 v_dst1 = _mm_add_ps(_mm_shuffle_ps(v_row10, v_row11, shuffle_lo),
                                           _mm_shuffle_ps(v_row10, v_row11, shuffle_hi));

                _mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025));
                v_float32 v_row00, v_row01, v_row10, v_row11;
                v_load_deinterleave(S0, v_row00, v_row01);
                v_load_deinterleave(S1, v_row10, v_row11);
                v_store(D, ((v_row00 + v_row01) + (v_row10 + v_row11)) * v_025);
            }
        }
        else if (cn == 4)
        {
            for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
#if CV_SIMD_WIDTH == 16
            v_float32 v_025 = vx_setall_f32(0.25f);
            for (; dx <= w - v_float32::nlanes; dx += v_float32::nlanes, S0 += 2*v_float32::nlanes, S1 += 2*v_float32::nlanes, D += v_float32::nlanes)
                v_store(D, ((vx_load(S0) + vx_load(S0 + v_float32::nlanes)) + (vx_load(S1) + vx_load(S1 + v_float32::nlanes))) * v_025);
#elif CV_SIMD256
            v_float32x8 v_025 = v256_setall_f32(0.25f);
            for (; dx <= w - v_float32x8::nlanes; dx += v_float32x8::nlanes, S0 += 2*v_float32x8::nlanes, S1 += 2*v_float32x8::nlanes, D += v_float32x8::nlanes)
            {
                __m128 v_dst0 = _mm_add_ps(_mm_loadu_ps(S0), _mm_loadu_ps(S0 + 4));
                __m128 v_dst1 = _mm_add_ps(_mm_loadu_ps(S1), _mm_loadu_ps(S1 + 4));
                _mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025));
                v_float32x8 dst0, dst1;
                v_recombine(v256_load(S0) + v256_load(S1), v256_load(S0 + v_float32x8::nlanes) + v256_load(S1 + v_float32x8::nlanes), dst0, dst1);
                v_store(D, (dst0 + dst1) * v_025);
            }
#endif
        }

        return dx;
...
...
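Each operator() above returns the number of destination elements it managed to process with vector code; the remaining tail of the row is expected to be finished by the scalar area-fast loop that invokes the functor. A hypothetical sketch of that contract for the single-channel 16-bit signed case (processRow and step_bytes are illustrative names, not taken from the file):

// Illustrative glue, not from the patch: run the vector functor first,
// then finish the remaining destination elements with a scalar rounded 2x2 average.
int processRow(const short* S, short* D, int w, size_t step_bytes,
               const ResizeAreaFastVec_SIMD_16s& vop)
{
    const short* S0 = S;
    const short* S1 = (const short*)((const uchar*)S + step_bytes); // second source row
    int dx = vop(S, D, w);           // elements already written by the vector code
    for ( ; dx < w; dx++)            // scalar tail (cn == 1 case)
        D[dx] = (short)((S0[dx*2] + S0[dx*2 + 1] + S1[dx*2] + S1[dx*2 + 1] + 2) >> 2);
    return w;
}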