submodule / opencv / Commits

Commit fadf25ac, authored Jul 03, 2017 by Vitaly Tuzov
SSE4_1 optimized implementation of resize and warp functions migrated to separate file
parent 3681dcef
Showing 4 changed files with 548 additions and 406 deletions (+548 −406)
modules/imgproc/src/imgwarp.avx2.cpp      +10  −2
modules/imgproc/src/imgwarp.cpp           +33  −402
modules/imgproc/src/imgwarp.hpp           +17  −1
modules/imgproc/src/imgwarp.sse4_1.cpp    +488 −1
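
The pattern behind all four files is OpenCV's runtime CPU dispatch, visible in the hunks below: the SSE4.1 bodies move into imgwarp.sse4_1.cpp (a translation unit built with SSE4.1 enabled), and imgwarp.cpp keeps only a CV_TRY_SSE4_1-guarded branch on CV_CPU_HAS_SUPPORT_SSE4_1. A minimal, self-contained sketch of that shape (illustrative names, not OpenCV's actual build glue):

    #include <cstdio>

    namespace opt_SSE4_1 {
    // Stand-in for a body that would live in the separately compiled SSE4.1 file.
    void resizeNN2_SSE4_1() { std::puts("SSE4.1 path"); }
    }

    static bool cpuHasSSE4_1() { return true; } // stand-in for CV_CPU_HAS_SUPPORT_SSE4_1

    int main()
    {
        if (cpuHasSSE4_1())
            opt_SSE4_1::resizeNN2_SSE4_1(); // optimized path, chosen at runtime
        else
            std::puts("generic path");      // baseline fallback kept in imgwarp.cpp
        return 0;
    }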
modules/imgproc/src/imgwarp.avx2.cpp
...
...
@@ -83,7 +83,9 @@ public:
            uchar* Dstart = D;
            int sy = std::min(cvFloor(y*ify), ssize.height - 1);
            const uchar* S = src.data + sy*src.step;
#ifdef CV_ICC
#pragma unroll(4)
#endif
            for (x = 0; x < avxWidth; x += 8)
            {
                const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
...
...
@@ -106,7 +108,9 @@ public:
            uchar* Dstart = D;
            int sy = std::min(cvFloor(y*ify), ssize.height - 1);
            const uchar* S = src.data + sy*src.step;
#ifdef CV_ICC
#pragma unroll(4)
#endif
            for (x = 0; x < avxWidth; x += 8)
            {
                const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
...
...
@@ -157,8 +161,8 @@ public:
        const __m256i CV_DECL_ALIGNED(64) shuffle_mask = _mm256_set_epi8(15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0,
                                                                         15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0);
        const __m256i CV_DECL_ALIGNED(64) permute_mask = _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0);
        const __m256i CV_DECL_ALIGNED(64) shift_shuffle_mask = _mm256_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2,
                                                                               13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2);
        //const __m256i CV_DECL_ALIGNED(64) shift_shuffle_mask = _mm256_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2,
        //                                                                       13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2);
        if (((int64)(dst.data + dst.step) & 0x1f) == 0)
        {
            for (y = range.start; y < range.end; y++)
...
...
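
A reading aid for the masks above: _mm256_set_epi8 lists bytes from the most-significant element down to element 0, so the *last* argument is the index that _mm256_shuffle_epi8 applies to byte 0 of each 128-bit lane. A small self-contained check (compile with AVX2 enabled, e.g. -mavx2):

    #include <immintrin.h>
    #include <cstdio>

    int main()
    {
        const __m256i idx = _mm256_set_epi8(15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0,
                                            15,14,11,10,13,12,9,8,7,6,3,2,5,4,1,0);
        unsigned char b[32];
        for (int i = 0; i < 32; i++) b[i] = (unsigned char)i;
        __m256i v = _mm256_loadu_si256((const __m256i*)b);
        v = _mm256_shuffle_epi8(v, idx);        // per-128-bit-lane byte shuffle
        _mm256_storeu_si256((__m256i*)b, v);
        for (int i = 0; i < 16; i++) printf("%d ", b[i]);
        printf("\n");   // prints: 0 1 4 5 2 3 6 7 8 9 12 13 10 11 14 15
        return 0;
    }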
@@ -168,7 +172,9 @@ public:
            int sy = std::min(cvFloor(y*ify), ssize.height - 1);
            const uchar* S = src.data + sy*src.step;
            const uchar* S2 = S - 2;
#ifdef CV_ICC
#pragma unroll(4)
#endif
            for (x = 0; x < avxWidth; x += 16)
            {
                const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
...
...
@@ -200,7 +206,9 @@ public:
            int sy = std::min(cvFloor(y*ify), ssize.height - 1);
            const uchar* S = src.data + sy*src.step;
            const uchar* S2 = S - 2;
#ifdef CV_ICC
#pragma unroll(4)
#endif
            for (x = 0; x < avxWidth; x += 16)
            {
                const __m256i CV_DECL_ALIGNED(64) *addr = (__m256i*)(x_ofs + x);
...
...
modules/imgproc/src/imgwarp.cpp
...
...
@@ -450,9 +450,9 @@ resizeNN( const Mat& src, Mat& dst, double fx, double fy )
    if (CV_CPU_HAS_SUPPORT_SSE4_1 && ((pix_size == 2) || (pix_size == 4)))
    {
        if (pix_size == 2)
-           opt_SSE41::resizeNN2_SSE4_1(range, src, dst, x_ofs, pix_size4, ify);
+           opt_SSE4_1::resizeNN2_SSE4_1(range, src, dst, x_ofs, pix_size4, ify);
        else
-           opt_SSE41::resizeNN4_SSE4_1(range, src, dst, x_ofs, pix_size4, ify);
+           opt_SSE4_1::resizeNN4_SSE4_1(range, src, dst, x_ofs, pix_size4, ify);
    }
    else
#endif
...
...
@@ -916,50 +916,14 @@ struct VResizeCubicVec_32f
    }
};

-#if CV_SSE4_1
+#if CV_TRY_SSE4_1
struct VResizeLanczos4Vec_32f16u
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width) const
    {
-       const float** src = (const float**)_src;
-       const float* beta = (const float*)_beta;
-       const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
-                   *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
-       short* dst = (short*)_dst;
-       int x = 0;
-       __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]),
-              v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]),
-              v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]),
-              v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]);
-
-       for (; x <= width - 8; x += 8)
-       {
-           __m128 v_dst0 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x));
-           v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x)));
-           v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x)));
-           v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x)));
-           v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x)));
-           v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x)));
-           v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x)));
-           v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x)));
-
-           __m128 v_dst1 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x + 4));
-           v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x + 4)));
-           v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x + 4)));
-           v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x + 4)));
-           v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x + 4)));
-           v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x + 4)));
-           v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x + 4)));
-           v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x + 4)));
-
-           __m128i v_dsti0 = _mm_cvtps_epi32(v_dst0);
-           __m128i v_dsti1 = _mm_cvtps_epi32(v_dst1);
-           _mm_storeu_si128((__m128i*)(dst + x), _mm_packus_epi32(v_dsti0, v_dsti1));
-       }
-
-       return x;
+       if (CV_CPU_HAS_SUPPORT_SSE4_1)
+           return opt_SSE4_1::VResizeLanczos4Vec_32f16u_SSE41(_src, _dst, _beta, width);
+       else
+           return 0;
    }
};
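
For reference, the loop being moved computes an 8-tap vertical Lanczos sum: each output pixel is a beta-weighted sum of the same column in eight source rows, rounded and saturated to unsigned 16-bit (which is what _mm_cvtps_epi32 plus _mm_packus_epi32 do four lanes at a time). A scalar sketch of the same computation:

    #include <algorithm>
    #include <cmath>

    static void vresize_lanczos4_ref(const float* const S[8], const float beta[8],
                                     unsigned short* dst, int width)
    {
        for (int x = 0; x < width; x++)
        {
            float sum = 0.f;
            for (int k = 0; k < 8; k++)
                sum += beta[k] * S[k][x];                 // one tap per source row
            int v = (int)std::lround(sum);                // round to nearest
            dst[x] = (unsigned short)std::min(std::max(v, 0), 65535); // unsigned saturation
        }
    }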
...
...
@@ -5149,8 +5113,8 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
#if CV_SSE2
    bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2);
#endif
-#if CV_SSE4_1
-   bool useSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
+#if CV_TRY_SSE4_1
+   bool useSSE4_1 = CV_CPU_HAS_SUPPORT_SSE4_1;
#endif

    const float scale = 1.f/INTER_TAB_SIZE;
...
...
@@ -5183,29 +5147,10 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
                vst2q_s16(dst1 + (x << 1), v_dst);
            }
-       #elif CV_SSE4_1
+       #elif CV_TRY_SSE4_1
            if (useSSE4_1)
-           {
-               for (; x <= size.width - 16; x += 16)
-               {
-                   __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)),
-                                                    _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4)));
-                   __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 8)),
-                                                    _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 12)));
-                   __m128i v_dst2 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x)),
-                                                    _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 4)));
-                   __m128i v_dst3 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 8)),
-                                                    _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 12)));
-
-                   _mm_interleave_epi16(v_dst0, v_dst1, v_dst2, v_dst3);
-
-                   _mm_storeu_si128((__m128i*)(dst1 + x * 2), v_dst0);
-                   _mm_storeu_si128((__m128i*)(dst1 + x * 2 + 8), v_dst1);
-                   _mm_storeu_si128((__m128i*)(dst1 + x * 2 + 16), v_dst2);
-                   _mm_storeu_si128((__m128i*)(dst1 + x * 2 + 24), v_dst3);
-               }
-           }
+               opt_SSE4_1::convertMaps_nninterpolate32f1c16s_SSE41(src1f, src2f, dst1, size.width);
            else
        #endif
            for (; x < size.width; x++)
            {
...
...
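
_mm_interleave_epi16 above is not an Intel intrinsic; it is an OpenCV helper (core/sse_utils.hpp) that, as used here, shuffles four registers of packed x and y values into x0,y0,x1,y1,... order so that four plain stores emit the interleaved map. The scalar shape it replaces, assuming that reading of its semantics:

    #include <cstdio>

    int main()
    {
        short xs[16], ys[16], out[32];
        for (int i = 0; i < 16; i++) { xs[i] = (short)i; ys[i] = (short)(100 + i); }
        for (int i = 0; i < 16; i++) { out[2*i] = xs[i]; out[2*i + 1] = ys[i]; }
        printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); // prints: 0 100 1 101
        return 0;
    }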
@@ -5240,52 +5185,10 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
                                                  vandq_s32(v_ix1, v_mask)));
                vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1));
            }
-       #elif CV_SSE4_1
+       #elif CV_TRY_SSE4_1
            if (useSSE4_1)
-           {
-               __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);
-               __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE - 1);
-
-               for (; x <= size.width - 16; x += 16)
-               {
-                   __m128i v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x), v_its));
-                   __m128i v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 4), v_its));
-                   __m128i v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x), v_its));
-                   __m128i v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 4), v_its));
-
-                   __m128i v_dst10 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS), _mm_srai_epi32(v_ix1, INTER_BITS));
-                   __m128i v_dst12 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS), _mm_srai_epi32(v_iy1, INTER_BITS));
-                   __m128i v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS), _mm_and_si128(v_ix0, v_its1));
-                   __m128i v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS), _mm_and_si128(v_ix1, v_its1));
-                   _mm_storeu_si128((__m128i*)(dst2 + x), _mm_packus_epi32(v_dst20, v_dst21));
-
-                   v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 8), v_its));
-                   v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 12), v_its));
-                   v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 8), v_its));
-                   v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 12), v_its));
-
-                   __m128i v_dst11 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS), _mm_srai_epi32(v_ix1, INTER_BITS));
-                   __m128i v_dst13 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS), _mm_srai_epi32(v_iy1, INTER_BITS));
-                   v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS), _mm_and_si128(v_ix0, v_its1));
-                   v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS), _mm_and_si128(v_ix1, v_its1));
-                   _mm_storeu_si128((__m128i*)(dst2 + x + 8), _mm_packus_epi32(v_dst20, v_dst21));
-
-                   _mm_interleave_epi16(v_dst10, v_dst11, v_dst12, v_dst13);
-
-                   _mm_storeu_si128((__m128i*)(dst1 + x * 2), v_dst10);
-                   _mm_storeu_si128((__m128i*)(dst1 + x * 2 + 8), v_dst11);
-                   _mm_storeu_si128((__m128i*)(dst1 + x * 2 + 16), v_dst12);
-                   _mm_storeu_si128((__m128i*)(dst1 + x * 2 + 24), v_dst13);
-               }
-           }
+               opt_SSE4_1::convertMaps_32f1c16s_SSE41(src1f, src2f, dst1, dst2, size.width);
            else
        #endif
            for (; x < size.width; x++)
            {
...
...
@@ -5346,30 +5249,10 @@ void cv::convertMaps( InputArray _map1, InputArray _map2,
                                                  vandq_s32(v_ix1, v_mask)));
                vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1));
            }
-       #elif CV_SSE4_1
+       #elif CV_TRY_SSE4_1
            if (useSSE4_1)
-           {
-               __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);
-               __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE - 1);
-               __m128i v_y_mask = _mm_set1_epi32((INTER_TAB_SIZE - 1) << 16);
-
-               for (; x <= size.width - 4; x += 4)
-               {
-                   __m128i v_src0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2), v_its));
-                   __m128i v_src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2 + 4), v_its));
-
-                   __m128i v_dst1 = _mm_packs_epi32(_mm_srai_epi32(v_src0, INTER_BITS),
-                                                    _mm_srai_epi32(v_src1, INTER_BITS));
-                   _mm_storeu_si128((__m128i*)(dst1 + x * 2), v_dst1);
-
-                   // x0 y0 x1 y1 . . .
-                   v_src0 = _mm_packs_epi32(_mm_and_si128(v_src0, v_its1),
-                                            _mm_and_si128(v_src1, v_its1));
-                   __m128i v_dst2 = _mm_or_si128(_mm_srli_epi32(_mm_and_si128(v_src0, v_y_mask), 16 - INTER_BITS), // y0 0 y1 0 . . .
-                                                 _mm_and_si128(v_src0, v_its1)); // 0 x0 0 x1 . . .
-                   _mm_storel_epi64((__m128i*)(dst2 + x), _mm_packus_epi32(v_dst2, v_dst2));
-               }
-           }
+               opt_SSE4_1::convertMaps_32f2c16s_SSE41(src1f, dst1, dst2, size.width);
            else
        #endif
            for (; x < size.width; x++)
            {
...
...
@@ -5557,8 +5440,8 @@ public:
#if CV_SSE2
        bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2);
#endif
-#if CV_SSE4_1
-       bool useSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
+#if CV_TRY_SSE4_1
+       bool useSSE4_1 = CV_CPU_HAS_SUPPORT_SSE4_1;
#endif

        int bh0 = std::min(BLOCK_SZ/2, dst.rows);
...
...
@@ -5596,31 +5479,10 @@ public:
                    vst2q_s16(xy + (x1 << 1), v_dst);
                }
-           #elif CV_SSE4_1
+           #elif CV_TRY_SSE4_1
                if (useSSE4_1)
-               {
-                   __m128i v_X0 = _mm_set1_epi32(X0);
-                   __m128i v_Y0 = _mm_set1_epi32(Y0);
-                   for (; x1 <= bw - 16; x1 += 16)
-                   {
-                       __m128i v_x0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1))), AB_BITS),
-                                                      _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 4))), AB_BITS));
-                       __m128i v_x1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 8))), AB_BITS),
-                                                      _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 12))), AB_BITS));
-                       __m128i v_y0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1))), AB_BITS),
-                                                      _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 4))), AB_BITS));
-                       __m128i v_y1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 8))), AB_BITS),
-                                                      _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 12))), AB_BITS));
-
-                       _mm_interleave_epi16(v_x0, v_x1, v_y0, v_y1);
-
-                       _mm_storeu_si128((__m128i*)(xy + x1 * 2), v_x0);
-                       _mm_storeu_si128((__m128i*)(xy + x1 * 2 + 8), v_x1);
-                       _mm_storeu_si128((__m128i*)(xy + x1 * 2 + 16), v_y0);
-                       _mm_storeu_si128((__m128i*)(xy + x1 * 2 + 24), v_y1);
-                   }
-               }
+                   opt_SSE4_1::WarpAffineInvoker_Blockline_SSE41(adelta + x, bdelta + x, xy, X0, Y0, bw);
                else
            #endif
                for (; x1 < bw; x1++)
                {
...
...
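
The scalar tail of this loop ("X = (X0 + adelta[x1]) >> AB_BITS") shows the fixed-point scheme the block relies on: adelta/bdelta are assumed (their setup is outside this diff) to hold the affine terms pre-scaled by 2^AB_BITS, so one add and one shift recover the integer coordinate. A worked example under that assumption:

    #include <cstdio>

    int main()
    {
        const int AB_BITS = 10;                  // MAX(10, INTER_BITS) with INTER_BITS == 5
        const double M0 = 0.75;                  // hypothetical affine coefficient
        const int x1 = 100;
        const int adelta = (int)(M0 * x1 * (1 << AB_BITS) + 0.5); // 76800
        const int X0 = 0;                        // row term at the same scale
        printf("X = %d\n", (X0 + adelta) >> AB_BITS); // prints: X = 75, i.e. floor(0.75*100)
        return 0;
    }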
@@ -6132,18 +5994,10 @@ public:
        int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, width);
        bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height);

-       #if CV_SSE4_1
-       bool haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
-       __m128d v_M0 = _mm_set1_pd(M[0]);
-       __m128d v_M3 = _mm_set1_pd(M[3]);
-       __m128d v_M6 = _mm_set1_pd(M[6]);
-       __m128d v_intmax = _mm_set1_pd((double)INT_MAX);
-       __m128d v_intmin = _mm_set1_pd((double)INT_MIN);
-       __m128d v_2 = _mm_set1_pd(2), v_zero = _mm_setzero_pd(), v_1 = _mm_set1_pd(1), v_its = _mm_set1_pd(INTER_TAB_SIZE);
-       __m128i v_itsi1 = _mm_set1_epi32(INTER_TAB_SIZE - 1);
+       #if CV_TRY_SSE4_1
+       Ptr<opt_SSE4_1::WarpPerspectiveLine_SSE4> pwarp_impl_sse4;
+       if (CV_CPU_HAS_SUPPORT_SSE4_1)
+           pwarp_impl_sse4 = opt_SSE4_1::WarpPerspectiveLine_SSE4::getImpl(M);
        #endif

        for (y = range.start; y < range.end; y += bh0)
...
...
@@ -6167,116 +6021,11 @@ public:
            {
                x1 = 0;

-               #if CV_SSE4_1
-               if (haveSSE4_1)
-               {
-                   __m128d v_X0d = _mm_set1_pd(X0);
-                   __m128d v_Y0d = _mm_set1_pd(Y0);
-                   __m128d v_W0 = _mm_set1_pd(W0);
-                   __m128d v_x1 = _mm_set_pd(1, 0);
-
-                   for (; x1 <= bw - 16; x1 += 16)
-                   {
-                       // 0-3
-                       __m128i v_X0, v_Y0;
-                       {
-                           __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
-                           v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
-                           __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
-                           __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
-                           v_x1 = _mm_add_pd(v_x1, v_2);
-
-                           v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
-                           v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
-                           __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
-                           __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
-                           v_x1 = _mm_add_pd(v_x1, v_2);
-
-                           v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
-                           v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
-                       }
-
-                       // 4-8
-                       __m128i v_X1, v_Y1;
-                       {
-                           __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
-                           v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
-                           __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
-                           __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
-                           v_x1 = _mm_add_pd(v_x1, v_2);
-
-                           v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
-                           v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
-                           __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
-                           __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
-                           v_x1 = _mm_add_pd(v_x1, v_2);
-
-                           v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
-                           v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
-                       }
-
-                       // 8-11
-                       __m128i v_X2, v_Y2;
-                       {
-                           __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
-                           v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
-                           __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
-                           __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
-                           v_x1 = _mm_add_pd(v_x1, v_2);
-
-                           v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
-                           v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
-                           __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
-                           __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
-                           v_x1 = _mm_add_pd(v_x1, v_2);
-
-                           v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
-                           v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
-                       }
-
-                       // 12-15
-                       __m128i v_X3, v_Y3;
-                       {
-                           __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
-                           v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
-                           __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
-                           __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
-                           v_x1 = _mm_add_pd(v_x1, v_2);
-
-                           v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
-                           v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
-                           __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
-                           __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
-                           v_x1 = _mm_add_pd(v_x1, v_2);
-
-                           v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
-                           v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
-                       }
-
-                       // convert to 16s
-                       v_X0 = _mm_packs_epi32(v_X0, v_X1);
-                       v_X1 = _mm_packs_epi32(v_X2, v_X3);
-                       v_Y0 = _mm_packs_epi32(v_Y0, v_Y1);
-                       v_Y1 = _mm_packs_epi32(v_Y2, v_Y3);
-
-                       _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1);
-
-                       _mm_storeu_si128((__m128i*)(xy + x1 * 2), v_X0);
-                       _mm_storeu_si128((__m128i*)(xy + x1 * 2 + 8), v_X1);
-                       _mm_storeu_si128((__m128i*)(xy + x1 * 2 + 16), v_Y0);
-                       _mm_storeu_si128((__m128i*)(xy + x1 * 2 + 24), v_Y1);
-                   }
-               }
+               #if CV_TRY_SSE4_1
+               if (pwarp_impl_sse4)
+                   pwarp_impl_sse4->processNN(M, xy, X0, Y0, W0, bw);
+               else
                #endif
                for (; x1 < bw; x1++)
                {
                    double W = W0 + M[6]*x1;
...
...
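
The _mm_cmpeq_pd/_mm_andnot_pd pair in the removed block is the branch-free form of the scalar guard "W = W ? 1./W : 0" that survives in the tail: lanes where W == 0 compare to all-ones, and andnot clears exactly those lanes of the reciprocal (the division still executes, but its infinite lane is discarded). A two-lane demonstration:

    #include <emmintrin.h>
    #include <cstdio>

    int main()
    {
        __m128d v_W    = _mm_set_pd(0.0, 4.0);   // lane 1 = 0, lane 0 = 4
        __m128d v_zero = _mm_setzero_pd();
        __m128d v_1    = _mm_set1_pd(1.0);
        __m128d inv = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
        double out[2];
        _mm_storeu_pd(out, inv);
        printf("%g %g\n", out[0], out[1]);       // prints: 0.25 0
        return 0;
    }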
@@ -6295,129 +6044,11 @@ public:
                short* alpha = A + y1*bw;
                x1 = 0;

-               #if CV_SSE4_1
-               if (haveSSE4_1)
-               {
-                   __m128d v_X0d = _mm_set1_pd(X0);
-                   __m128d v_Y0d = _mm_set1_pd(Y0);
-                   __m128d v_W0 = _mm_set1_pd(W0);
-                   __m128d v_x1 = _mm_set_pd(1, 0);
-
-                   for (; x1 <= bw - 16; x1 += 16)
-                   {
-                       // 0-3
-                       __m128i v_X0, v_Y0;
-                       {
-                           __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
-                           v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
-                           __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
-                           __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
-                           v_x1 = _mm_add_pd(v_x1, v_2);
-
-                           v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
-                           v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
-                           __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
-                           __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
-                           v_x1 = _mm_add_pd(v_x1, v_2);
-
-                           v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
-                           v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
-                       }
-
-                       // 4-8
-                       __m128i v_X1, v_Y1;
-                       {
-                           __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
-                           v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
-                           __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
-                           __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
-                           v_x1 = _mm_add_pd(v_x1, v_2);
-
-                           v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
-                           v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
-                           __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
-                           __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
-                           v_x1 = _mm_add_pd(v_x1, v_2);
-
-                           v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
-                           v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
-                       }
-
-                       // 8-11
-                       __m128i v_X2, v_Y2;
-                       {
-                           __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
-                           v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
-                           __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
-                           __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
-                           v_x1 = _mm_add_pd(v_x1, v_2);
-
-                           v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
-                           v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
-                           __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
-                           __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
-                           v_x1 = _mm_add_pd(v_x1, v_2);
-
-                           v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
-                           v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
-                       }
-
-                       // 12-15
-                       __m128i v_X3, v_Y3;
-                       {
-                           __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
-                           v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
-                           __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
-                           __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
-                           v_x1 = _mm_add_pd(v_x1, v_2);
-
-                           v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
-                           v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
-                           __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
-                           __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
-                           v_x1 = _mm_add_pd(v_x1, v_2);
-
-                           v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
-                           v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
-                       }
-
-                       // store alpha
-                       __m128i v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y0, v_itsi1), INTER_BITS), _mm_and_si128(v_X0, v_itsi1));
-                       __m128i v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y1, v_itsi1), INTER_BITS), _mm_and_si128(v_X1, v_itsi1));
-                       _mm_storeu_si128((__m128i*)(alpha + x1), _mm_packs_epi32(v_alpha0, v_alpha1));
-                       v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y2, v_itsi1), INTER_BITS), _mm_and_si128(v_X2, v_itsi1));
-                       v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y3, v_itsi1), INTER_BITS), _mm_and_si128(v_X3, v_itsi1));
-                       _mm_storeu_si128((__m128i*)(alpha + x1 + 8), _mm_packs_epi32(v_alpha0, v_alpha1));
-
-                       // convert to 16s
-                       v_X0 = _mm_packs_epi32(_mm_srai_epi32(v_X0, INTER_BITS), _mm_srai_epi32(v_X1, INTER_BITS));
-                       v_X1 = _mm_packs_epi32(_mm_srai_epi32(v_X2, INTER_BITS), _mm_srai_epi32(v_X3, INTER_BITS));
-                       v_Y0 = _mm_packs_epi32(_mm_srai_epi32(v_Y0, INTER_BITS), _mm_srai_epi32(v_Y1, INTER_BITS));
-                       v_Y1 = _mm_packs_epi32(_mm_srai_epi32(v_Y2, INTER_BITS), _mm_srai_epi32(v_Y3, INTER_BITS));
-
-                       _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1);
-
-                       _mm_storeu_si128((__m128i*)(xy + x1 * 2), v_X0);
-                       _mm_storeu_si128((__m128i*)(xy + x1 * 2 + 8), v_X1);
-                       _mm_storeu_si128((__m128i*)(xy + x1 * 2 + 16), v_Y0);
-                       _mm_storeu_si128((__m128i*)(xy + x1 * 2 + 24), v_Y1);
-                   }
-               }
+               #if CV_TRY_SSE4_1
+               if (pwarp_impl_sse4)
+                   pwarp_impl_sse4->process(M, xy, alpha, X0, Y0, W0, bw);
+               else
                #endif
                for (; x1 < bw; x1++)
                {
                    double W = W0 + M[6]*x1;
...
...
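
One pixel of the per-pixel path above (shown in full in imgwarp.sse4_1.cpp below), with hypothetical inputs; note how W folds the INTER_TAB_SIZE scaling into the perspective division:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        const int INTER_BITS = 5, INTER_TAB_SIZE = 1 << INTER_BITS; // OpenCV's values
        const double M0 = 1.0, M3 = 0.0, M6 = 0.0;                  // M[0], M[3], M[6]
        const double X0 = 10.5, Y0 = 20.25, W0 = 1.0;               // per-row terms
        const int x1 = 100;

        double W = W0 + M6 * x1;
        W = W ? INTER_TAB_SIZE / W : 0;
        int X = (int)std::lround((X0 + M0 * x1) * W);               // 3536
        int Y = (int)std::lround((Y0 + M3 * x1) * W);               // 648

        printf("xy=(%d,%d) alpha=%d\n",
               X >> INTER_BITS, Y >> INTER_BITS,                    // 110, 20
               (Y & (INTER_TAB_SIZE - 1)) * INTER_TAB_SIZE +
               (X & (INTER_TAB_SIZE - 1)));                         // 8*32 + 16 = 272
        return 0;
    }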
modules/imgproc/src/imgwarp.hpp
...
...
@@ -61,11 +61,27 @@ void resizeNN4_AVX2(const Range&, const Mat&, Mat&, int*, int, double);
int warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw);
#endif
}

-namespace opt_SSE41
+namespace opt_SSE4_1
{
#if CV_TRY_SSE4_1
void resizeNN2_SSE4_1(const Range&, const Mat&, Mat&, int*, int, double);
void resizeNN4_SSE4_1(const Range&, const Mat&, Mat&, int*, int, double);

+int VResizeLanczos4Vec_32f16u_SSE41(const uchar** _src, uchar* _dst, const uchar* _beta, int width);
+
+void convertMaps_nninterpolate32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, int width);
+void convertMaps_32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, ushort* dst2, int width);
+void convertMaps_32f2c16s_SSE41(const float* src1f, short* dst1, ushort* dst2, int width);
+
+void WarpAffineInvoker_Blockline_SSE41(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw);
+
+class WarpPerspectiveLine_SSE4
+{
+public:
+    static Ptr<WarpPerspectiveLine_SSE4> getImpl(const double* M);
+    virtual void processNN(const double* M, short* xy, double X0, double Y0, double W0, int bw) = 0;
+    virtual void process(const double* M, short* xy, short* alpha, double X0, double Y0, double W0, int bw) = 0;
+    virtual ~WarpPerspectiveLine_SSE4() {};
+};
#endif
}
}
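
The abstract class plus getImpl() keeps SSE4.1-only types (the implementation's __m128d members) out of this header, which baseline translation units include, while letting the caller build the per-matrix constants once and reuse them for every block line. A minimal sketch of the same shape with illustrative names:

    #include <memory>

    class Line
    {
    public:
        static std::unique_ptr<Line> getImpl(const double* M);  // factory
        virtual void process(short* xy, int bw) = 0;
        virtual ~Line() {}
    };

    class LineImpl : public Line        // would live in the -msse4.1 TU
    {
    public:
        explicit LineImpl(const double* M) : m0(M[0]) {}
        void process(short* xy, int bw) override
        {
            for (int i = 0; i < bw; i++) xy[i] = (short)(m0 * i);  // stand-in body
        }
    private:
        double m0;                      // the real impl caches __m128d constants here
    };

    std::unique_ptr<Line> Line::getImpl(const double* M)
    {
        return std::unique_ptr<Line>(new LineImpl(M));
    }

    int main()
    {
        const double M[9] = { 0.5 };    // only M[0] is used in this toy
        short xy[4];
        Line::getImpl(M)->process(xy, 4);
        return 0;
    }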
...
...
modules/imgproc/src/imgwarp.sse4_1.cpp
...
...
@@ -52,7 +52,7 @@
namespace
cv
{
namespace
opt_SSE41
namespace
opt_SSE4
_
1
{
class
resizeNNInvokerSSE2
:
...
...
@@ -186,7 +186,494 @@ void resizeNN4_SSE4_1(const Range& range, const Mat& src, Mat &dst, int *x_ofs,
    parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
}
int VResizeLanczos4Vec_32f16u_SSE41(const uchar** _src, uchar* _dst, const uchar* _beta, int width)
{
    const float** src = (const float**)_src;
    const float* beta = (const float*)_beta;
    const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
                *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
    short* dst = (short*)_dst;
    int x = 0;
    __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]),
           v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]),
           v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]),
           v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]);

    for (; x <= width - 8; x += 8)
    {
        __m128 v_dst0 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x));
        v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x)));
        v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x)));
        v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x)));
        v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x)));
        v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x)));
        v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x)));
        v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x)));

        __m128 v_dst1 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x + 4));
        v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x + 4)));
        v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x + 4)));
        v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x + 4)));
        v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x + 4)));
        v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x + 4)));
        v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x + 4)));
        v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x + 4)));

        __m128i v_dsti0 = _mm_cvtps_epi32(v_dst0);
        __m128i v_dsti1 = _mm_cvtps_epi32(v_dst1);
        _mm_storeu_si128((__m128i*)(dst + x), _mm_packus_epi32(v_dsti0, v_dsti1));
    }

    return x;
}
void convertMaps_nninterpolate32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, int width)
{
    int x = 0;
    for (; x <= width - 16; x += 16)
    {
        __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)),
                                         _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4)));
        __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 8)),
                                         _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 12)));
        __m128i v_dst2 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x)),
                                         _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 4)));
        __m128i v_dst3 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 8)),
                                         _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 12)));

        _mm_interleave_epi16(v_dst0, v_dst1, v_dst2, v_dst3);

        _mm_storeu_si128((__m128i*)(dst1 + x * 2), v_dst0);
        _mm_storeu_si128((__m128i*)(dst1 + x * 2 + 8), v_dst1);
        _mm_storeu_si128((__m128i*)(dst1 + x * 2 + 16), v_dst2);
        _mm_storeu_si128((__m128i*)(dst1 + x * 2 + 24), v_dst3);
    }
    for (; x < width; x++)
    {
        dst1[x * 2] = saturate_cast<short>(src1f[x]);
        dst1[x * 2 + 1] = saturate_cast<short>(src2f[x]);
    }
}
void convertMaps_32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, ushort* dst2, int width)
{
    int x = 0;
    __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);
    __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE - 1);

    for (; x <= width - 16; x += 16)
    {
        __m128i v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x), v_its));
        __m128i v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 4), v_its));
        __m128i v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x), v_its));
        __m128i v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 4), v_its));

        __m128i v_dst10 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS), _mm_srai_epi32(v_ix1, INTER_BITS));
        __m128i v_dst12 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS), _mm_srai_epi32(v_iy1, INTER_BITS));
        __m128i v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS), _mm_and_si128(v_ix0, v_its1));
        __m128i v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS), _mm_and_si128(v_ix1, v_its1));
        _mm_storeu_si128((__m128i*)(dst2 + x), _mm_packus_epi32(v_dst20, v_dst21));

        v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 8), v_its));
        v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 12), v_its));
        v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 8), v_its));
        v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 12), v_its));

        __m128i v_dst11 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS), _mm_srai_epi32(v_ix1, INTER_BITS));
        __m128i v_dst13 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS), _mm_srai_epi32(v_iy1, INTER_BITS));
        v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS), _mm_and_si128(v_ix0, v_its1));
        v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS), _mm_and_si128(v_ix1, v_its1));
        _mm_storeu_si128((__m128i*)(dst2 + x + 8), _mm_packus_epi32(v_dst20, v_dst21));

        _mm_interleave_epi16(v_dst10, v_dst11, v_dst12, v_dst13);

        _mm_storeu_si128((__m128i*)(dst1 + x * 2), v_dst10);
        _mm_storeu_si128((__m128i*)(dst1 + x * 2 + 8), v_dst11);
        _mm_storeu_si128((__m128i*)(dst1 + x * 2 + 16), v_dst12);
        _mm_storeu_si128((__m128i*)(dst1 + x * 2 + 24), v_dst13);
    }
    for (; x < width; x++)
    {
        int ix = saturate_cast<int>(src1f[x] * INTER_TAB_SIZE);
        int iy = saturate_cast<int>(src2f[x] * INTER_TAB_SIZE);
        dst1[x * 2] = saturate_cast<short>(ix >> INTER_BITS);
        dst1[x * 2 + 1] = saturate_cast<short>(iy >> INTER_BITS);
        dst2[x] = (ushort)((iy & (INTER_TAB_SIZE - 1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE - 1)));
    }
}
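
The scalar tail makes the dst1/dst2 split explicit: dst1 receives the integer pixel coordinates, dst2 a combined index into the INTER_TAB_SIZE-by-INTER_TAB_SIZE interpolation table (OpenCV uses INTER_BITS == 5, so INTER_TAB_SIZE == 32 — assumed in the numbers below):

    #include <cstdio>

    int main()
    {
        const int INTER_BITS = 5, INTER_TAB_SIZE = 1 << INTER_BITS;
        const float xf = 3.25f, yf = 7.5f;                 // hypothetical map entry
        const int ix = (int)(xf * INTER_TAB_SIZE + 0.5f);  // 104
        const int iy = (int)(yf * INTER_TAB_SIZE + 0.5f);  // 240
        printf("dst1=(%d,%d) dst2=%d\n",
               ix >> INTER_BITS, iy >> INTER_BITS,         // 3, 7
               (iy & (INTER_TAB_SIZE - 1)) * INTER_TAB_SIZE +
               (ix & (INTER_TAB_SIZE - 1)));               // 16*32 + 8 = 520
        return 0;
    }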
void convertMaps_32f2c16s_SSE41(const float* src1f, short* dst1, ushort* dst2, int width)
{
    int x = 0;
    __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);
    __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE - 1);
    __m128i v_y_mask = _mm_set1_epi32((INTER_TAB_SIZE - 1) << 16);

    for (; x <= width - 4; x += 4)
    {
        __m128i v_src0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2), v_its));
        __m128i v_src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2 + 4), v_its));

        __m128i v_dst1 = _mm_packs_epi32(_mm_srai_epi32(v_src0, INTER_BITS),
                                         _mm_srai_epi32(v_src1, INTER_BITS));
        _mm_storeu_si128((__m128i*)(dst1 + x * 2), v_dst1);

        // x0 y0 x1 y1 . . .
        v_src0 = _mm_packs_epi32(_mm_and_si128(v_src0, v_its1),
                                 _mm_and_si128(v_src1, v_its1));
        __m128i v_dst2 = _mm_or_si128(_mm_srli_epi32(_mm_and_si128(v_src0, v_y_mask), 16 - INTER_BITS), // y0 0 y1 0 . . .
                                      _mm_and_si128(v_src0, v_its1)); // 0 x0 0 x1 . . .
        _mm_storel_epi64((__m128i*)(dst2 + x), _mm_packus_epi32(v_dst2, v_dst2));
    }
    for (; x < width; x++)
    {
        int ix = saturate_cast<int>(src1f[x * 2] * INTER_TAB_SIZE);
        int iy = saturate_cast<int>(src1f[x * 2 + 1] * INTER_TAB_SIZE);
        dst1[x * 2] = saturate_cast<short>(ix >> INTER_BITS);
        dst1[x * 2 + 1] = saturate_cast<short>(iy >> INTER_BITS);
        dst2[x] = (ushort)((iy & (INTER_TAB_SIZE - 1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE - 1)));
    }
}
void WarpAffineInvoker_Blockline_SSE41(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw)
{
    const int AB_BITS = MAX(10, (int)INTER_BITS);
    int x1 = 0;

    __m128i v_X0 = _mm_set1_epi32(X0);
    __m128i v_Y0 = _mm_set1_epi32(Y0);
    for (; x1 <= bw - 16; x1 += 16)
    {
        __m128i v_x0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1))), AB_BITS),
                                       _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1 + 4))), AB_BITS));
        __m128i v_x1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1 + 8))), AB_BITS),
                                       _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1 + 12))), AB_BITS));
        __m128i v_y0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1))), AB_BITS),
                                       _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1 + 4))), AB_BITS));
        __m128i v_y1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1 + 8))), AB_BITS),
                                       _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1 + 12))), AB_BITS));

        _mm_interleave_epi16(v_x0, v_x1, v_y0, v_y1);

        _mm_storeu_si128((__m128i*)(xy + x1 * 2), v_x0);
        _mm_storeu_si128((__m128i*)(xy + x1 * 2 + 8), v_x1);
        _mm_storeu_si128((__m128i*)(xy + x1 * 2 + 16), v_y0);
        _mm_storeu_si128((__m128i*)(xy + x1 * 2 + 24), v_y1);
    }
    for (; x1 < bw; x1++)
    {
        int X = (X0 + adelta[x1]) >> AB_BITS;
        int Y = (Y0 + bdelta[x1]) >> AB_BITS;
        xy[x1 * 2] = saturate_cast<short>(X);
        xy[x1 * 2 + 1] = saturate_cast<short>(Y);
    }
}
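
The vector body and the scalar tail saturate identically: _mm_packs_epi32 performs signed 32-to-16-bit saturation, which is what saturate_cast<short> does one value at a time. A plain-C++ illustration of that clamp:

    #include <climits>
    #include <cstdio>

    static short sat16(int v) // what saturate_cast<short> computes
    {
        return (short)(v < SHRT_MIN ? SHRT_MIN : v > SHRT_MAX ? SHRT_MAX : v);
    }

    int main()
    {
        printf("%d %d %d\n", sat16(40000), sat16(-40000), sat16(123));
        // prints: 32767 -32768 123
        return 0;
    }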
class WarpPerspectiveLine_SSE4_Impl : public WarpPerspectiveLine_SSE4
{
public:
    WarpPerspectiveLine_SSE4_Impl(const double* M)
    {
        v_M0 = _mm_set1_pd(M[0]);
        v_M3 = _mm_set1_pd(M[3]);
        v_M6 = _mm_set1_pd(M[6]);
        v_intmax = _mm_set1_pd((double)INT_MAX);
        v_intmin = _mm_set1_pd((double)INT_MIN);
        v_2 = _mm_set1_pd(2);
        v_zero = _mm_setzero_pd();
        v_1 = _mm_set1_pd(1);
        v_its = _mm_set1_pd(INTER_TAB_SIZE);
        v_itsi1 = _mm_set1_epi32(INTER_TAB_SIZE - 1);
    }
    virtual void processNN(const double* M, short* xy, double X0, double Y0, double W0, int bw)
    {
        int x1 = 0;
        __m128d v_X0d = _mm_set1_pd(X0);
        __m128d v_Y0d = _mm_set1_pd(Y0);
        __m128d v_W0 = _mm_set1_pd(W0);
        __m128d v_x1 = _mm_set_pd(1, 0);

        for (; x1 <= bw - 16; x1 += 16)
        {
            // 0-3
            __m128i v_X0, v_Y0;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // 4-8
            __m128i v_X1, v_Y1;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // 8-11
            __m128i v_X2, v_Y2;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // 12-15
            __m128i v_X3, v_Y3;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // convert to 16s
            v_X0 = _mm_packs_epi32(v_X0, v_X1);
            v_X1 = _mm_packs_epi32(v_X2, v_X3);
            v_Y0 = _mm_packs_epi32(v_Y0, v_Y1);
            v_Y1 = _mm_packs_epi32(v_Y2, v_Y3);

            _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1);

            _mm_storeu_si128((__m128i*)(xy + x1 * 2), v_X0);
            _mm_storeu_si128((__m128i*)(xy + x1 * 2 + 8), v_X1);
            _mm_storeu_si128((__m128i*)(xy + x1 * 2 + 16), v_Y0);
            _mm_storeu_si128((__m128i*)(xy + x1 * 2 + 24), v_Y1);
        }
        for (; x1 < bw; x1++)
        {
            double W = W0 + M[6]*x1;
            W = W ? 1./W : 0;
            double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
            double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
            int X = saturate_cast<int>(fX);
            int Y = saturate_cast<int>(fY);

            xy[x1*2] = saturate_cast<short>(X);
            xy[x1*2+1] = saturate_cast<short>(Y);
        }
    }
    virtual void process(const double* M, short* xy, short* alpha, double X0, double Y0, double W0, int bw)
    {
        int x1 = 0;
        __m128d v_X0d = _mm_set1_pd(X0);
        __m128d v_Y0d = _mm_set1_pd(Y0);
        __m128d v_W0 = _mm_set1_pd(W0);
        __m128d v_x1 = _mm_set_pd(1, 0);

        for (; x1 <= bw - 16; x1 += 16)
        {
            // 0-3
            __m128i v_X0, v_Y0;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // 4-8
            __m128i v_X1, v_Y1;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // 8-11
            __m128i v_X2, v_Y2;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // 12-15
            __m128i v_X3, v_Y3;
            {
                __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                v_x1 = _mm_add_pd(v_x1, v_2);

                v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
            }

            // store alpha
            __m128i v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y0, v_itsi1), INTER_BITS), _mm_and_si128(v_X0, v_itsi1));
            __m128i v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y1, v_itsi1), INTER_BITS), _mm_and_si128(v_X1, v_itsi1));
            _mm_storeu_si128((__m128i*)(alpha + x1), _mm_packs_epi32(v_alpha0, v_alpha1));
            v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y2, v_itsi1), INTER_BITS), _mm_and_si128(v_X2, v_itsi1));
            v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y3, v_itsi1), INTER_BITS), _mm_and_si128(v_X3, v_itsi1));
            _mm_storeu_si128((__m128i*)(alpha + x1 + 8), _mm_packs_epi32(v_alpha0, v_alpha1));

            // convert to 16s
            v_X0 = _mm_packs_epi32(_mm_srai_epi32(v_X0, INTER_BITS), _mm_srai_epi32(v_X1, INTER_BITS));
            v_X1 = _mm_packs_epi32(_mm_srai_epi32(v_X2, INTER_BITS), _mm_srai_epi32(v_X3, INTER_BITS));
            v_Y0 = _mm_packs_epi32(_mm_srai_epi32(v_Y0, INTER_BITS), _mm_srai_epi32(v_Y1, INTER_BITS));
            v_Y1 = _mm_packs_epi32(_mm_srai_epi32(v_Y2, INTER_BITS), _mm_srai_epi32(v_Y3, INTER_BITS));

            _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1);

            _mm_storeu_si128((__m128i*)(xy + x1 * 2), v_X0);
            _mm_storeu_si128((__m128i*)(xy + x1 * 2 + 8), v_X1);
            _mm_storeu_si128((__m128i*)(xy + x1 * 2 + 16), v_Y0);
            _mm_storeu_si128((__m128i*)(xy + x1 * 2 + 24), v_Y1);
        }
        for (; x1 < bw; x1++)
        {
            double W = W0 + M[6]*x1;
            W = W ? INTER_TAB_SIZE/W : 0;
            double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
            double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
            int X = saturate_cast<int>(fX);
            int Y = saturate_cast<int>(fY);

            xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
            xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
            alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (X & (INTER_TAB_SIZE-1)));
        }
    }
    virtual ~WarpPerspectiveLine_SSE4_Impl() {};
private:
    __m128d v_M0;
    __m128d v_M3;
    __m128d v_M6;
    __m128d v_intmax;
    __m128d v_intmin;
    __m128d v_2, v_zero, v_1, v_its;
    __m128i v_itsi1;
};

Ptr<WarpPerspectiveLine_SSE4> WarpPerspectiveLine_SSE4::getImpl(const double* M)
{
    return Ptr<WarpPerspectiveLine_SSE4>(new WarpPerspectiveLine_SSE4_Impl(M));
}
}
}
/* End of file. */