Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
a25dba00
Commit
a25dba00
authored
Jul 17, 2016
by
k-shinotsuka
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
improve SymmRowSmallVec_8u32s().
parent
69c4e84d
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
117 additions
and
143 deletions
+117
-143
filter.cpp
modules/imgproc/src/filter.cpp
+117
-143
No files found.
modules/imgproc/src/filter.cpp
View file @
a25dba00
...
@@ -652,41 +652,30 @@ struct SymmRowSmallVec_8u32s
...
@@ -652,41 +652,30 @@ struct SymmRowSmallVec_8u32s
{
{
__m128i
k0
=
_mm_shuffle_epi32
(
_mm_cvtsi32_si128
(
kx
[
0
]),
0
),
__m128i
k0
=
_mm_shuffle_epi32
(
_mm_cvtsi32_si128
(
kx
[
0
]),
0
),
k1
=
_mm_shuffle_epi32
(
_mm_cvtsi32_si128
(
kx
[
1
]),
0
);
k1
=
_mm_shuffle_epi32
(
_mm_cvtsi32_si128
(
kx
[
1
]),
0
);
k0
=
_mm_packs_epi32
(
k0
,
k0
);
k1
=
_mm_packs_epi32
(
k1
,
k1
);
k1
=
_mm_packs_epi32
(
k1
,
k1
);
for
(
;
i
<=
width
-
16
;
i
+=
16
,
src
+=
16
)
for
(
;
i
<=
width
-
8
;
i
+=
8
,
src
+=
8
)
{
{
__m128i
x0
,
x1
,
x2
,
y0
,
y1
,
t0
,
t1
,
z0
,
z1
,
z2
,
z3
;
__m128i
x0
=
_mm_loadl_epi64
((
__m128i
*
)(
src
-
cn
));
x0
=
_mm_loadu_si128
((
__m128i
*
)(
src
-
cn
));
__m128i
x1
=
_mm_loadl_epi64
((
__m128i
*
)
src
);
x1
=
_mm_loadu_si128
((
__m128i
*
)
src
);
__m128i
x2
=
_mm_loadl_epi64
((
__m128i
*
)(
src
+
cn
));
x2
=
_mm_loadu_si128
((
__m128i
*
)(
src
+
cn
));
y0
=
_mm_add_epi16
(
_mm_unpackhi_epi8
(
x0
,
z
),
_mm_unpackhi_epi8
(
x2
,
z
));
x0
=
_mm_add_epi16
(
_mm_unpacklo_epi8
(
x0
,
z
),
_mm_unpacklo_epi8
(
x2
,
z
));
y1
=
_mm_unpackhi_epi8
(
x1
,
z
);
x1
=
_mm_unpacklo_epi8
(
x1
,
z
);
t1
=
_mm_mulhi_epi16
(
x1
,
k0
);
x0
=
_mm_unpacklo_epi8
(
x0
,
z
);
t0
=
_mm_mullo_epi16
(
x1
,
k0
);
x1
=
_mm_unpacklo_epi8
(
x1
,
z
);
x2
=
_mm_mulhi_epi16
(
x0
,
k1
);
x2
=
_mm_unpacklo_epi8
(
x2
,
z
);
x0
=
_mm_mullo_epi16
(
x0
,
k1
);
__m128i
x3
=
_mm_unpacklo_epi16
(
x0
,
x2
);
z0
=
_mm_unpacklo_epi16
(
t0
,
t1
);
__m128i
x4
=
_mm_unpackhi_epi16
(
x0
,
x2
);
z1
=
_mm_unpackhi_epi16
(
t0
,
t1
);
__m128i
x5
=
_mm_unpacklo_epi16
(
x1
,
z
);
z0
=
_mm_add_epi32
(
z0
,
_mm_unpacklo_epi16
(
x0
,
x2
));
__m128i
x6
=
_mm_unpackhi_epi16
(
x1
,
z
);
z1
=
_mm_add_epi32
(
z1
,
_mm_unpackhi_epi16
(
x0
,
x2
));
x3
=
_mm_madd_epi16
(
x3
,
k1
);
x4
=
_mm_madd_epi16
(
x4
,
k1
);
t1
=
_mm_mulhi_epi16
(
y1
,
k0
);
x5
=
_mm_madd_epi16
(
x5
,
k0
);
t0
=
_mm_mullo_epi16
(
y1
,
k0
);
x6
=
_mm_madd_epi16
(
x6
,
k0
);
y1
=
_mm_mulhi_epi16
(
y0
,
k1
);
x3
=
_mm_add_epi32
(
x3
,
x5
);
y0
=
_mm_mullo_epi16
(
y0
,
k1
);
x4
=
_mm_add_epi32
(
x4
,
x6
);
z2
=
_mm_unpacklo_epi16
(
t0
,
t1
);
z3
=
_mm_unpackhi_epi16
(
t0
,
t1
);
_mm_store_si128
((
__m128i
*
)(
dst
+
i
),
x3
);
z2
=
_mm_add_epi32
(
z2
,
_mm_unpacklo_epi16
(
y0
,
y1
));
_mm_store_si128
((
__m128i
*
)(
dst
+
i
+
4
),
x4
);
z3
=
_mm_add_epi32
(
z3
,
_mm_unpackhi_epi16
(
y0
,
y1
));
_mm_store_si128
((
__m128i
*
)(
dst
+
i
),
z0
);
_mm_store_si128
((
__m128i
*
)(
dst
+
i
+
4
),
z1
);
_mm_store_si128
((
__m128i
*
)(
dst
+
i
+
8
),
z2
);
_mm_store_si128
((
__m128i
*
)(
dst
+
i
+
12
),
z3
);
}
}
}
}
}
}
...
@@ -717,57 +706,45 @@ struct SymmRowSmallVec_8u32s
...
@@ -717,57 +706,45 @@ struct SymmRowSmallVec_8u32s
__m128i
k0
=
_mm_shuffle_epi32
(
_mm_cvtsi32_si128
(
kx
[
0
]),
0
),
__m128i
k0
=
_mm_shuffle_epi32
(
_mm_cvtsi32_si128
(
kx
[
0
]),
0
),
k1
=
_mm_shuffle_epi32
(
_mm_cvtsi32_si128
(
kx
[
1
]),
0
),
k1
=
_mm_shuffle_epi32
(
_mm_cvtsi32_si128
(
kx
[
1
]),
0
),
k2
=
_mm_shuffle_epi32
(
_mm_cvtsi32_si128
(
kx
[
2
]),
0
);
k2
=
_mm_shuffle_epi32
(
_mm_cvtsi32_si128
(
kx
[
2
]),
0
);
k0
=
_mm_packs_epi32
(
k0
,
k0
);
k1
=
_mm_packs_epi32
(
k1
,
k1
);
k1
=
_mm_packs_epi32
(
k1
,
k1
);
k2
=
_mm_packs_epi32
(
k2
,
k2
);
k2
=
_mm_packs_epi32
(
k2
,
k2
);
for
(
;
i
<=
width
-
16
;
i
+=
16
,
src
+=
16
)
for
(
;
i
<=
width
-
8
;
i
+=
8
,
src
+=
8
)
{
{
__m128i
x0
,
x1
,
x2
,
y0
,
y1
,
t0
,
t1
,
z0
,
z1
,
z2
,
z3
;
__m128i
x0
=
_mm_loadl_epi64
((
__m128i
*
)
src
);
x0
=
_mm_loadu_si128
((
__m128i
*
)(
src
-
cn
));
x1
=
_mm_loadu_si128
((
__m128i
*
)
src
);
x2
=
_mm_loadu_si128
((
__m128i
*
)(
src
+
cn
));
y0
=
_mm_add_epi16
(
_mm_unpackhi_epi8
(
x0
,
z
),
_mm_unpackhi_epi8
(
x2
,
z
));
x0
=
_mm_add_epi16
(
_mm_unpacklo_epi8
(
x0
,
z
),
_mm_unpacklo_epi8
(
x2
,
z
));
y1
=
_mm_unpackhi_epi8
(
x1
,
z
);
x1
=
_mm_unpacklo_epi8
(
x1
,
z
);
t1
=
_mm_mulhi_epi16
(
x1
,
k0
);
t0
=
_mm_mullo_epi16
(
x1
,
k0
);
x2
=
_mm_mulhi_epi16
(
x0
,
k1
);
x0
=
_mm_mullo_epi16
(
x0
,
k1
);
z0
=
_mm_unpacklo_epi16
(
t0
,
t1
);
z1
=
_mm_unpackhi_epi16
(
t0
,
t1
);
z0
=
_mm_add_epi32
(
z0
,
_mm_unpacklo_epi16
(
x0
,
x2
));
z1
=
_mm_add_epi32
(
z1
,
_mm_unpackhi_epi16
(
x0
,
x2
));
t1
=
_mm_mulhi_epi16
(
y1
,
k0
);
t0
=
_mm_mullo_epi16
(
y1
,
k0
);
y1
=
_mm_mulhi_epi16
(
y0
,
k1
);
y0
=
_mm_mullo_epi16
(
y0
,
k1
);
z2
=
_mm_unpacklo_epi16
(
t0
,
t1
);
z3
=
_mm_unpackhi_epi16
(
t0
,
t1
);
z2
=
_mm_add_epi32
(
z2
,
_mm_unpacklo_epi16
(
y0
,
y1
));
z3
=
_mm_add_epi32
(
z3
,
_mm_unpackhi_epi16
(
y0
,
y1
));
x0
=
_mm_loadu_si128
((
__m128i
*
)(
src
-
cn
*
2
));
x0
=
_mm_unpacklo_epi8
(
x0
,
z
);
x1
=
_mm_loadu_si128
((
__m128i
*
)(
src
+
cn
*
2
));
__m128i
x1
=
_mm_unpacklo_epi16
(
x0
,
z
);
y1
=
_mm_add_epi16
(
_mm_unpackhi_epi8
(
x0
,
z
),
_mm_unpackhi_epi8
(
x1
,
z
));
__m128i
x2
=
_mm_unpackhi_epi16
(
x0
,
z
);
y0
=
_mm_add_epi16
(
_mm_unpacklo_epi8
(
x0
,
z
),
_mm_unpacklo_epi8
(
x1
,
z
));
x1
=
_mm_madd_epi16
(
x1
,
k0
);
x2
=
_mm_madd_epi16
(
x2
,
k0
);
t1
=
_mm_mulhi_epi16
(
y0
,
k2
);
t0
=
_mm_mullo_epi16
(
y0
,
k2
);
__m128i
x3
=
_mm_loadl_epi64
((
__m128i
*
)(
src
-
cn
));
y0
=
_mm_mullo_epi16
(
y1
,
k2
);
__m128i
x4
=
_mm_loadl_epi64
((
__m128i
*
)(
src
+
cn
));
y1
=
_mm_mulhi_epi16
(
y1
,
k2
);
z0
=
_mm_add_epi32
(
z0
,
_mm_unpacklo_epi16
(
t0
,
t1
));
x3
=
_mm_unpacklo_epi8
(
x3
,
z
);
z1
=
_mm_add_epi32
(
z1
,
_mm_unpackhi_epi16
(
t0
,
t1
));
x4
=
_mm_unpacklo_epi8
(
x4
,
z
);
z2
=
_mm_add_epi32
(
z2
,
_mm_unpacklo_epi16
(
y0
,
y1
));
__m128i
x5
=
_mm_unpacklo_epi16
(
x3
,
x4
);
z3
=
_mm_add_epi32
(
z3
,
_mm_unpackhi_epi16
(
y0
,
y1
));
__m128i
x6
=
_mm_unpackhi_epi16
(
x3
,
x4
);
x5
=
_mm_madd_epi16
(
x5
,
k1
);
_mm_store_si128
((
__m128i
*
)(
dst
+
i
),
z0
);
x6
=
_mm_madd_epi16
(
x6
,
k1
);
_mm_store_si128
((
__m128i
*
)(
dst
+
i
+
4
),
z1
);
x1
=
_mm_add_epi32
(
x1
,
x5
);
_mm_store_si128
((
__m128i
*
)(
dst
+
i
+
8
),
z2
);
x2
=
_mm_add_epi32
(
x2
,
x6
);
_mm_store_si128
((
__m128i
*
)(
dst
+
i
+
12
),
z3
);
x3
=
_mm_loadl_epi64
((
__m128i
*
)(
src
-
cn
*
2
));
x4
=
_mm_loadl_epi64
((
__m128i
*
)(
src
+
cn
*
2
));
x3
=
_mm_unpacklo_epi8
(
x3
,
z
);
x4
=
_mm_unpacklo_epi8
(
x4
,
z
);
x5
=
_mm_unpacklo_epi16
(
x3
,
x4
);
x6
=
_mm_unpackhi_epi16
(
x3
,
x4
);
x5
=
_mm_madd_epi16
(
x5
,
k2
);
x6
=
_mm_madd_epi16
(
x6
,
k2
);
x1
=
_mm_add_epi32
(
x1
,
x5
);
x2
=
_mm_add_epi32
(
x2
,
x6
);
_mm_store_si128
((
__m128i
*
)(
dst
+
i
),
x1
);
_mm_store_si128
((
__m128i
*
)(
dst
+
i
+
4
),
x2
);
}
}
}
}
}
}
...
@@ -791,77 +768,75 @@ struct SymmRowSmallVec_8u32s
...
@@ -791,77 +768,75 @@ struct SymmRowSmallVec_8u32s
}
}
else
else
{
{
__m128i
k
1
=
_mm_shuffle_epi32
(
_mm_cvtsi32_si128
(
kx
[
1
]),
0
);
__m128i
k
0
=
_mm_set_epi32
(
-
kx
[
1
],
kx
[
1
],
-
kx
[
1
],
kx
[
1
]
);
k
1
=
_mm_packs_epi32
(
k1
,
k1
);
k
0
=
_mm_packs_epi32
(
k0
,
k0
);
for
(
;
i
<=
width
-
16
;
i
+=
16
,
src
+=
16
)
for
(
;
i
<=
width
-
16
;
i
+=
16
,
src
+=
16
)
{
{
__m128i
x0
,
x1
,
y0
,
y1
,
z0
,
z1
,
z2
,
z3
;
__m128i
x0
=
_mm_loadu_si128
((
__m128i
*
)(
src
+
cn
));
x0
=
_mm_loadu_si128
((
__m128i
*
)(
src
+
cn
));
__m128i
x1
=
_mm_loadu_si128
((
__m128i
*
)(
src
-
cn
));
x1
=
_mm_loadu_si128
((
__m128i
*
)(
src
-
cn
));
y0
=
_mm_sub_epi16
(
_mm_unpackhi_epi8
(
x0
,
z
),
_mm_unpackhi_epi8
(
x1
,
z
));
__m128i
x2
=
_mm_unpacklo_epi8
(
x0
,
z
);
x0
=
_mm_sub_epi16
(
_mm_unpacklo_epi8
(
x0
,
z
),
_mm_unpacklo_epi8
(
x1
,
z
));
__m128i
x3
=
_mm_unpacklo_epi8
(
x1
,
z
);
__m128i
x4
=
_mm_unpackhi_epi8
(
x0
,
z
);
x1
=
_mm_mulhi_epi16
(
x0
,
k1
);
__m128i
x5
=
_mm_unpackhi_epi8
(
x1
,
z
);
x0
=
_mm_mullo_epi16
(
x0
,
k1
);
__m128i
x6
=
_mm_unpacklo_epi16
(
x2
,
x3
);
z0
=
_mm_unpacklo_epi16
(
x0
,
x1
);
__m128i
x7
=
_mm_unpacklo_epi16
(
x4
,
x5
);
z1
=
_mm_unpackhi_epi16
(
x0
,
x1
);
__m128i
x8
=
_mm_unpackhi_epi16
(
x2
,
x3
);
__m128i
x9
=
_mm_unpackhi_epi16
(
x4
,
x5
);
y1
=
_mm_mulhi_epi16
(
y0
,
k1
);
x6
=
_mm_madd_epi16
(
x6
,
k0
);
y0
=
_mm_mullo_epi16
(
y0
,
k1
);
x7
=
_mm_madd_epi16
(
x7
,
k0
);
z2
=
_mm_unpacklo_epi16
(
y0
,
y1
);
x8
=
_mm_madd_epi16
(
x8
,
k0
);
z3
=
_mm_unpackhi_epi16
(
y0
,
y1
);
x9
=
_mm_madd_epi16
(
x9
,
k0
);
_mm_store_si128
((
__m128i
*
)(
dst
+
i
),
z0
);
_mm_store_si128
((
__m128i
*
)(
dst
+
i
+
4
),
z1
);
_mm_store_si128
((
__m128i
*
)(
dst
+
i
),
x6
);
_mm_store_si128
((
__m128i
*
)(
dst
+
i
+
8
),
z2
);
_mm_store_si128
((
__m128i
*
)(
dst
+
i
+
4
),
x8
);
_mm_store_si128
((
__m128i
*
)(
dst
+
i
+
12
),
z3
);
_mm_store_si128
((
__m128i
*
)(
dst
+
i
+
8
),
x7
);
_mm_store_si128
((
__m128i
*
)(
dst
+
i
+
12
),
x9
);
}
}
}
}
}
}
else
if
(
_ksize
==
5
)
else
if
(
_ksize
==
5
)
{
{
__m128i
k0
=
_mm_shuffle_epi32
(
_mm_cvtsi32_si128
(
kx
[
0
]),
0
),
__m128i
k0
=
_mm_loadl_epi64
((
__m128i
*
)(
kx
+
1
));
k1
=
_mm_shuffle_epi32
(
_mm_cvtsi32_si128
(
kx
[
1
]),
0
),
k0
=
_mm_unpacklo_epi64
(
k0
,
k0
);
k2
=
_mm_shuffle_epi32
(
_mm_cvtsi32_si128
(
kx
[
2
]),
0
);
k0
=
_mm_packs_epi32
(
k0
,
k0
);
k0
=
_mm_packs_epi32
(
k0
,
k0
);
k1
=
_mm_packs_epi32
(
k1
,
k1
);
k2
=
_mm_packs_epi32
(
k2
,
k2
);
for
(
;
i
<=
width
-
16
;
i
+=
16
,
src
+=
16
)
for
(
;
i
<=
width
-
16
;
i
+=
16
,
src
+=
16
)
{
{
__m128i
x0
,
x1
,
x2
,
y0
,
y1
,
t0
,
t1
,
z0
,
z1
,
z2
,
z3
;
__m128i
x0
=
_mm_loadu_si128
((
__m128i
*
)(
src
+
cn
))
;
x0
=
_mm_loadu_si128
((
__m128i
*
)(
src
+
cn
));
__m128i
x1
=
_mm_loadu_si128
((
__m128i
*
)(
src
-
cn
));
x2
=
_mm_loadu_si128
((
__m128i
*
)(
src
-
cn
));
y0
=
_mm_sub_epi16
(
_mm_unpackhi_epi8
(
x0
,
z
),
_mm_unpackhi_epi8
(
x2
,
z
)
);
__m128i
x2
=
_mm_unpackhi_epi8
(
x0
,
z
);
x0
=
_mm_sub_epi16
(
_mm_unpacklo_epi8
(
x0
,
z
),
_mm_unpacklo_epi8
(
x2
,
z
)
);
__m128i
x3
=
_mm_unpackhi_epi8
(
x1
,
z
);
x0
=
_mm_unpacklo_epi8
(
x0
,
z
);
x
2
=
_mm_mulhi_epi16
(
x0
,
k1
);
x
1
=
_mm_unpacklo_epi8
(
x1
,
z
);
x0
=
_mm_mullo_epi16
(
x0
,
k1
);
__m128i
x5
=
_mm_sub_epi16
(
x2
,
x3
);
z0
=
_mm_unpacklo_epi16
(
x0
,
x2
);
__m128i
x4
=
_mm_sub_epi16
(
x0
,
x1
);
z1
=
_mm_unpackhi_epi16
(
x0
,
x2
);
y1
=
_mm_mulhi_epi16
(
y0
,
k1
);
__m128i
x6
=
_mm_loadu_si128
((
__m128i
*
)(
src
+
cn
*
2
)
);
y0
=
_mm_mullo_epi16
(
y0
,
k1
);
__m128i
x7
=
_mm_loadu_si128
((
__m128i
*
)(
src
-
cn
*
2
)
);
z2
=
_mm_unpacklo_epi16
(
y0
,
y1
);
z3
=
_mm_unpackhi_epi16
(
y0
,
y1
);
__m128i
x8
=
_mm_unpackhi_epi8
(
x6
,
z
);
__m128i
x9
=
_mm_unpackhi_epi8
(
x7
,
z
);
x
0
=
_mm_loadu_si128
((
__m128i
*
)(
src
+
cn
*
2
)
);
x
6
=
_mm_unpacklo_epi8
(
x6
,
z
);
x
1
=
_mm_loadu_si128
((
__m128i
*
)(
src
-
cn
*
2
)
);
x
7
=
_mm_unpacklo_epi8
(
x7
,
z
);
y1
=
_mm_sub_epi16
(
_mm_unpackhi_epi8
(
x0
,
z
),
_mm_unpackhi_epi8
(
x1
,
z
)
);
__m128i
x11
=
_mm_sub_epi16
(
x8
,
x9
);
y0
=
_mm_sub_epi16
(
_mm_unpacklo_epi8
(
x0
,
z
),
_mm_unpacklo_epi8
(
x1
,
z
)
);
__m128i
x10
=
_mm_sub_epi16
(
x6
,
x7
);
t1
=
_mm_mulhi_epi16
(
y0
,
k2
);
__m128i
x13
=
_mm_unpackhi_epi16
(
x5
,
x11
);
t0
=
_mm_mullo_epi16
(
y0
,
k2
);
__m128i
x12
=
_mm_unpackhi_epi16
(
x4
,
x10
);
y0
=
_mm_mullo_epi16
(
y1
,
k2
);
x5
=
_mm_unpacklo_epi16
(
x5
,
x11
);
y1
=
_mm_mulhi_epi16
(
y1
,
k2
);
x4
=
_mm_unpacklo_epi16
(
x4
,
x10
);
z0
=
_mm_add_epi32
(
z0
,
_mm_unpacklo_epi16
(
t0
,
t1
)
);
x5
=
_mm_madd_epi16
(
x5
,
k0
);
z1
=
_mm_add_epi32
(
z1
,
_mm_unpackhi_epi16
(
t0
,
t1
)
);
x4
=
_mm_madd_epi16
(
x4
,
k0
);
z2
=
_mm_add_epi32
(
z2
,
_mm_unpacklo_epi16
(
y0
,
y1
)
);
x13
=
_mm_madd_epi16
(
x13
,
k0
);
z3
=
_mm_add_epi32
(
z3
,
_mm_unpackhi_epi16
(
y0
,
y1
)
);
x12
=
_mm_madd_epi16
(
x12
,
k0
);
_mm_store_si128
((
__m128i
*
)(
dst
+
i
),
z0
);
_mm_store_si128
((
__m128i
*
)(
dst
+
i
),
x4
);
_mm_store_si128
((
__m128i
*
)(
dst
+
i
+
4
),
z1
);
_mm_store_si128
((
__m128i
*
)(
dst
+
i
+
4
),
x12
);
_mm_store_si128
((
__m128i
*
)(
dst
+
i
+
8
),
z2
);
_mm_store_si128
((
__m128i
*
)(
dst
+
i
+
8
),
x5
);
_mm_store_si128
((
__m128i
*
)(
dst
+
i
+
12
),
z
3
);
_mm_store_si128
((
__m128i
*
)(
dst
+
i
+
12
),
x1
3
);
}
}
}
}
}
}
...
@@ -870,19 +845,18 @@ struct SymmRowSmallVec_8u32s
...
@@ -870,19 +845,18 @@ struct SymmRowSmallVec_8u32s
kx
-=
_ksize
/
2
;
kx
-=
_ksize
/
2
;
for
(
;
i
<=
width
-
4
;
i
+=
4
,
src
+=
4
)
for
(
;
i
<=
width
-
4
;
i
+=
4
,
src
+=
4
)
{
{
__m128i
f
,
s0
=
z
,
x0
,
x1
;
__m128i
s0
=
z
;
for
(
k
=
j
=
0
;
k
<
_ksize
;
k
++
,
j
+=
cn
)
for
(
k
=
j
=
0
;
k
<
_ksize
;
k
++
,
j
+=
cn
)
{
{
f
=
_mm_cvtsi32_si128
(
kx
[
k
]);
__m128i
f
=
_mm_cvtsi32_si128
(
kx
[
k
]);
f
=
_mm_shuffle_epi32
(
f
,
0
);
f
=
_mm_shuffle_epi32
(
f
,
0
);
f
=
_mm_packs_epi32
(
f
,
f
);
x0
=
_mm_cvtsi32_si128
(
*
(
const
int
*
)(
src
+
j
));
__m128i
x0
=
_mm_cvtsi32_si128
(
*
(
const
int
*
)(
src
+
j
));
x0
=
_mm_unpacklo_epi8
(
x0
,
z
);
x0
=
_mm_unpacklo_epi8
(
x0
,
z
);
x
1
=
_mm_mulhi_epi16
(
x0
,
f
);
x
0
=
_mm_unpacklo_epi16
(
x0
,
z
);
x0
=
_mm_m
ullo
_epi16
(
x0
,
f
);
x0
=
_mm_m
add
_epi16
(
x0
,
f
);
s0
=
_mm_add_epi32
(
s0
,
_mm_unpacklo_epi16
(
x0
,
x1
)
);
s0
=
_mm_add_epi32
(
s0
,
x0
);
}
}
_mm_store_si128
((
__m128i
*
)(
dst
+
i
),
s0
);
_mm_store_si128
((
__m128i
*
)(
dst
+
i
),
s0
);
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment