submodule / opencv / Commits / 124011c3

Commit 124011c3 authored Dec 12, 2018 by Vitaly Tuzov

Speedup filter2d by loop unrolling

parent 06f32e3b

Showing 1 changed file with 454 additions and 87 deletions: modules/imgproc/src/filter.cpp (+454, -87)
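The commit applies the same manual unrolling pattern to the row and column filter kernels in filter.cpp: the main loop produces several vector registers per iteration with independent accumulators, the first kernel tap (k = 1 or k = 0) is hoisted out of the tap loop, and leftovers are handled by progressively narrower vector steps. The sketch below illustrates the shape of that transformation in isolation; it is not the committed code, the function name and signature are illustrative, and it uses the fixed-width v_float32x4 type rather than the variable-width v_float32 / vx_load aliases that the patch relies on.

// Minimal sketch of the unrolling pattern (assumption: illustrative helper, not part of OpenCV).
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>

// Symmetric column convolution of one output row:
// dst[i] = delta + ky[0]*src[0][i] + sum_{k=1..ksize2} ky[k]*(src[k][i] + src[-k][i]),
// where src points at the middle row pointer so src[-k]..src[k] are valid, as in filter.cpp.
static void symmColumnRow32f(const float** src, const float* ky, int ksize2,
                             float delta, float* dst, int width)
{
    using namespace cv;
    int i = 0, k;
    const v_float32x4 d4 = v_setall_f32(delta);
    const v_float32x4 k0 = v_setall_f32(ky[0]);
    const int step = v_float32x4::nlanes;                // 4 floats per register

    for( ; i <= width - 4*step; i += 4*step )            // unrolled main loop: 16 floats/iteration
    {
        // four independent accumulators let the FMA latencies overlap
        v_float32x4 s0 = v_muladd(v_load(src[0] + i),          k0, d4);
        v_float32x4 s1 = v_muladd(v_load(src[0] + i +   step), k0, d4);
        v_float32x4 s2 = v_muladd(v_load(src[0] + i + 2*step), k0, d4);
        v_float32x4 s3 = v_muladd(v_load(src[0] + i + 3*step), k0, d4);
        for( k = 1; k <= ksize2; k++ )
        {
            v_float32x4 f = v_setall_f32(ky[k]);
            s0 = v_muladd(v_load(src[k] + i)          + v_load(src[-k] + i),          f, s0);
            s1 = v_muladd(v_load(src[k] + i +   step) + v_load(src[-k] + i +   step), f, s1);
            s2 = v_muladd(v_load(src[k] + i + 2*step) + v_load(src[-k] + i + 2*step), f, s2);
            s3 = v_muladd(v_load(src[k] + i + 3*step) + v_load(src[-k] + i + 3*step), f, s3);
        }
        v_store(dst + i,          s0);
        v_store(dst + i +   step, s1);
        v_store(dst + i + 2*step, s2);
        v_store(dst + i + 3*step, s3);
    }
    for( ; i <= width - step; i += step )                // single-register tail
    {
        v_float32x4 s0 = v_muladd(v_load(src[0] + i), k0, d4);
        for( k = 1; k <= ksize2; k++ )
            s0 = v_muladd(v_load(src[k] + i) + v_load(src[-k] + i), v_setall_f32(ky[k]), s0);
        v_store(dst + i, s0);
    }
    for( ; i < width; i++ )                              // scalar remainder
    {
        float s = delta + ky[0]*src[0][i];
        for( k = 1; k <= ksize2; k++ )
            s += ky[k]*(src[k][i] + src[-k][i]);
        dst[i] = s;
    }
}

The changed hunks of modules/imgproc/src/filter.cpp follow; each hunk reproduces the lines shown in the inline diff view.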
@@ -587,6 +587,7 @@ struct RowVec_8u32s
                i += v_uint32::nlanes;
            }
        }
        vx_cleanup();
        return i;
    }
@@ -1083,6 +1084,7 @@ struct SymmRowSmallVec_8u32s
            }
        }
        vx_cleanup();
        return i;
    }
@@ -1106,6 +1108,8 @@ struct SymmColumnVec_32s8u
    int operator()(const uchar** _src, uchar* dst, int width) const
    {
        int _ksize = kernel.rows + kernel.cols - 1;
        if( _ksize == 1 )
            return 0;
        int ksize2 = _ksize/2;
        const float* ky = kernel.ptr<float>() + ksize2;
        int i = 0, k;
@@ -1115,9 +1119,8 @@ struct SymmColumnVec_32s8u
        v_float32 d4 = vx_setall_f32(delta);
        if( symmetrical )
        {
            if( _ksize == 1 )
                return 0;
            v_float32 f0 = vx_setall_f32(ky[0]);
            v_float32 f1 = vx_setall_f32(ky[1]);
            for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
            {
                const int* S = src[0] + i;
@@ -1125,11 +1128,17 @@ struct SymmColumnVec_32s8u
                v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S + v_int32::nlanes)), f0, d4);
                v_float32 s2 = v_muladd(v_cvt_f32(vx_load(S + 2*v_int32::nlanes)), f0, d4);
                v_float32 s3 = v_muladd(v_cvt_f32(vx_load(S + 3*v_int32::nlanes)), f0, d4);
                for( k = 1; k <= ksize2; k++ )
                const int* S0 = src[1] + i;
                const int* S1 = src[-1] + i;
                s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f1, s0);
                s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f1, s1);
                s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2*v_int32::nlanes) + vx_load(S1 + 2*v_int32::nlanes)), f1, s2);
                s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3*v_int32::nlanes) + vx_load(S1 + 3*v_int32::nlanes)), f1, s3);
                for( k = 2; k <= ksize2; k++ )
                {
                    v_float32 f = vx_setall_f32(ky[k]);
                    const int* S0 = src[k] + i;
                    const int* S1 = src[-k] + i;
                    S0 = src[k] + i;
                    S1 = src[-k] + i;
                    s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f, s0);
                    s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f, s1);
                    s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2*v_int32::nlanes) + vx_load(S1 + 2*v_int32::nlanes)), f, s2);
@@ -1142,11 +1151,15 @@ struct SymmColumnVec_32s8u
                const int* S = src[0] + i;
                v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S)), f0, d4);
                v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S + v_int32::nlanes)), f0, d4);
                for( k = 1; k <= ksize2; k++ )
                const int* S0 = src[1] + i;
                const int* S1 = src[-1] + i;
                s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f1, s0);
                s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f1, s1);
                for( k = 2; k <= ksize2; k++ )
                {
                    v_float32 f = vx_setall_f32(ky[k]);
                    const int* S0 = src[k] + i;
                    const int* S1 = src[-k] + i;
                    S0 = src[k] + i;
                    S1 = src[-k] + i;
                    s0 = v_muladd(v_cvt_f32(vx_load(S0) + vx_load(S1)), f, s0);
                    s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) + vx_load(S1 + v_int32::nlanes)), f, s1);
                }
@@ -1160,7 +1173,8 @@ struct SymmColumnVec_32s8u
#endif
            {
                v_float32x4 s0 = v_muladd(v_cvt_f32(v_load(src[0] + i)), v_setall_f32(ky[0]), v_setall_f32(delta));
                for( k = 1; k <= ksize2; k++ )
                s0 = v_muladd(v_cvt_f32(v_load(src[1] + i) + v_load(src[-1] + i)), v_setall_f32(ky[1]), s0);
                for( k = 2; k <= ksize2; k++ )
                    s0 = v_muladd(v_cvt_f32(v_load(src[k] + i) + v_load(src[-k] + i)), v_setall_f32(ky[k]), s0);
                v_int32x4 s32 = v_round(s0);
                v_int16x8 s16 = v_pack(s32, s32);
@@ -1170,17 +1184,20 @@ struct SymmColumnVec_32s8u
        }
        else
        {
            v_float32 f1 = vx_setall_f32(ky[1]);
            for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
            {
                v_float32 s0 = d4;
                v_float32 s1 = d4;
                v_float32 s2 = d4;
                v_float32 s3 = d4;
                for( k = 1; k <= ksize2; k++ )
                const int* S0 = src[1] + i;
                const int* S1 = src[-1] + i;
                v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f1, d4);
                v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f1, d4);
                v_float32 s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2*v_int32::nlanes) - vx_load(S1 + 2*v_int32::nlanes)), f1, d4);
                v_float32 s3 = v_muladd(v_cvt_f32(vx_load(S0 + 3*v_int32::nlanes) - vx_load(S1 + 3*v_int32::nlanes)), f1, d4);
                for( k = 2; k <= ksize2; k++ )
                {
                    v_float32 f = vx_setall_f32(ky[k]);
                    const int* S0 = src[k] + i;
                    const int* S1 = src[-k] + i;
                    S0 = src[k] + i;
                    S1 = src[-k] + i;
                    s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f, s0);
                    s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f, s1);
                    s2 = v_muladd(v_cvt_f32(vx_load(S0 + 2*v_int32::nlanes) - vx_load(S1 + 2*v_int32::nlanes)), f, s2);
@@ -1190,13 +1207,15 @@ struct SymmColumnVec_32s8u
            }
            if( i <= width - v_uint16::nlanes )
            {
                v_float32 s0 = d4;
                v_float32 s1 = d4;
                for( k = 1; k <= ksize2; k++ )
                const int* S0 = src[1] + i;
                const int* S1 = src[-1] + i;
                v_float32 s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f1, d4);
                v_float32 s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f1, d4);
                for( k = 2; k <= ksize2; k++ )
                {
                    v_float32 f = vx_setall_f32(ky[k]);
                    const int* S0 = src[k] + i;
                    const int* S1 = src[-k] + i;
                    S0 = src[k] + i;
                    S1 = src[-k] + i;
                    s0 = v_muladd(v_cvt_f32(vx_load(S0) - vx_load(S1)), f, s0);
                    s1 = v_muladd(v_cvt_f32(vx_load(S0 + v_int32::nlanes) - vx_load(S1 + v_int32::nlanes)), f, s1);
                }
@@ -1209,8 +1228,8 @@ struct SymmColumnVec_32s8u
            if( i <= width - v_int32x4::nlanes )
#endif
            {
                v_float32x4 s0 = v_setall_f32(delta);
                for( k = 1; k <= ksize2; k++ )
                v_float32x4 s0 = v_muladd(v_cvt_f32(v_load(src[1] + i) - v_load(src[-1] + i)), v_setall_f32(ky[1]), v_setall_f32(delta));
                for( k = 2; k <= ksize2; k++ )
                    s0 = v_muladd(v_cvt_f32(v_load(src[k] + i) - v_load(src[-k] + i)), v_setall_f32(ky[k]), s0);
                v_int32x4 s32 = v_round(s0);
                v_int16x8 s16 = v_pack(s32, s32);
@@ -1219,6 +1238,7 @@ struct SymmColumnVec_32s8u
            }
        }
        vx_cleanup();
        return i;
    }
@@ -1250,57 +1270,104 @@ struct SymmColumnSmallVec_32s16s
        short* dst = (short*)_dst;
        v_float32 df4 = vx_setall_f32(delta);
        v_int32 d4 = v_round(df4);
        int d = cvRound(delta);
        v_int16 d8 = vx_setall_s16((short)d);
        if( symmetrical )
        {
            if( ky[0] == 2 && ky[1] == 1 )
            {
                for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
                for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes )
                {
                    v_int32 s0 = vx_load(S1 + i);
                    v_int32 s1 = vx_load(S1 + i + v_int32::nlanes);
                    v_int32 s2 = vx_load(S1 + i + 2*v_int32::nlanes);
                    v_int32 s3 = vx_load(S1 + i + 3*v_int32::nlanes);
                    v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + (s0 + s0), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + (s1 + s1)) + d8);
                    v_store(dst + i + v_int16::nlanes, v_pack(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes) + (s2 + s2), vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes) + (s3 + s3)) + d8);
                }
                if( i <= width - v_int16::nlanes )
                {
                    v_int32 sl = vx_load(S1 + i);
                    v_int32 sh = vx_load(S1 + i + v_int32::nlanes);
                    v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + d4 + (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + d4 + (sh + sh)));
                    v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + (sh + sh)) + d8);
                    i += v_int16::nlanes;
                }
                if( i <= width - v_int32::nlanes )
                {
                    v_int32 s = vx_load(S1 + i);
                    v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 + (s + s));
                    v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + vx_setall_s32(d) + (s + s));
                    i += v_int32::nlanes;
                }
            }
            else if( ky[0] == -2 && ky[1] == 1 )
            {
                for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
                for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes )
                {
                    v_int32 s0 = vx_load(S1 + i);
                    v_int32 s1 = vx_load(S1 + i + v_int32::nlanes);
                    v_int32 s2 = vx_load(S1 + i + 2*v_int32::nlanes);
                    v_int32 s3 = vx_load(S1 + i + 3*v_int32::nlanes);
                    v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) - (s0 + s0), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) - (s1 + s1)) + d8);
                    v_store(dst + i + v_int16::nlanes, v_pack(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes) - (s2 + s2), vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes) - (s3 + s3)) + d8);
                }
                if( i <= width - v_int16::nlanes )
                {
                    v_int32 sl = vx_load(S1 + i);
                    v_int32 sh = vx_load(S1 + i + v_int32::nlanes);
                    v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) + d4 - (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) + d4 - (sh + sh)));
                    v_store(dst + i, v_pack(vx_load(S0 + i) + vx_load(S2 + i) - (sl + sl), vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes) - (sh + sh)) + d8);
                    i += v_int16::nlanes;
                }
                if( i <= width - v_int32::nlanes )
                {
                    v_int32 s = vx_load(S1 + i);
                    v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 - (s + s));
                    v_pack_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + vx_setall_s32(d) - (s + s));
                    i += v_int32::nlanes;
                }
            }
#if CV_NEON
            else if( ky[0] == (float)((int)ky[0]) && ky[1] == (float)((int)ky[1]) )
            {
                v_int32 k0 = vx_setall_s32((int)ky[0]), k1 = vx_setall_s32((int)ky[1]);
                for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
                v_int32 d4 = vx_setall_s32(d);
                for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes )
                {
                    v_store(dst + i, v_pack(v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)), v_muladd(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + v_int32::nlanes), k0, d4))));
                    v_store(dst + i + v_int16::nlanes, v_pack(v_muladd(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + 2*v_int32::nlanes), k0, d4)), v_muladd(vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + 3*v_int32::nlanes), k0, d4))));
                }
                if( i <= width - v_int16::nlanes )
                {
                    v_store(dst + i, v_pack(v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)), v_muladd(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes), k1, v_muladd(vx_load(S1 + i + v_int32::nlanes), k0, d4))));
                    i += v_int16::nlanes;
                }
                if( i <= width - v_int32::nlanes )
                {
                    v_pack_store(dst + i, v_muladd(vx_load(S0 + i) + vx_load(S2 + i), k1, v_muladd(vx_load(S1 + i), k0, d4)));
                    i += v_int32::nlanes;
                }
            }
#endif
            else
            {
                v_float32 k0 = vx_setall_f32(ky[0]), k1 = vx_setall_f32(ky[1]);
                for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
                for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes )
                {
                    v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))), v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + v_int32::nlanes)), k0, df4)))));
                    v_store(dst + i + v_int16::nlanes, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + 2*v_int32::nlanes) + vx_load(S2 + i + 2*v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + 2*v_int32::nlanes)), k0, df4))), v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + 3*v_int32::nlanes) + vx_load(S2 + i + 3*v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + 3*v_int32::nlanes)), k0, df4)))));
                }
                if( i <= width - v_int16::nlanes )
                {
                    v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))), v_round(v_muladd(v_cvt_f32(vx_load(S0 + i + v_int32::nlanes) + vx_load(S2 + i + v_int32::nlanes)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i + v_int32::nlanes)), k0, df4)))));
                    i += v_int16::nlanes;
                }
                if( i <= width - v_int32::nlanes )
                {
                    v_pack_store(dst + i, v_round(v_muladd(v_cvt_f32(vx_load(S0 + i) + vx_load(S2 + i)), k1, v_muladd(v_cvt_f32(vx_load(S1 + i)), k0, df4))));
@@ -1314,20 +1381,38 @@ struct SymmColumnSmallVec_32s16s
        {
            if( ky[1] < 0 )
                std::swap(S0, S2);
            for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
                v_store(dst + i, v_pack(vx_load(S2 + i) - vx_load(S0 + i) + d4, vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes) + d4));
            for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes )
            {
                v_store(dst + i, v_pack(vx_load(S2 + i) - vx_load(S0 + i), vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)) + d8);
                v_store(dst + i + v_int16::nlanes, v_pack(vx_load(S2 + i + 2*v_int32::nlanes) - vx_load(S0 + i + 2*v_int32::nlanes), vx_load(S2 + i + 3*v_int32::nlanes) - vx_load(S0 + i + 3*v_int32::nlanes)) + d8);
            }
            if( i <= width - v_int16::nlanes )
            {
                v_store(dst + i, v_pack(vx_load(S2 + i) - vx_load(S0 + i), vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)) + d8);
                i += v_int16::nlanes;
            }
            if( i <= width - v_int32::nlanes )
            {
                v_pack_store(dst + i, vx_load(S2 + i) - vx_load(S0 + i) + d4);
                v_pack_store(dst + i, vx_load(S2 + i) - vx_load(S0 + i) + vx_setall_s32(d));
                i += v_int32::nlanes;
            }
        }
        else
        {
            v_float32 k1 = vx_setall_f32(ky[1]);
            for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
            for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes )
            {
                v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4)), v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)), k1, df4))));
                v_store(dst + i + v_int16::nlanes, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + 2*v_int32::nlanes) - vx_load(S0 + i + 2*v_int32::nlanes)), k1, df4)), v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + 3*v_int32::nlanes) - vx_load(S0 + i + 3*v_int32::nlanes)), k1, df4))));
            }
            if( i <= width - v_int16::nlanes )
            {
                v_store(dst + i, v_pack(v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4)), v_round(v_muladd(v_cvt_f32(vx_load(S2 + i + v_int32::nlanes) - vx_load(S0 + i + v_int32::nlanes)), k1, df4))));
                i += v_int16::nlanes;
            }
            if( i <= width - v_int32::nlanes )
            {
                v_pack_store(dst + i, v_round(v_muladd(v_cvt_f32(vx_load(S2 + i) - vx_load(S0 + i)), k1, df4)));
@@ -1336,6 +1421,7 @@ struct SymmColumnSmallVec_32s16s
            }
        }
        vx_cleanup();
        return i;
    }
@@ -1362,19 +1448,43 @@ struct RowVec_16s32f
        const float* _kx = kernel.ptr<float>();
        width *= cn;
        for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
        for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes )
        {
            const short* src = (const short*)_src + i;
            v_float32 s0 = vx_setzero_f32();
            v_float32 s1 = vx_setzero_f32();
            v_float32 s2 = vx_setzero_f32();
            v_float32 s3 = vx_setzero_f32();
            for( k = 0; k < _ksize; k++, src += cn )
            {
                v_float32 f = vx_setall_f32(_kx[k]);
                v_int16 xl = vx_load(src);
                v_int16 xh = vx_load(src + v_int16::nlanes);
                s0 = v_muladd(v_cvt_f32(v_expand_low(xl)), f, s0);
                s1 = v_muladd(v_cvt_f32(v_expand_high(xl)), f, s1);
                s2 = v_muladd(v_cvt_f32(v_expand_low(xh)), f, s2);
                s3 = v_muladd(v_cvt_f32(v_expand_high(xh)), f, s3);
            }
            v_store(dst + i, s0);
            v_store(dst + i + v_float32::nlanes, s1);
            v_store(dst + i + 2*v_float32::nlanes, s2);
            v_store(dst + i + 3*v_float32::nlanes, s3);
        }
        if( i <= width - v_int16::nlanes )
        {
            const short* src = (const short*)_src + i;
            v_float32 s0 = vx_setzero_f32();
            v_float32 s1 = vx_setzero_f32();
            for( k = 0; k < _ksize; k++, src += cn )
            {
                v_float32 f = vx_setall_f32(_kx[k]);
                v_int16 x = vx_load(src);
                s0 = v_muladd(v_cvt_f32(v_expand_low(x)), vx_setall_f32(_kx[k]), s0);
                s1 = v_muladd(v_cvt_f32(v_expand_high(x)), vx_setall_f32(_kx[k]), s1);
                s0 = v_muladd(v_cvt_f32(v_expand_low(x)), f, s0);
                s1 = v_muladd(v_cvt_f32(v_expand_high(x)), f, s1);
            }
            v_store(dst + i, s0);
            v_store(dst + i + v_float32::nlanes, s1);
            i += v_int16::nlanes;
        }
        if( i <= width - v_float32::nlanes )
        {
@@ -1385,6 +1495,7 @@ struct RowVec_16s32f
            v_store(dst + i, s0);
            i += v_float32::nlanes;
        }
        vx_cleanup();
        return i;
    }
@@ -1406,6 +1517,8 @@ struct SymmColumnVec_32f16s
    int operator()(const uchar** _src, uchar* _dst, int width) const
    {
        int _ksize = kernel.rows + kernel.cols - 1;
        if( _ksize == 1 )
            return 0;
        int ksize2 = _ksize/2;
        const float* ky = kernel.ptr<float>() + ksize2;
        int i = 0, k;
@@ -1416,25 +1529,49 @@ struct SymmColumnVec_32f16s
        v_float32 d4 = vx_setall_f32(delta);
        if( symmetrical )
        {
            if( _ksize == 1 )
                return 0;
            v_float32 k0 = vx_setall_f32(ky[0]);
            for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
            v_float32 k1 = vx_setall_f32(ky[1]);
            for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes )
            {
                v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4);
                v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4);
                for( k = 1; k <= ksize2; k++ )
                v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*v_float32::nlanes), k0, d4);
                v_float32 s3 = v_muladd(vx_load(src[0] + i + 3*v_float32::nlanes), k0, d4);
                s0 = v_muladd(vx_load(src[1] + i) + vx_load(src[-1] + i), k1, s0);
                s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) + vx_load(src[-1] + i + v_float32::nlanes), k1, s1);
                s2 = v_muladd(vx_load(src[1] + i + 2*v_float32::nlanes) + vx_load(src[-1] + i + 2*v_float32::nlanes), k1, s2);
                s3 = v_muladd(vx_load(src[1] + i + 3*v_float32::nlanes) + vx_load(src[-1] + i + 3*v_float32::nlanes), k1, s3);
                for( k = 2; k <= ksize2; k++ )
                {
                    v_float32 k1 = vx_setall_f32(ky[k]);
                    s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k1, s0);
                    s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k1, s1);
                    v_float32 k2 = vx_setall_f32(ky[k]);
                    s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k2, s0);
                    s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k2, s1);
                    s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes) + vx_load(src[-k] + i + 2*v_float32::nlanes), k2, s2);
                    s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) + vx_load(src[-k] + i + 3*v_float32::nlanes), k2, s3);
                }
                v_store(dst + i, v_pack(v_round(s0), v_round(s1)));
                v_store(dst + i + v_int16::nlanes, v_pack(v_round(s2), v_round(s3)));
            }
            if( i <= width - v_int16::nlanes )
            {
                v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4);
                v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4);
                s0 = v_muladd(vx_load(src[1] + i) + vx_load(src[-1] + i), k1, s0);
                s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) + vx_load(src[-1] + i + v_float32::nlanes), k1, s1);
                for( k = 2; k <= ksize2; k++ )
                {
                    v_float32 k2 = vx_setall_f32(ky[k]);
                    s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k2, s0);
                    s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k2, s1);
                }
                v_store(dst + i, v_pack(v_round(s0), v_round(s1)));
                i += v_int16::nlanes;
            }
            if( i <= width - v_float32::nlanes )
            {
                v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4);
                for( k = 1; k <= ksize2; k++ )
                s0 = v_muladd(vx_load(src[1] + i) + vx_load(src[-1] + i), k1, s0);
                for( k = 2; k <= ksize2; k++ )
                    s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0);
                v_pack_store(dst + i, v_round(s0));
                i += v_float32::nlanes;
@@ -1442,28 +1579,48 @@ struct SymmColumnVec_32f16s
        }
        else
        {
            for( ; i <= width - v_int16::nlanes; i += v_int16::nlanes )
            v_float32 k1 = vx_setall_f32(ky[1]);
            for( ; i <= width - 2*v_int16::nlanes; i += 2*v_int16::nlanes )
            {
                v_float32 s0 = d4;
                v_float32 s1 = d4;
                for( k = 1; k <= ksize2; k++ )
                v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4);
                v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4);
                v_float32 s2 = v_muladd(vx_load(src[1] + i + 2*v_float32::nlanes) - vx_load(src[-1] + i + 2*v_float32::nlanes), k1, d4);
                v_float32 s3 = v_muladd(vx_load(src[1] + i + 3*v_float32::nlanes) - vx_load(src[-1] + i + 3*v_float32::nlanes), k1, d4);
                for( k = 2; k <= ksize2; k++ )
                {
                    v_float32 k1 = vx_setall_f32(ky[k]);
                    s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k1, s0);
                    s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k1, s1);
                    v_float32 k2 = vx_setall_f32(ky[k]);
                    s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0);
                    s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1);
                    s0 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes) - vx_load(src[-k] + i + 2*v_float32::nlanes), k2, s2);
                    s1 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) - vx_load(src[-k] + i + 3*v_float32::nlanes), k2, s3);
                }
                v_store(dst + i, v_pack(v_round(s0), v_round(s1)));
                v_store(dst + i + v_int16::nlanes, v_pack(v_round(s2), v_round(s3)));
            }
            if( i <= width - v_int16::nlanes )
            {
                v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4);
                v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4);
                for( k = 2; k <= ksize2; k++ )
                {
                    v_float32 k2 = vx_setall_f32(ky[k]);
                    s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0);
                    s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1);
                }
                v_store(dst + i, v_pack(v_round(s0), v_round(s1)));
                i += v_int16::nlanes;
            }
            if( i <= width - v_float32::nlanes )
            {
                v_float32 s0 = d4;
                for( k = 1; k <= ksize2; k++ )
                v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4);
                for( k = 2; k <= ksize2; k++ )
                    s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0);
                v_pack_store(dst + i, v_round(s0));
                i += v_float32::nlanes;
            }
        }
        vx_cleanup();
        return i;
    }
@@ -1505,6 +1662,7 @@ struct RowVec_32f
        }
#endif
        int _ksize = kernel.rows + kernel.cols - 1;
        CV_DbgAssert(_ksize > 0);
        const float* src0 = (const float*)_src;
        float* dst = (float*)_dst;
        const float* _kx = kernel.ptr<float>();
@@ -1516,14 +1674,55 @@ struct RowVec_32f
        if( haveAVX2 )
            return RowVec_32f_AVX(src0, _kx, dst, width, cn, _ksize);
#endif
        for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
        v_float32 k0 = vx_setall_f32(_kx[0]);
        for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes )
        {
            const float* src = src0 + i;
            v_float32 s0 = vx_setzero_f32();
            for( k = 0; k < _ksize; k++, src += cn )
            v_float32 s0 = vx_load(src) * k0;
            v_float32 s1 = vx_load(src + v_float32::nlanes) * k0;
            v_float32 s2 = vx_load(src + 2*v_float32::nlanes) * k0;
            v_float32 s3 = vx_load(src + 3*v_float32::nlanes) * k0;
            src += cn;
            for( k = 1; k < _ksize; k++, src += cn )
            {
                v_float32 k1 = vx_setall_f32(_kx[k]);
                s0 = v_muladd(vx_load(src), k1, s0);
                s1 = v_muladd(vx_load(src + v_float32::nlanes), k1, s1);
                s2 = v_muladd(vx_load(src + 2*v_float32::nlanes), k1, s2);
                s3 = v_muladd(vx_load(src + 3*v_float32::nlanes), k1, s3);
            }
            v_store(dst + i, s0);
            v_store(dst + i + v_float32::nlanes, s1);
            v_store(dst + i + 2*v_float32::nlanes, s2);
            v_store(dst + i + 3*v_float32::nlanes, s3);
        }
        if( i <= width - 2*v_float32::nlanes )
        {
            const float* src = src0 + i;
            v_float32 s0 = vx_load(src) * k0;
            v_float32 s1 = vx_load(src + v_float32::nlanes) * k0;
            src += cn;
            for( k = 1; k < _ksize; k++, src += cn )
            {
                v_float32 k1 = vx_setall_f32(_kx[k]);
                s0 = v_muladd(vx_load(src), k1, s0);
                s1 = v_muladd(vx_load(src + v_float32::nlanes), k1, s1);
            }
            v_store(dst + i, s0);
            v_store(dst + i + v_float32::nlanes, s1);
            i += 2*v_float32::nlanes;
        }
        if( i <= width - v_float32::nlanes )
        {
            const float* src = src0 + i;
            v_float32 s0 = vx_load(src) * k0;
            src += cn;
            for( k = 1; k < _ksize; k++, src += cn )
                s0 = v_muladd(vx_load(src), vx_setall_f32(_kx[k]), s0);
            v_store(dst + i, s0);
            i += v_float32::nlanes;
        }
        vx_cleanup();
        return i;
    }
@@ -1584,6 +1783,8 @@ struct SymmRowSmallVec_32f
    int operator()(const uchar* _src, uchar* _dst, int width, int cn) const
    {
        int i = 0, _ksize = kernel.rows + kernel.cols - 1;
        if( _ksize == 1 )
            return 0;
        float* dst = (float*)_dst;
        const float* src = (const float*)_src + (_ksize/2)*cn;
        bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0;
@@ -1592,15 +1793,28 @@ struct SymmRowSmallVec_32f
        if( symmetrical )
        {
            if( _ksize == 1 )
                return 0;
            if( _ksize == 3 )
            {
                if( fabs(kx[0]) == 2 && kx[1] == 1 )
                {
#if CV_FMA3 || CV_AVX2
                    v_float32 k0 = vx_setall_f32(kx[0]);
                    for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
                        v_store(dst + i, v_muladd(vx_load(src), k0, vx_load(src - cn) + vx_load(src + cn)));
#else
                    if( kx[0] > 0 )
                        for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
                        {
                            v_float32 x = vx_load(src);
                            v_store(dst + i, vx_load(src - cn) + vx_load(src + cn) + (x + x));
                        }
                    else
                        for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
                        {
                            v_float32 x = vx_load(src);
                            v_store(dst + i, vx_load(src - cn) + vx_load(src + cn) - (x + x));
                        }
#endif
                }
                else
                {
@@ -1613,9 +1827,17 @@ struct SymmRowSmallVec_32f
            {
                if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 )
                {
#if CV_FMA3 || CV_AVX2
                    v_float32 k0 = vx_setall_f32(-2);
                    for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
                        v_store(dst + i, v_muladd(vx_load(src), k0, vx_load(src - 2*cn) + vx_load(src + 2*cn)));
#else
                    for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes, src += v_float32::nlanes )
                    {
                        v_float32 x = vx_load(src);
                        v_store(dst + i, vx_load(src - 2*cn) + vx_load(src + 2*cn) - (x + x));
                    }
#endif
                }
                else
                {
@@ -1647,6 +1869,7 @@ struct SymmRowSmallVec_32f
            }
        }
        vx_cleanup();
        return i;
    }
@@ -1688,12 +1911,47 @@ struct SymmColumnVec_32f
                return SymmColumnVec_32f_Symm_AVX(src, ky, dst, delta, width, ksize2);
#endif
            const v_float32 d4 = vx_setall_f32(delta);
            for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
            const v_float32 k0 = vx_setall_f32(ky[0]);
            for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes )
            {
                v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4);
                v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4);
                v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*v_float32::nlanes), k0, d4);
                v_float32 s3 = v_muladd(vx_load(src[0] + i + 3*v_float32::nlanes), k0, d4);
                for( k = 1; k <= ksize2; k++ )
                {
                    v_float32 k1 = vx_setall_f32(ky[k]);
                    s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k1, s0);
                    s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k1, s1);
                    s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes) + vx_load(src[-k] + i + 2*v_float32::nlanes), k1, s2);
                    s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) + vx_load(src[-k] + i + 3*v_float32::nlanes), k1, s3);
                }
                v_store(dst + i, s0);
                v_store(dst + i + v_float32::nlanes, s1);
                v_store(dst + i + 2*v_float32::nlanes, s2);
                v_store(dst + i + 3*v_float32::nlanes, s3);
            }
            if( i <= width - 2*v_float32::nlanes )
            {
                v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4);
                v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), k0, d4);
                for( k = 1; k <= ksize2; k++ )
                {
                    v_float32 k1 = vx_setall_f32(ky[k]);
                    s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), k1, s0);
                    s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) + vx_load(src[-k] + i + v_float32::nlanes), k1, s1);
                }
                v_store(dst + i, s0);
                v_store(dst + i + v_float32::nlanes, s1);
                i += 2*v_float32::nlanes;
            }
            if( i <= width - v_float32::nlanes )
            {
                v_float32 s0 = v_muladd(vx_load(src[0] + i), vx_setall_f32(ky[0]), d4);
                v_float32 s0 = v_muladd(vx_load(src[0] + i), k0, d4);
                for( k = 1; k <= ksize2; k++ )
                    s0 = v_muladd(vx_load(src[k] + i) + vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0);
                v_store(dst + i, s0);
                i += v_float32::nlanes;
            }
        }
        else
@@ -1702,16 +1960,53 @@ struct SymmColumnVec_32f
            if( haveAVX2 )
                return SymmColumnVec_32f_Unsymm_AVX(src, ky, dst, delta, width, ksize2);
#endif
            CV_DbgAssert(ksize2 > 0);
            const v_float32 d4 = vx_setall_f32(delta);
            for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
            const v_float32 k1 = vx_setall_f32(ky[1]);
            for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes )
            {
                v_float32 s0 = d4;
                for( k = 1; k <= ksize2; k++ )
                v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4);
                v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4);
                v_float32 s2 = v_muladd(vx_load(src[1] + i + 2*v_float32::nlanes) - vx_load(src[-1] + i + 2*v_float32::nlanes), k1, d4);
                v_float32 s3 = v_muladd(vx_load(src[1] + i + 3*v_float32::nlanes) - vx_load(src[-1] + i + 3*v_float32::nlanes), k1, d4);
                for( k = 2; k <= ksize2; k++ )
                {
                    v_float32 k2 = vx_setall_f32(ky[k]);
                    s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0);
                    s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1);
                    s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes) - vx_load(src[-k] + i + 2*v_float32::nlanes), k2, s2);
                    s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes) - vx_load(src[-k] + i + 3*v_float32::nlanes), k2, s3);
                }
                v_store(dst + i, s0);
                v_store(dst + i + v_float32::nlanes, s1);
                v_store(dst + i + 2*v_float32::nlanes, s2);
                v_store(dst + i + 3*v_float32::nlanes, s3);
            }
            if( i <= width - 2*v_float32::nlanes )
            {
                v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4);
                v_float32 s1 = v_muladd(vx_load(src[1] + i + v_float32::nlanes) - vx_load(src[-1] + i + v_float32::nlanes), k1, d4);
                for( k = 2; k <= ksize2; k++ )
                {
                    v_float32 k2 = vx_setall_f32(ky[k]);
                    s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), k2, s0);
                    s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes) - vx_load(src[-k] + i + v_float32::nlanes), k2, s1);
                }
                v_store(dst + i, s0);
                v_store(dst + i + v_float32::nlanes, s1);
                i += 2*v_float32::nlanes;
            }
            if( i <= width - v_float32::nlanes )
            {
                v_float32 s0 = v_muladd(vx_load(src[1] + i) - vx_load(src[-1] + i), k1, d4);
                for( k = 2; k <= ksize2; k++ )
                    s0 = v_muladd(vx_load(src[k] + i) - vx_load(src[-k] + i), vx_setall_f32(ky[k]), s0);
                v_store(dst + i, s0);
                i += v_float32::nlanes;
            }
        }
        vx_cleanup();
        return i;
    }
@@ -1748,9 +2043,24 @@ struct SymmColumnSmallVec_32f
        {
            if( fabs(ky[0]) == 2 && ky[1] == 1 )
            {
#if CV_FMA3 || CV_AVX2
                v_float32 k0 = vx_setall_f32(ky[0]);
                for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
                for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
                    v_store(dst + i, v_muladd(vx_load(S1 + i), k0, vx_load(S0 + i) + vx_load(S2 + i) + d4));
#else
                if( ky[0] > 0 )
                    for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
                    {
                        v_float32 x = vx_load(S1 + i);
                        v_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 + (x + x));
                    }
                else
                    for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
                    {
                        v_float32 x = vx_load(S1 + i);
                        v_store(dst + i, vx_load(S0 + i) + vx_load(S2 + i) + d4 - (x + x));
                    }
#endif
            }
            else
            {
@@ -1776,6 +2086,7 @@ struct SymmColumnSmallVec_32f
            }
        }
        vx_cleanup();
        return i;
    }
@@ -1804,19 +2115,27 @@ struct FilterVec_8u
    int operator()(const uchar** src, uchar* dst, int width) const
    {
        CV_DbgAssert(_nz > 0);
        const float* kf = (const float*)&coeffs[0];
        int i = 0, k, nz = _nz;
        v_float32 d4 = vx_setall_f32(delta);
        v_float32 f0 = vx_setall_f32(kf[0]);
        for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
        {
            v_float32 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
            for( k = 0; k < nz; k++ )
            v_uint16 xl, xh;
            v_expand(vx_load(src[0] + i), xl, xh);
            v_uint32 x0, x1, x2, x3;
            v_expand(xl, x0, x1);
            v_expand(xh, x2, x3);
            v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f0, d4);
            v_float32 s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x1)), f0, d4);
            v_float32 s2 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x2)), f0, d4);
            v_float32 s3 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x3)), f0, d4);
            for( k = 1; k < nz; k++ )
            {
                v_float32 f = vx_setall_f32(kf[k]);
                v_uint16 xl, xh;
                v_expand(vx_load(src[k] + i), xl, xh);
                v_uint32 x0, x1, x2, x3;
                v_expand(xl, x0, x1);
                v_expand(xh, x2, x3);
                s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f, s0);
@@ -1828,11 +2147,13 @@ struct FilterVec_8u
        }
        if( i <= width - v_uint16::nlanes )
        {
            v_float32 s0 = d4, s1 = d4;
            for( k = 0; k < nz; k++ )
            v_uint32 x0, x1;
            v_expand(vx_load_expand(src[0] + i), x0, x1);
            v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f0, d4);
            v_float32 s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x1)), f0, d4);
            for( k = 1; k < nz; k++ )
            {
                v_float32 f = vx_setall_f32(kf[k]);
                v_uint32 x0, x1;
                v_expand(vx_load_expand(src[k] + i), x0, x1);
                s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x0)), f, s0);
                s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(x1)), f, s1);
@@ -1846,8 +2167,8 @@ struct FilterVec_8u
        if( i <= width - v_int32x4::nlanes )
#endif
        {
            v_float32x4 s0 = v_setall_f32(delta);
            for( k = 0; k < nz; k++ )
            v_float32x4 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(src[0] + i))), v_setall_f32(kf[0]), v_setall_f32(delta));
            for( k = 1; k < nz; k++ )
                s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_load_expand_q(src[k] + i))), v_setall_f32(kf[k]), s0);
            v_int32x4 s32 = v_round(s0);
            v_int16x8 s16 = v_pack(s32, s32);
@@ -1855,6 +2176,7 @@ struct FilterVec_8u
            i += v_int32x4::nlanes;
        }
        vx_cleanup();
        return i;
    }
@@ -1879,18 +2201,24 @@ struct FilterVec_8u16s
    int operator()(const uchar** src, uchar* _dst, int width) const
    {
        CV_DbgAssert(_nz > 0);
        const float* kf = (const float*)&coeffs[0];
        short* dst = (short*)_dst;
        int i = 0, k, nz = _nz;
        v_float32 d4 = vx_setall_f32(delta);
        v_float32 f0 = vx_setall_f32(kf[0]);
        for( ; i <= width - v_uint8::nlanes; i += v_uint8::nlanes )
        {
            v_float32 s0 = d4, s1 = d4, s2 = d4, s3 = d4;
            for( k = 0; k < nz; k++ )
            v_uint16 xl, xh;
            v_expand(vx_load(src[0] + i), xl, xh);
            v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(xl))), f0, d4);
            v_float32 s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(xl))), f0, d4);
            v_float32 s2 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(xh))), f0, d4);
            v_float32 s3 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(xh))), f0, d4);
            for( k = 1; k < nz; k++ )
            {
                v_float32 f = vx_setall_f32(kf[k]);
                v_uint16 xl, xh;
                v_expand(vx_load(src[k] + i), xl, xh);
                s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(xl))), f, s0);
                s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(xl))), f, s1);
@@ -1902,11 +2230,13 @@ struct FilterVec_8u16s
        }
        if( i <= width - v_uint16::nlanes )
        {
            v_float32 s0 = d4, s1 = d4;
            for( k = 0; k < nz; k++ )
            v_uint16 x = vx_load_expand(src[0] + i);
            v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(x))), f0, d4);
            v_float32 s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(x))), f0, d4);
            for( k = 1; k < nz; k++ )
            {
                v_float32 f = vx_setall_f32(kf[k]);
                v_uint16 x = vx_load_expand(src[k] + i);
                x = vx_load_expand(src[k] + i);
                s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_low(x))), f, s0);
                s1 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(v_expand_high(x))), f, s1);
            }
@@ -1915,13 +2245,14 @@ struct FilterVec_8u16s
        }
        if( i <= width - v_int32::nlanes )
        {
            v_float32 s0 = d4;
            for( k = 0; k < nz; k++ )
            v_float32 s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src[0] + i))), f0, d4);
            for( k = 1; k < nz; k++ )
                s0 = v_muladd(v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(src[k] + i))), vx_setall_f32(kf[k]), s0);
            v_pack_store(dst + i, v_round(s0));
            i += v_int32::nlanes;
        }
        vx_cleanup();
        return i;
    }
@@ -1950,14 +2281,50 @@ struct FilterVec_32f
        int i = 0, k, nz = _nz;
        v_float32 d4 = vx_setall_f32(delta);
        for( ; i <= width - v_float32::nlanes; i += v_float32::nlanes )
        v_float32 f0 = vx_setall_f32(kf[0]);
        for( ; i <= width - 4*v_float32::nlanes; i += 4*v_float32::nlanes )
        {
            v_float32 s0 = d4;
            for( k = 0; k < nz; k++ )
            v_float32 s0 = v_muladd(vx_load(src[0] + i), f0, d4);
            v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), f0, d4);
            v_float32 s2 = v_muladd(vx_load(src[0] + i + 2*v_float32::nlanes), f0, d4);
            v_float32 s3 = v_muladd(vx_load(src[0] + i + 3*v_float32::nlanes), f0, d4);
            for( k = 1; k < nz; k++ )
            {
                v_float32 f1 = vx_setall_f32(kf[k]);
                s0 = v_muladd(vx_load(src[k] + i), f1, s0);
                s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes), f1, s1);
                s2 = v_muladd(vx_load(src[k] + i + 2*v_float32::nlanes), f1, s2);
                s3 = v_muladd(vx_load(src[k] + i + 3*v_float32::nlanes), f1, s3);
            }
            v_store(dst + i, s0);
            v_store(dst + i + v_float32::nlanes, s1);
            v_store(dst + i + 2*v_float32::nlanes, s2);
            v_store(dst + i + 3*v_float32::nlanes, s3);
        }
        if( i <= width - 2*v_float32::nlanes )
        {
            v_float32 s0 = v_muladd(vx_load(src[0] + i), f0, d4);
            v_float32 s1 = v_muladd(vx_load(src[0] + i + v_float32::nlanes), f0, d4);
            for( k = 1; k < nz; k++ )
            {
                v_float32 f1 = vx_setall_f32(kf[k]);
                s0 = v_muladd(vx_load(src[k] + i), f1, s0);
                s1 = v_muladd(vx_load(src[k] + i + v_float32::nlanes), f1, s1);
            }
            v_store(dst + i, s0);
            v_store(dst + i + v_float32::nlanes, s1);
            i += 2*v_float32::nlanes;
        }
        if( i <= width - v_float32::nlanes )
        {
            v_float32 s0 = v_muladd(vx_load(src[0] + i), f0, d4);
            for( k = 1; k < nz; k++ )
                s0 = v_muladd(vx_load(src[k] + i), vx_setall_f32(kf[k]), s0);
            v_store(dst + i, s0);
            i += v_float32::nlanes;
        }
        vx_cleanup();
        return i;
    }