Commit 43d92560 authored Sep 05, 2018 by Vitaly Tuzov
Replaced core module calls to universal intrinsics with wide universal intrinsics
parent 803ff64b
Showing 4 changed files with 204 additions and 216 deletions:

    modules/core/src/copy.cpp       +16  -14
    modules/core/src/lapack.cpp     +47  -42
    modules/core/src/mathfuncs.cpp  +62  -59
    modules/core/src/matmul.cpp     +79  -101
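Across all four files the change follows one pattern: fixed 128-bit universal intrinsic types (v_uint8x16, v_float32x4, v_float64x2, ...) become width-agnostic "wide" types (v_uint8, v_float32, v_float64), the v_load/v_setall_*/v_setzero_* initializers become their vx_-prefixed counterparts, hard-coded lane counts (16, 8, 4, 2) become ::nlanes constants, runtime hasSIMD128() guards become compile-time #if CV_SIMD blocks, and a vx_cleanup() call is appended after each SIMD section. With wide intrinsics the lane count is fixed at compile time by the widest enabled instruction set (for example 4 float lanes on SSE/NEON, 8 on AVX2, 16 on AVX-512), so the same source vectorizes to wider registers without edits. A minimal before/after sketch of the loop shape (a hypothetical add_f32 helper, not code from this commit; the vx_cleanup() comment reflects its usual role of clearing wide-register state, an assumption rather than a statement from the commit):

#include "opencv2/core/hal/intrin.hpp"

static void add_f32(const float* a, const float* b, float* dst, int len)
{
    int i = 0;
#if CV_SIMD
    // v_float32::nlanes is 4, 8, or 16 depending on the compile-time target
    for (; i <= len - v_float32::nlanes; i += v_float32::nlanes)
        v_store(dst + i, vx_load(a + i) + vx_load(b + i));
    vx_cleanup();  // assumed: resets wide-register state (e.g. vzeroupper on AVX)
#endif
    for (; i < len; i++)  // scalar tail for the remainder
        dst[i] = a[i] + b[i];
}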
modules/core/src/copy.cpp
...
...
@@ -90,20 +90,21 @@ copyMask_<uchar>(const uchar* _src, size_t sstep, const uchar* mask, size_t mste
     const uchar* src = (const uchar*)_src;
     uchar* dst = (uchar*)_dst;
     int x = 0;
-#if CV_SIMD128
+#if CV_SIMD
     {
-        v_uint8x16 v_zero = v_setzero_u8();
+        v_uint8 v_zero = vx_setzero_u8();

-        for( ; x <= size.width - 16; x += 16 )
+        for( ; x <= size.width - v_uint8::nlanes; x += v_uint8::nlanes )
         {
-            v_uint8x16 v_src = v_load(src + x),
-                       v_dst = v_load(dst + x),
-                       v_nmask = v_load(mask + x) == v_zero;
+            v_uint8 v_src = vx_load(src + x),
+                    v_dst = vx_load(dst + x),
+                    v_nmask = vx_load(mask + x) == v_zero;

             v_dst = v_select(v_nmask, v_dst, v_src);
             v_store(dst + x, v_dst);
         }
     }
+    vx_cleanup();
 #endif
     for( ; x < size.width; x++ )
         if( mask[x] )
...
...
@@ -121,25 +122,26 @@ copyMask_<ushort>(const uchar* _src, size_t sstep, const uchar* mask, size_t mst
     const ushort* src = (const ushort*)_src;
     ushort* dst = (ushort*)_dst;
     int x = 0;
-#if CV_SIMD128
+#if CV_SIMD
     {
-        v_uint8x16 v_zero = v_setzero_u8();
+        v_uint8 v_zero = vx_setzero_u8();

-        for( ; x <= size.width - 16; x += 16 )
+        for( ; x <= size.width - v_uint8::nlanes; x += v_uint8::nlanes )
         {
-            v_uint16x8 v_src1 = v_load(src + x), v_src2 = v_load(src + x + 8),
-                       v_dst1 = v_load(dst + x), v_dst2 = v_load(dst + x + 8);
+            v_uint16 v_src1 = vx_load(src + x), v_src2 = vx_load(src + x + v_uint16::nlanes),
+                     v_dst1 = vx_load(dst + x), v_dst2 = vx_load(dst + x + v_uint16::nlanes);

-            v_uint8x16 v_nmask1, v_nmask2;
-            v_uint8x16 v_nmask = v_load(mask + x) == v_zero;
+            v_uint8 v_nmask1, v_nmask2;
+            v_uint8 v_nmask = vx_load(mask + x) == v_zero;

             v_zip(v_nmask, v_nmask, v_nmask1, v_nmask2);
             v_dst1 = v_select(v_reinterpret_as_u16(v_nmask1), v_dst1, v_src1);
             v_dst2 = v_select(v_reinterpret_as_u16(v_nmask2), v_dst2, v_src2);
             v_store(dst + x, v_dst1);
-            v_store(dst + x + 8, v_dst2);
+            v_store(dst + x + v_uint16::nlanes, v_dst2);
         }
     }
+    vx_cleanup();
 #endif
     for( ; x < size.width; x++ )
         if( mask[x] )
...
...
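The copyMask_ hunks above share one idiom: build a lane mask from the uchar mask array (v_nmask is all-ones exactly where mask is zero) and blend with v_select, which keeps the destination lane where the mask is unset and takes the source lane otherwise. A standalone sketch of that idiom with wide intrinsics (a hypothetical copy_if_mask helper, not the commit's exact code):

#include "opencv2/core/hal/intrin.hpp"

static void copy_if_mask(const uchar* src, const uchar* mask, uchar* dst, int width)
{
    int x = 0;
#if CV_SIMD
    v_uint8 v_zero = vx_setzero_u8();
    for (; x <= width - v_uint8::nlanes; x += v_uint8::nlanes)
    {
        v_uint8 v_nmask = vx_load(mask + x) == v_zero;        // all-ones where mask is zero
        v_store(dst + x, v_select(v_nmask, vx_load(dst + x),  // keep dst where masked out
                                  vx_load(src + x)));         // else copy src
    }
    vx_cleanup();
#endif
    for (; x < width; x++)  // scalar tail
        if (mask[x])
            dst[x] = src[x];
}

For wider element types, as in the ushort hunk, one uchar mask lane covers two ushort lanes, hence the v_zip duplication and the v_reinterpret_as_u16 before each blend.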
modules/core/src/lapack.cpp
...
...
@@ -277,40 +277,42 @@ template<typename T> struct VBLAS
     int givensx(T*, T*, int, T, T, T*, T*) const { return 0; }
 };

-#if CV_SIMD128
+#if CV_SIMD
 template<> inline int VBLAS<float>::dot(const float* a, const float* b, int n, float* result) const
 {
-    if( n < 8 )
+    if( n < 2*v_float32::nlanes )
         return 0;
     int k = 0;
-    v_float32x4 s0 = v_setzero_f32();
-    for( ; k <= n - v_float32x4::nlanes; k += v_float32x4::nlanes )
+    v_float32 s0 = vx_setzero_f32();
+    for( ; k <= n - v_float32::nlanes; k += v_float32::nlanes )
     {
-        v_float32x4 a0 = v_load(a + k);
-        v_float32x4 b0 = v_load(b + k);
+        v_float32 a0 = vx_load(a + k);
+        v_float32 b0 = vx_load(b + k);

         s0 += a0 * b0;
     }
     *result = v_reduce_sum(s0);
+    vx_cleanup();
     return k;
 }

 template<> inline int VBLAS<float>::givens(float* a, float* b, int n, float c, float s) const
 {
-    if( n < 4 )
+    if( n < v_float32::nlanes )
         return 0;
     int k = 0;
-    v_float32x4 c4 = v_setall_f32(c), s4 = v_setall_f32(s);
-    for( ; k <= n - v_float32x4::nlanes; k += v_float32x4::nlanes )
+    v_float32 c4 = vx_setall_f32(c), s4 = vx_setall_f32(s);
+    for( ; k <= n - v_float32::nlanes; k += v_float32::nlanes )
     {
-        v_float32x4 a0 = v_load(a + k);
-        v_float32x4 b0 = v_load(b + k);
-        v_float32x4 t0 = (a0 * c4) + (b0 * s4);
-        v_float32x4 t1 = (b0 * c4) - (a0 * s4);
+        v_float32 a0 = vx_load(a + k);
+        v_float32 b0 = vx_load(b + k);
+        v_float32 t0 = (a0 * c4) + (b0 * s4);
+        v_float32 t1 = (b0 * c4) - (a0 * s4);
         v_store(a + k, t0);
         v_store(b + k, t1);
     }
+    vx_cleanup();
     return k;
 }
...
...
@@ -318,17 +320,17 @@ template<> inline int VBLAS<float>::givens(float* a, float* b, int n, float c, f
 template<> inline int VBLAS<float>::givensx(float* a, float* b, int n, float c, float s, float* anorm, float* bnorm) const
 {
-    if( n < 4 )
+    if( n < v_float32::nlanes )
         return 0;
     int k = 0;
-    v_float32x4 c4 = v_setall_f32(c), s4 = v_setall_f32(s);
-    v_float32x4 sa = v_setzero_f32(), sb = v_setzero_f32();
-    for( ; k <= n - v_float32x4::nlanes; k += v_float32x4::nlanes )
+    v_float32 c4 = vx_setall_f32(c), s4 = vx_setall_f32(s);
+    v_float32 sa = vx_setzero_f32(), sb = vx_setzero_f32();
+    for( ; k <= n - v_float32::nlanes; k += v_float32::nlanes )
     {
-        v_float32x4 a0 = v_load(a + k);
-        v_float32x4 b0 = v_load(b + k);
-        v_float32x4 t0 = (a0 * c4) + (b0 * s4);
-        v_float32x4 t1 = (b0 * c4) - (a0 * s4);
+        v_float32 a0 = vx_load(a + k);
+        v_float32 b0 = vx_load(b + k);
+        v_float32 t0 = (a0 * c4) + (b0 * s4);
+        v_float32 t1 = (b0 * c4) - (a0 * s4);
         v_store(a + k, t0);
         v_store(b + k, t1);
         sa += t0 + t0;
...
...
@@ -336,26 +338,28 @@ template<> inline int VBLAS<float>::givensx(float* a, float* b, int n, float c,
     }
     *anorm = v_reduce_sum(sa);
     *bnorm = v_reduce_sum(sb);
+    vx_cleanup();
     return k;
 }

-#if CV_SIMD128_64F
+#if CV_SIMD_64F
 template<> inline int VBLAS<double>::dot(const double* a, const double* b, int n, double* result) const
 {
-    if( n < 4 )
+    if( n < 2*v_float64::nlanes )
         return 0;
     int k = 0;
-    v_float64x2 s0 = v_setzero_f64();
-    for( ; k <= n - v_float64x2::nlanes; k += v_float64x2::nlanes )
+    v_float64 s0 = vx_setzero_f64();
+    for( ; k <= n - v_float64::nlanes; k += v_float64::nlanes )
     {
-        v_float64x2 a0 = v_load(a + k);
-        v_float64x2 b0 = v_load(b + k);
+        v_float64 a0 = vx_load(a + k);
+        v_float64 b0 = vx_load(b + k);

         s0 += a0 * b0;
     }
     double sbuf[2];
     v_store(sbuf, s0);
     *result = sbuf[0] + sbuf[1];
+    vx_cleanup();
     return k;
 }
...
...
@@ -363,16 +367,17 @@ template<> inline int VBLAS<double>::dot(const double* a, const double* b, int n
 template<> inline int VBLAS<double>::givens(double* a, double* b, int n, double c, double s) const
 {
     int k = 0;
-    v_float64x2 c2 = v_setall_f64(c), s2 = v_setall_f64(s);
-    for( ; k <= n - v_float64x2::nlanes; k += v_float64x2::nlanes )
+    v_float64 c2 = vx_setall_f64(c), s2 = vx_setall_f64(s);
+    for( ; k <= n - v_float64::nlanes; k += v_float64::nlanes )
     {
-        v_float64x2 a0 = v_load(a + k);
-        v_float64x2 b0 = v_load(b + k);
-        v_float64x2 t0 = (a0 * c2) + (b0 * s2);
-        v_float64x2 t1 = (b0 * c2) - (a0 * s2);
+        v_float64 a0 = vx_load(a + k);
+        v_float64 b0 = vx_load(b + k);
+        v_float64 t0 = (a0 * c2) + (b0 * s2);
+        v_float64 t1 = (b0 * c2) - (a0 * s2);
         v_store(a + k, t0);
         v_store(b + k, t1);
     }
+    vx_cleanup();
     return k;
 }
...
...
@@ -381,14 +386,14 @@ template<> inline int VBLAS<double>::givensx(double* a, double* b, int n, double
                                              double* anorm, double* bnorm) const
 {
     int k = 0;
-    v_float64x2 c2 = v_setall_f64(c), s2 = v_setall_f64(s);
-    v_float64x2 sa = v_setzero_f64(), sb = v_setzero_f64();
-    for( ; k <= n - v_float64x2::nlanes; k += v_float64x2::nlanes )
+    v_float64 c2 = vx_setall_f64(c), s2 = vx_setall_f64(s);
+    v_float64 sa = vx_setzero_f64(), sb = vx_setzero_f64();
+    for( ; k <= n - v_float64::nlanes; k += v_float64::nlanes )
     {
-        v_float64x2 a0 = v_load(a + k);
-        v_float64x2 b0 = v_load(b + k);
-        v_float64x2 t0 = (a0 * c2) + (b0 * s2);
-        v_float64x2 t1 = (b0 * c2) - (a0 * s2);
+        v_float64 a0 = vx_load(a + k);
+        v_float64 b0 = vx_load(b + k);
+        v_float64 t0 = (a0 * c2) + (b0 * s2);
+        v_float64 t1 = (b0 * c2) - (a0 * s2);
         v_store(a + k, t0);
         v_store(b + k, t1);
         sa += t0 * t0;
...
...
@@ -401,8 +406,8 @@ template<> inline int VBLAS<double>::givensx(double* a, double* b, int n, double
     *bnorm = bbuf[0] + bbuf[1];
     return k;
 }
-#endif //CV_SIMD128_64F
-#endif //CV_SIMD128
+#endif //CV_SIMD_64F
+#endif //CV_SIMD

 template<typename _Tp> void
 JacobiSVDImpl_(_Tp* At, size_t astep, _Tp* _W, _Tp* Vt, size_t vstep,
...
...
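A point worth noting in these VBLAS kernels: each returns k, the number of elements it consumed, so the scalar caller can finish the remaining tail, and the early-out guards are now expressed in lane counts (n < 2*v_float32::nlanes) instead of a hard-coded 8, which stays correct when nlanes grows on AVX2 or AVX-512. A minimal sketch of the same contract (a hypothetical dot_f32 helper, not the VBLAS member itself):

#include "opencv2/core/hal/intrin.hpp"

static int dot_f32(const float* a, const float* b, int n, float* result)
{
#if CV_SIMD
    if (n < 2 * v_float32::nlanes)
        return 0;                        // too short to be worth vectorizing
    int k = 0;
    v_float32 s0 = vx_setzero_f32();
    for (; k <= n - v_float32::nlanes; k += v_float32::nlanes)
        s0 += vx_load(a + k) * vx_load(b + k);
    *result = v_reduce_sum(s0);          // horizontal sum across all lanes
    vx_cleanup();
    return k;                            // caller handles elements [k, n) in scalar code
#else
    (void)a; (void)b; (void)n; (void)result;
    return 0;
#endif
}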
modules/core/src/mathfuncs.cpp
...
...
@@ -606,17 +606,15 @@ void polarToCart( InputArray src1, InputArray src2,
         {
             k = 0;

-#if CV_SIMD128
-            if( hasSIMD128() )
+#if CV_SIMD
+            int cWidth = v_float32::nlanes;
+            for( ; k <= len - cWidth; k += cWidth )
             {
-                int cWidth = v_float32x4::nlanes;
-                for( ; k <= len - cWidth; k += cWidth )
-                {
-                    v_float32x4 v_m = v_load(mag + k);
-                    v_store(x + k, v_load(x + k) * v_m);
-                    v_store(y + k, v_load(y + k) * v_m);
-                }
+                v_float32 v_m = vx_load(mag + k);
+                v_store(x + k, vx_load(x + k) * v_m);
+                v_store(y + k, vx_load(y + k) * v_m);
             }
+            vx_cleanup();
 #endif

             for( ; k < len; k++ )
...
...
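The rewritten block also drops the runtime hasSIMD128() test: CV_SIMD is a preprocessor constant describing the instruction set the translation unit is built for, so the guard can be the preprocessor itself. A minimal illustration of the resulting two-tier shape (a hypothetical scale_inplace helper, not the commit's code):

#include "opencv2/core/hal/intrin.hpp"

static void scale_inplace(float* x, const float* mag, int len)
{
    int k = 0;
#if CV_SIMD  // compile-time: the vector path exists only when SIMD is enabled
    for (; k <= len - v_float32::nlanes; k += v_float32::nlanes)
        v_store(x + k, vx_load(x + k) * vx_load(mag + k));
    vx_cleanup();
#endif
    for (; k < len; k++)  // scalar tail, always compiled
        x[k] *= mag[k];
}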
@@ -735,7 +733,7 @@ struct iPow_SIMD
     }
 };

-#if CV_SIMD128
+#if CV_SIMD

 template <>
 struct iPow_SIMD<uchar, int>
...
...
@@ -743,13 +741,13 @@ struct iPow_SIMD<uchar, int>
     int operator() ( const uchar * src, uchar * dst, int len, int power )
     {
         int i = 0;
-        v_uint32x4 v_1 = v_setall_u32(1u);
+        v_uint32 v_1 = vx_setall_u32(1u);

-        for ( ; i <= len - 8; i += 8)
+        for ( ; i <= len - v_uint16::nlanes; i += v_uint16::nlanes)
         {
-            v_uint32x4 v_a1 = v_1, v_a2 = v_1;
-            v_uint16x8 v = v_load_expand(src + i);
-            v_uint32x4 v_b1, v_b2;
+            v_uint32 v_a1 = v_1, v_a2 = v_1;
+            v_uint16 v = vx_load_expand(src + i);
+            v_uint32 v_b1, v_b2;
             v_expand(v, v_b1, v_b2);

             int p = power;
...
...
@@ -771,6 +769,7 @@ struct iPow_SIMD<uchar, int>
             v = v_pack(v_a1, v_a2);
             v_pack_store(dst + i, v);
         }
+        vx_cleanup();

         return i;
     }
...
...
@@ -782,13 +781,13 @@ struct iPow_SIMD<schar, int>
     int operator() ( const schar * src, schar * dst, int len, int power )
     {
         int i = 0;
-        v_int32x4 v_1 = v_setall_s32(1);
+        v_int32 v_1 = vx_setall_s32(1);

-        for ( ; i <= len - 8; i += 8)
+        for ( ; i <= len - v_int16::nlanes; i += v_int16::nlanes)
         {
-            v_int32x4 v_a1 = v_1, v_a2 = v_1;
-            v_int16x8 v = v_load_expand(src + i);
-            v_int32x4 v_b1, v_b2;
+            v_int32 v_a1 = v_1, v_a2 = v_1;
+            v_int16 v = vx_load_expand(src + i);
+            v_int32 v_b1, v_b2;
             v_expand(v, v_b1, v_b2);

             int p = power;
...
...
@@ -810,6 +809,7 @@ struct iPow_SIMD<schar, int>
             v = v_pack(v_a1, v_a2);
             v_pack_store(dst + i, v);
         }
+        vx_cleanup();

         return i;
     }
...
...
@@ -821,13 +821,13 @@ struct iPow_SIMD<ushort, int>
     int operator() ( const ushort * src, ushort * dst, int len, int power )
     {
         int i = 0;
-        v_uint32x4 v_1 = v_setall_u32(1u);
+        v_uint32 v_1 = vx_setall_u32(1u);

-        for ( ; i <= len - 8; i += 8)
+        for ( ; i <= len - v_uint16::nlanes; i += v_uint16::nlanes)
         {
-            v_uint32x4 v_a1 = v_1, v_a2 = v_1;
-            v_uint16x8 v = v_load(src + i);
-            v_uint32x4 v_b1, v_b2;
+            v_uint32 v_a1 = v_1, v_a2 = v_1;
+            v_uint16 v = vx_load(src + i);
+            v_uint32 v_b1, v_b2;
             v_expand(v, v_b1, v_b2);

             int p = power;
...
...
@@ -849,6 +849,7 @@ struct iPow_SIMD<ushort, int>
             v = v_pack(v_a1, v_a2);
             v_store(dst + i, v);
         }
+        vx_cleanup();

         return i;
     }
...
...
@@ -860,13 +861,13 @@ struct iPow_SIMD<short, int>
     int operator() ( const short * src, short * dst, int len, int power )
     {
         int i = 0;
-        v_int32x4 v_1 = v_setall_s32(1);
+        v_int32 v_1 = vx_setall_s32(1);

-        for ( ; i <= len - 8; i += 8)
+        for ( ; i <= len - v_int16::nlanes; i += v_int16::nlanes)
         {
-            v_int32x4 v_a1 = v_1, v_a2 = v_1;
-            v_int16x8 v = v_load(src + i);
-            v_int32x4 v_b1, v_b2;
+            v_int32 v_a1 = v_1, v_a2 = v_1;
+            v_int16 v = vx_load(src + i);
+            v_int32 v_b1, v_b2;
             v_expand(v, v_b1, v_b2);

             int p = power;
...
...
@@ -888,6 +889,7 @@ struct iPow_SIMD<short, int>
             v = v_pack(v_a1, v_a2);
             v_store(dst + i, v);
         }
+        vx_cleanup();

         return i;
     }
...
...
@@ -899,12 +901,12 @@ struct iPow_SIMD<int, int>
     int operator() ( const int * src, int * dst, int len, int power )
     {
         int i = 0;
-        v_int32x4 v_1 = v_setall_s32(1);
+        v_int32 v_1 = vx_setall_s32(1);

-        for ( ; i <= len - 8; i += 8)
+        for ( ; i <= len - v_int32::nlanes*2; i += v_int32::nlanes*2)
         {
-            v_int32x4 v_a1 = v_1, v_a2 = v_1;
-            v_int32x4 v_b1 = v_load(src + i), v_b2 = v_load(src + i + 4);
+            v_int32 v_a1 = v_1, v_a2 = v_1;
+            v_int32 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + v_int32::nlanes);

             int p = power;
             while( p > 1 )
...
...
@@ -923,8 +925,9 @@ struct iPow_SIMD<int, int>
             v_a2 *= v_b2;

             v_store(dst + i, v_a1);
-            v_store(dst + i + 4, v_a2);
+            v_store(dst + i + v_int32::nlanes, v_a2);
         }
+        vx_cleanup();

         return i;
     }
...
...
@@ -936,12 +939,12 @@ struct iPow_SIMD<float, float>
     int operator() ( const float * src, float * dst, int len, int power )
     {
         int i = 0;
-        v_float32x4 v_1 = v_setall_f32(1.f);
+        v_float32 v_1 = vx_setall_f32(1.f);

-        for ( ; i <= len - 8; i += 8)
+        for ( ; i <= len - v_float32::nlanes*2; i += v_float32::nlanes*2)
         {
-            v_float32x4 v_a1 = v_1, v_a2 = v_1;
-            v_float32x4 v_b1 = v_load(src + i), v_b2 = v_load(src + i + 4);
+            v_float32 v_a1 = v_1, v_a2 = v_1;
+            v_float32 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + v_float32::nlanes);

             int p = std::abs(power);
             if( power < 0 )
             {
...
...
@@ -965,26 +968,27 @@ struct iPow_SIMD<float, float>
             v_a2 *= v_b2;

             v_store(dst + i, v_a1);
-            v_store(dst + i + 4, v_a2);
+            v_store(dst + i + v_float32::nlanes, v_a2);
         }
+        vx_cleanup();

         return i;
     }
 };

-#if CV_SIMD128_64F
+#if CV_SIMD_64F
 template <>
 struct iPow_SIMD<double, double>
 {
     int operator() ( const double * src, double * dst, int len, int power )
     {
         int i = 0;
-        v_float64x2 v_1 = v_setall_f64(1.);
+        v_float64 v_1 = vx_setall_f64(1.);

-        for ( ; i <= len - 4; i += 4)
+        for ( ; i <= len - v_float64::nlanes*2; i += v_float64::nlanes*2)
         {
-            v_float64x2 v_a1 = v_1, v_a2 = v_1;
-            v_float64x2 v_b1 = v_load(src + i), v_b2 = v_load(src + i + 2);
+            v_float64 v_a1 = v_1, v_a2 = v_1;
+            v_float64 v_b1 = vx_load(src + i), v_b2 = vx_load(src + i + v_float64::nlanes);

             int p = std::abs(power);
             if( power < 0 )
             {
...
...
@@ -1008,8 +1012,9 @@ struct iPow_SIMD<double, double>
             v_a2 *= v_b2;

             v_store(dst + i, v_a1);
-            v_store(dst + i + 2, v_a2);
+            v_store(dst + i + v_float64::nlanes, v_a2);
         }
+        vx_cleanup();

         return i;
     }
...
...
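The loop bodies collapsed in the iPow hunks (starting at "int p = power;") implement exponentiation by squaring, applied lane-wise: the accumulators v_a1/v_a2 collect a factor whenever the exponent is odd, and the bases v_b1/v_b2 are squared as the exponent halves. A scalar sketch of the same recurrence, assuming power >= 1 (the hidden vector body is inferred from the visible "while( p > 1 )" and the trailing v_a1 *= v_b1 / v_a2 *= v_b2 steps):

static int ipow(int base, int power)  // assumes power >= 1
{
    int a = 1, b = base, p = power;
    while (p > 1)
    {
        if (p & 1)    // odd exponent: fold one factor into the accumulator
            a *= b;
        b *= b;       // square the base
        p >>= 1;      // halve the exponent
    }
    return a * b;     // matches the final v_a *= v_b multiply in the hunks
}

This takes O(log power) multiplies per element instead of O(power).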
@@ -1594,9 +1599,9 @@ void patchNaNs( InputOutputArray _a, double _val )
     Cv32suf val;
     val.f = (float)_val;

-#if CV_SIMD128
-    v_int32x4 v_mask1 = v_setall_s32(0x7fffffff), v_mask2 = v_setall_s32(0x7f800000);
-    v_int32x4 v_val = v_setall_s32(val.i);
+#if CV_SIMD
+    v_int32 v_mask1 = vx_setall_s32(0x7fffffff), v_mask2 = vx_setall_s32(0x7f800000);
+    v_int32 v_val = vx_setall_s32(val.i);
 #endif

     for( size_t i = 0; i < it.nplanes; i++, ++it )
...
...
@@ -1604,18 +1609,16 @@ void patchNaNs( InputOutputArray _a, double _val )
         int* tptr = ptrs[0];
         size_t j = 0;

-#if CV_SIMD128
-        if (hasSIMD128())
+#if CV_SIMD
+        size_t cWidth = (size_t)v_int32::nlanes;
+        for ( ; j + cWidth <= len; j += cWidth)
         {
-            size_t cWidth = (size_t)v_int32x4::nlanes;
-            for ( ; j + cWidth <= len; j += cWidth)
-            {
-                v_int32x4 v_src = v_load(tptr + j);
-                v_int32x4 v_cmp_mask = v_mask2 < (v_src & v_mask1);
-                v_int32x4 v_dst = v_select(v_cmp_mask, v_val, v_src);
-                v_store(tptr + j, v_dst);
-            }
+            v_int32 v_src = vx_load(tptr + j);
+            v_int32 v_cmp_mask = v_mask2 < (v_src & v_mask1);
+            v_int32 v_dst = v_select(v_cmp_mask, v_val, v_src);
+            v_store(tptr + j, v_dst);
         }
+        vx_cleanup();
 #endif

         for( ; j < len; j++ )
...
...
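The patchNaNs comparison relies on the IEEE-754 single-precision bit layout: masking off the sign bit with 0x7fffffff and testing for > 0x7f800000 (the bit pattern of +Inf) is true exactly for NaNs, whose exponent field is all ones and whose mantissa is nonzero; matching lanes are then replaced with the user value via v_select. A scalar restatement of the lane-wise test:

static bool is_nan_bits(int bits)
{
    // sign cleared; anything above the +Inf pattern has a nonzero mantissa, i.e. NaN
    return (bits & 0x7fffffff) > 0x7f800000;
}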
modules/core/src/matmul.cpp
...
...
@@ -2310,18 +2310,12 @@ static void scaleAdd_32f(const float* src1, const float* src2, float* dst,
 {
     float alpha = *_alpha;
     int i = 0;
-#if CV_SIMD128
-    if (hasSIMD128())
-    {
-        v_float32x4 v_alpha = v_setall_f32(alpha);
-        const int cWidth = v_float32x4::nlanes;
-        for (; i <= len - cWidth; i += cWidth)
-        {
-            v_float32x4 v_src1 = v_load(src1 + i);
-            v_float32x4 v_src2 = v_load(src2 + i);
-            v_store(dst + i, (v_src1 * v_alpha) + v_src2);
-        }
-    }
+#if CV_SIMD
+    v_float32 v_alpha = vx_setall_f32(alpha);
+    const int cWidth = v_float32::nlanes;
+    for (; i <= len - cWidth; i += cWidth)
+        v_store(dst + i, v_muladd(vx_load(src1 + i), v_alpha, vx_load(src2 + i)));
+    vx_cleanup();
 #endif
     for (; i < len; i++)
         dst[i] = src1[i] * alpha + src2[i];
...
...
@@ -2333,22 +2327,12 @@ static void scaleAdd_64f(const double* src1, const double* src2, double* dst,
 {
     double alpha = *_alpha;
     int i = 0;
-#if CV_SIMD128_64F
-    if (hasSIMD128())
-    {
-        v_float64x2 a2 = v_setall_f64(alpha);
-        const int cWidth = v_float64x2::nlanes;
-        for (; i <= len - cWidth * 2; i += cWidth * 2)
-        {
-            v_float64x2 x0, x1, y0, y1, t0, t1;
-            x0 = v_load(src1 + i); x1 = v_load(src1 + i + cWidth);
-            y0 = v_load(src2 + i); y1 = v_load(src2 + i + cWidth);
-            t0 = x0 * a2 + y0;
-            t1 = x1 * a2 + y1;
-            v_store(dst + i, t0);
-            v_store(dst + i + cWidth, t1);
-        }
-    }
+#if CV_SIMD_64F
+    v_float64 a2 = vx_setall_f64(alpha);
+    const int cWidth = v_float64::nlanes;
+    for (; i <= len - cWidth; i += cWidth)
+        v_store(dst + i, v_muladd(vx_load(src1 + i), a2, vx_load(src2 + i)));
+    vx_cleanup();
 #endif
     for (; i < len; i++)
         dst[i] = src1[i] * alpha + src2[i];
...
...
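Besides widening the vectors, the scaleAdd rewrite folds the separate multiply and add into v_muladd(a, b, c), which computes a*b + c per lane and can lower to a hardware fused multiply-add. A standalone sketch of the resulting kernel shape (a hypothetical axpy_f32 helper, not the commit's function):

#include "opencv2/core/hal/intrin.hpp"

static void axpy_f32(float alpha, const float* x, const float* y, float* dst, int len)
{
    int i = 0;
#if CV_SIMD
    v_float32 v_alpha = vx_setall_f32(alpha);  // broadcast alpha to every lane
    for (; i <= len - v_float32::nlanes; i += v_float32::nlanes)
        v_store(dst + i, v_muladd(vx_load(x + i), v_alpha, vx_load(y + i)));
    vx_cleanup();
#endif
    for (; i < len; i++)  // scalar tail
        dst[i] = x[i] * alpha + y[i];
}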
@@ -3025,42 +3009,40 @@ static double dotProd_8u(const uchar* src1, const uchar* src2, int len)
 #endif
     int i = 0;
-#if CV_SIMD128
-    if( hasSIMD128() )
+#if CV_SIMD
+    int len0 = len & -v_uint16::nlanes, blockSize0 = (1 << 15), blockSize;
+
+    while (i < len0)
     {
-        int len0 = len & -8, blockSize0 = (1 << 15), blockSize;
+        blockSize = std::min(len0 - i, blockSize0);
+        v_int32 v_sum = vx_setzero_s32();
+        const int cWidth = v_uint16::nlanes;

-        while( i < len0 )
+        int j = 0;
+        for (; j <= blockSize - cWidth * 2; j += cWidth * 2)
         {
-            blockSize = std::min(len0 - i, blockSize0);
-            v_int32x4 v_sum = v_setzero_s32();
-            const int cWidth = v_uint16x8::nlanes;
+            v_uint16 v_src10, v_src20, v_src11, v_src21;
+            v_expand(vx_load(src1 + j), v_src10, v_src11);
+            v_expand(vx_load(src2 + j), v_src20, v_src21);

-            int j = 0;
-            for (; j <= blockSize - cWidth * 2; j += cWidth * 2)
-            {
-                v_uint16x8 v_src10, v_src20, v_src11, v_src21;
-                v_expand(v_load(src1 + j), v_src10, v_src11);
-                v_expand(v_load(src2 + j), v_src20, v_src21);
+            v_sum += v_dotprod(v_reinterpret_as_s16(v_src10), v_reinterpret_as_s16(v_src20));
+            v_sum += v_dotprod(v_reinterpret_as_s16(v_src11), v_reinterpret_as_s16(v_src21));
+        }

-                v_sum += v_dotprod(v_reinterpret_as_s16(v_src10), v_reinterpret_as_s16(v_src20));
-                v_sum += v_dotprod(v_reinterpret_as_s16(v_src11), v_reinterpret_as_s16(v_src21));
-            }
+        for (; j <= blockSize - cWidth; j += cWidth)
+        {
+            v_int16 v_src10 = v_reinterpret_as_s16(vx_load_expand(src1 + j));
+            v_int16 v_src20 = v_reinterpret_as_s16(vx_load_expand(src2 + j));

-            for (; j <= blockSize - cWidth; j += cWidth)
-            {
-                v_int16x8 v_src10 = v_reinterpret_as_s16(v_load_expand(src1 + j));
-                v_int16x8 v_src20 = v_reinterpret_as_s16(v_load_expand(src2 + j));
-
-                v_sum += v_dotprod(v_src10, v_src20);
-            }
-            r += (double)v_reduce_sum(v_sum);
+            v_sum += v_dotprod(v_src10, v_src20);
+        }
+        r += (double)v_reduce_sum(v_sum);

-            src1 += blockSize;
-            src2 += blockSize;
-            i += blockSize;
-        }
+        src1 += blockSize;
+        src2 += blockSize;
+        i += blockSize;
     }
+    vx_cleanup();
 #elif CV_NEON
     if( cv::checkHardwareSupport(CV_CPU_NEON) )
     {
...
...
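The block structure in dotProd_8u is an overflow guard: v_dotprod multiplies adjacent 16-bit lanes and accumulates pairs into signed 32-bit lanes, and with uchar inputs each product is at most 255 * 255 = 65025 < 2^16. Flushing the partial sum into the double accumulator r every blockSize0 = (1 << 15) elements keeps any lane safely in range: even in the degenerate case where all of a block's products landed in one lane, 2^15 * 65025 = 2,130,739,200 < 2^31 - 1. The analogous limits below, 1 << 14 for schar and 1 << 13 for float, follow the same pattern; this reading of the block constants is inferred from the code, not stated in the commit.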
@@ -3113,42 +3095,40 @@ static double dotProd_8s(const schar* src1, const schar* src2, int len)
     double r = 0.0;
     int i = 0;
-#if CV_SIMD128
-    if( hasSIMD128() )
+#if CV_SIMD
+    int len0 = len & -v_int16::nlanes, blockSize0 = (1 << 14), blockSize;
+
+    while (i < len0)
     {
-        int len0 = len & -8, blockSize0 = (1 << 14), blockSize;
+        blockSize = std::min(len0 - i, blockSize0);
+        v_int32 v_sum = vx_setzero_s32();
+        const int cWidth = v_int16::nlanes;

-        while( i < len0 )
+        int j = 0;
+        for (; j <= blockSize - cWidth * 2; j += cWidth * 2)
         {
-            blockSize = std::min(len0 - i, blockSize0);
-            v_int32x4 v_sum = v_setzero_s32();
-            const int cWidth = v_int16x8::nlanes;
+            v_int16 v_src10, v_src20, v_src11, v_src21;
+            v_expand(vx_load(src1 + j), v_src10, v_src11);
+            v_expand(vx_load(src2 + j), v_src20, v_src21);

-            int j = 0;
-            for (; j <= blockSize - cWidth * 2; j += cWidth * 2)
-            {
-                v_int16x8 v_src10, v_src20, v_src11, v_src21;
-                v_expand(v_load(src1 + j), v_src10, v_src11);
-                v_expand(v_load(src2 + j), v_src20, v_src21);
+            v_sum += v_dotprod(v_src10, v_src20);
+            v_sum += v_dotprod(v_src11, v_src21);
+        }

-                v_sum += v_dotprod(v_src10, v_src20);
-                v_sum += v_dotprod(v_src11, v_src21);
-            }
+        for (; j <= blockSize - cWidth; j += cWidth)
+        {
+            v_int16 v_src10 = vx_load_expand(src1 + j);
+            v_int16 v_src20 = vx_load_expand(src2 + j);

-            for (; j <= blockSize - cWidth; j += cWidth)
-            {
-                v_int16x8 v_src10 = v_load_expand(src1 + j);
-                v_int16x8 v_src20 = v_load_expand(src2 + j);
-
-                v_sum += v_dotprod(v_src10, v_src20);
-            }
-            r += (double)v_reduce_sum(v_sum);
+            v_sum += v_dotprod(v_src10, v_src20);
+        }
+        r += (double)v_reduce_sum(v_sum);

-            src1 += blockSize;
-            src2 += blockSize;
-            i += blockSize;
-        }
+        src1 += blockSize;
+        src2 += blockSize;
+        i += blockSize;
     }
+    vx_cleanup();
 #elif CV_NEON
     if( cv::checkHardwareSupport(CV_CPU_NEON) )
     {
...
...
@@ -3232,28 +3212,26 @@ static double dotProd_32f(const float* src1, const float* src2, int len)
 #endif
     int i = 0;

-#if CV_SIMD128
-    if( hasSIMD128() )
+#if CV_SIMD
+    int len0 = len & -v_float32::nlanes, blockSize0 = (1 << 13), blockSize;
+
+    while (i < len0)
     {
-        int len0 = len & -4, blockSize0 = (1 << 13), blockSize;
+        blockSize = std::min(len0 - i, blockSize0);
+        v_float32 v_sum = vx_setzero_f32();

-        while( i < len0 )
-        {
-            blockSize = std::min(len0 - i, blockSize0);
-            v_float32x4 v_sum = v_setzero_f32();
+        int j = 0;
+        int cWidth = v_float32::nlanes;
+        for (; j <= blockSize - cWidth; j += cWidth)
+            v_sum = v_muladd(vx_load(src1 + j), vx_load(src2 + j), v_sum);

-            int j = 0;
-            int cWidth = v_float32x4::nlanes;
-            for (; j <= blockSize - cWidth; j += cWidth)
-                v_sum = v_muladd(v_load(src1 + j), v_load(src2 + j), v_sum);
+        r += v_reduce_sum(v_sum);

-            r += v_reduce_sum(v_sum);
-
-            src1 += blockSize;
-            src2 += blockSize;
-            i += blockSize;
-        }
+        src1 += blockSize;
+        src2 += blockSize;
+        i += blockSize;
     }
+    vx_cleanup();
 #endif
     return r + dotProd_(src1, src2, len - i);
 }
...
...
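For the float kernel the same blocking serves accuracy rather than overflow: flushing the single-precision accumulator into a double every (1 << 13) elements bounds rounding-error growth. A standalone sketch of the blocked pattern (a hypothetical dot_blocked_f32, mirroring the structure above, not the library's function):

#include <algorithm>
#include "opencv2/core/hal/intrin.hpp"

static double dot_blocked_f32(const float* a, const float* b, int len)
{
    double r = 0.0;
    int i = 0;
#if CV_SIMD
    int len0 = len & -v_float32::nlanes, blockSize0 = (1 << 13), blockSize;
    while (i < len0)
    {
        blockSize = std::min(len0 - i, blockSize0);
        v_float32 v_sum = vx_setzero_f32();
        for (int j = 0; j <= blockSize - v_float32::nlanes; j += v_float32::nlanes)
            v_sum = v_muladd(vx_load(a + j), vx_load(b + j), v_sum);
        r += v_reduce_sum(v_sum);  // flush this block's partial sum into the double
        a += blockSize;
        b += blockSize;
        i += blockSize;
    }
    vx_cleanup();
#endif
    for (int j = 0; j < len - i; j++)  // scalar tail on the advanced pointers
        r += (double)a[j] * b[j];
    return r;
}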