Commit 048ddbf9, authored Aug 31, 2019 by Alexander Alekhin
Merge pull request #15339 from pmur:dotprod-32s-vsx
Parents: 2a6527e7, 33fb253a
Showing 2 changed files with 38 additions and 0 deletions (+38 -0):

modules/core/include/opencv2/core/hal/intrin_vsx.hpp (+9 -0)
modules/core/src/matmul.simd.hpp (+29 -0)
modules/core/include/opencv2/core/hal/intrin_vsx.hpp @ 048ddbf9
@@ -1039,6 +1039,15 @@ inline v_float64x2 v_cvt_f64(const v_float32x4& a)

inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{ return v_float64x2(vec_cvfo(vec_mergel(a.val, a.val))); }

// The altivec intrinsic is missing for this 2.06 insn
inline v_float64x2 v_cvt_f64(const v_int64x2& a)
{
    vec_double2 out;
    __asm__ ("xvcvsxddp %x0,%x1" : "=wa"(out) : "wa"(a.val));
    return v_float64x2(out);
}

////////////// Lookup table access ////////////////////

inline v_int8x16 v_lut(const schar* tab, const int* idx)
...
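The new overload above fills a gap in the altivec API by emitting the ISA 2.06 xvcvsxddp instruction through inline asm: each signed 64-bit lane of the input vector is converted to a double in the corresponding lane of the result. A minimal scalar sketch of those lane-wise semantics, in plain standard C++ with no VSX required; the helper name cvt_s64_to_f64 is illustrative and not part of OpenCV:

#include <array>
#include <cstdint>
#include <cstdio>

// Scalar stand-in for v_cvt_f64(const v_int64x2&): convert each signed
// 64-bit lane to double, preserving lane order (what xvcvsxddp does per lane).
static std::array<double, 2> cvt_s64_to_f64(const std::array<int64_t, 2>& a)
{
    return { static_cast<double>(a[0]), static_cast<double>(a[1]) };
}

int main()
{
    std::array<int64_t, 2> v = { -3, int64_t(1) << 40 };
    std::array<double, 2>  d = cvt_s64_to_f64(v);
    std::printf("%.1f %.1f\n", d[0], d[1]);  // prints: -3.0 1099511627776.0
    return 0;
}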
modules/core/src/matmul.simd.hpp @ 048ddbf9
@@ -2493,7 +2493,36 @@ double dotProd_16s(const short* src1, const short* src2, int len)

double dotProd_32s(const int* src1, const int* src2, int len)
{
#if CV_SIMD128_64F
    double r = 0.0;
    int i = 0;
    int lenAligned = len & -v_int32x4::nlanes;
    v_float64x2 a(0.0, 0.0);
    v_float64x2 b(0.0, 0.0);

    for( i = 0; i < lenAligned; i += v_int32x4::nlanes )
    {
        v_int32x4 s1 = v_load(src1);
        v_int32x4 s2 = v_load(src2);
#if CV_VSX
        // Do 32x32->64 multiplies, convert/round to double, accumulate
        // Potentially less precise than FMA, but 1.5x faster than fma below.
        a += v_cvt_f64(v_int64(vec_mule(s1.val, s2.val)));
        b += v_cvt_f64(v_int64(vec_mulo(s1.val, s2.val)));
#else
        a = v_fma(v_cvt_f64(s1), v_cvt_f64(s2), a);
        b = v_fma(v_cvt_f64_high(s1), v_cvt_f64_high(s2), b);
#endif
        src1 += v_int32x4::nlanes;
        src2 += v_int32x4::nlanes;
    }
    a += b;
    r = v_reduce_sum(a);
    return r + dotProd_(src1, src2, len - i);
#else
    return dotProd_(src1, src2, len);
#endif
}

double dotProd_32f(const float* src1, const float* src2, int len)
...
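On the VSX path, vec_mule/vec_mulo split the 32x32-bit products into even-lane and odd-lane 64-bit results, which are then rounded to double and kept in two running accumulators; the generic path instead widens both operands to double and accumulates with fused multiply-add, and any unaligned tail falls back to the scalar dotProd_. A small scalar sketch of that even/odd accumulation pattern, assuming nothing beyond standard C++ (the function name dot_prod_32s_ref is illustrative and not OpenCV API):

#include <cstdint>
#include <cstdio>
#include <vector>

// Illustrative scalar model of the VSX path in dotProd_32s: multiply
// even-indexed and odd-indexed pairs into 64-bit products, convert to
// double, and keep two running sums that are combined at the end
// (mirroring the a/b accumulators in the diff), plus a scalar tail.
static double dot_prod_32s_ref(const int32_t* src1, const int32_t* src2, int len)
{
    double even = 0.0, odd = 0.0;
    int i = 0;
    for (; i + 1 < len; i += 2)
    {
        even += static_cast<double>(int64_t(src1[i])     * src2[i]);
        odd  += static_cast<double>(int64_t(src1[i + 1]) * src2[i + 1]);
    }
    double r = even + odd;
    for (; i < len; ++i)   // scalar tail, like the dotProd_ fallback
        r += static_cast<double>(int64_t(src1[i]) * src2[i]);
    return r;
}

int main()
{
    std::vector<int32_t> a = { 1, 2, 3, 4, 5 };
    std::vector<int32_t> b = { 5, 4, 3, 2, 1 };
    std::printf("%.1f\n", dot_prod_32s_ref(a.data(), b.data(), 5));  // prints: 35.0
    return 0;
}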