opencv - Commit 751cee8e authored Nov 16, 2017 by Maksim Shabunin

Merge pull request #9907 from seiko2plus:vsxFixesImproves

parents 0608227e 2dc76d50

Showing 6 changed files with 555 additions and 261 deletions (+555 -261)

intrin_vsx.hpp               modules/core/include/opencv2/core/hal/intrin_vsx.hpp   +107 -57
vsx_utils.hpp                modules/core/include/opencv2/core/vsx_utils.hpp        +297 -204
rand.cpp                     modules/core/src/rand.cpp                              +16  -0
ppc64-gnu.toolchain.cmake    platforms/linux/ppc64-gnu.toolchain.cmake              +3   -0
ppc64le-gnu.toolchain.cmake  platforms/linux/ppc64le-gnu.toolchain.cmake            +3   -0
ppcat.toolchain.cmake        platforms/linux/ppcat.toolchain.cmake                  +129 -0
modules/core/include/opencv2/core/hal/intrin_vsx.hpp
...
...
@@ -523,24 +523,25 @@ OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_add_wrap, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_sub_wrap, vec_sub)
/** Bitwise shifts **/
#define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpuvec, splfunc) \
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
{ return _Tpuvec(vec_sl(a.val, splfunc(imm))); } \
inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
{ return _Tpuvec(vec_sr(a.val, splfunc(imm))); } \
template<int imm> inline _Tpuvec v_shl(const _Tpuvec& a) \
{ return _Tpuvec(vec_sl(a.val, splfunc(imm))); } \
template<int imm> inline _Tpuvec v_shr(const _Tpuvec& a) \
{ return _Tpuvec(vec_sr(a.val, splfunc(imm))); }
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint8x16, vec_uchar16_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int8x16,  vec_uchar16_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint16x8, vec_ushort8_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int16x8,  vec_ushort8_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint32x4, vec_uint4_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int32x4,  vec_uint4_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint64x2, vec_udword2_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int64x2,  vec_udword2_sp)
#define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpvec, shr, splfunc) \
inline _Tpvec operator << (const _Tpvec& a, int imm) \
{ return _Tpvec(vec_sl(a.val, splfunc(imm))); } \
inline _Tpvec operator >> (const _Tpvec& a, int imm) \
{ return _Tpvec(shr(a.val, splfunc(imm))); } \
template<int imm> inline _Tpvec v_shl(const _Tpvec& a) \
{ return _Tpvec(vec_sl(a.val, splfunc(imm))); } \
template<int imm> inline _Tpvec v_shr(const _Tpvec& a) \
{ return _Tpvec(shr(a.val, splfunc(imm))); }
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint8x16, vec_sr, vec_uchar16_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint16x8, vec_sr, vec_ushort8_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint32x4, vec_sr, vec_uint4_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint64x2, vec_sr, vec_udword2_sp)
// algebraic right shift
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int8x16,  vec_sra, vec_uchar16_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int16x8,  vec_sra, vec_ushort8_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int32x4,  vec_sra, vec_uint4_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int64x2,  vec_sra, vec_udword2_sp)
/** Bitwise logic **/
#define OPENCV_HAL_IMPL_VSX_LOGIC_OP(_Tpvec) \
...
...
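A minimal sketch (not part of the diff) of what the reworked shift macro means at the universal-intrinsics level: signed vectors now shift right through vec_sra (arithmetic), while unsigned vectors keep the logical vec_sr path. The names and values below are illustrative only and assume an OpenCV build with VSX/CV_SIMD128 enabled.

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

void shift_demo()
{
    v_int32x4  s  = v_setall_s32(-16);
    v_uint32x4 u  = v_setall_u32(0x80000000u);
    v_int32x4  sr = v_shr<2>(s);   // arithmetic shift: every lane becomes -4
    v_uint32x4 ur = v_shr<2>(u);   // logical shift: every lane becomes 0x20000000
    int      so[4]; v_store(so, sr);
    unsigned uo[4]; v_store(uo, ur);
}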
@@ -605,6 +606,64 @@ OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int64x2)
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_min, vec_min)
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_max, vec_max)
/** Rotate **/
#define OPENCV_IMPL_VSX_ROTATE(_Tpvec, suffix, shf, cast) \
template<int imm> \
inline _Tpvec v_rotate_##suffix(const _Tpvec& a) \
{ \
const int wd = imm * sizeof(typename _Tpvec::lane_type); \
if (wd > 15) \
return _Tpvec(); \
return _Tpvec((cast)shf(vec_uchar16_c(a.val), vec_uchar16_sp(wd << 3))); \
}
#define OPENCV_IMPL_VSX_ROTATE_LR(_Tpvec, cast) \
OPENCV_IMPL_VSX_ROTATE(_Tpvec, left, vec_slo, cast) \
OPENCV_IMPL_VSX_ROTATE(_Tpvec, right, vec_sro, cast)
OPENCV_IMPL_VSX_ROTATE_LR(v_uint8x16, vec_uchar16)
OPENCV_IMPL_VSX_ROTATE_LR(v_int8x16,  vec_char16)
OPENCV_IMPL_VSX_ROTATE_LR(v_uint16x8, vec_ushort8)
OPENCV_IMPL_VSX_ROTATE_LR(v_int16x8,  vec_short8)
OPENCV_IMPL_VSX_ROTATE_LR(v_uint32x4, vec_uint4)
OPENCV_IMPL_VSX_ROTATE_LR(v_int32x4,  vec_int4)
OPENCV_IMPL_VSX_ROTATE_LR(v_uint64x2, vec_udword2)
OPENCV_IMPL_VSX_ROTATE_LR(v_int64x2,  vec_dword2)
template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b)
{
    const int wd = imm * sizeof(typename _Tpvec::lane_type);
    if (wd == 0) return a;
    return _Tpvec(vec_sld(b.val, a.val, 16 - wd));
}

template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b)
{
    const int wd = imm * sizeof(typename _Tpvec::lane_type);
    if (wd == 16) return b;
    return _Tpvec(vec_sld(a.val, b.val, wd));
}
#define OPENCV_IMPL_VSX_ROTATE_64(_Tpvec, suffix, rg1, rg2) \
template<int imm> \
inline _Tpvec v_rotate_##suffix(const _Tpvec& a, const _Tpvec& b) \
{ \
if (imm == 1) \
return _Tpvec(vec_permi(rg1.val, rg2.val, 2)); \
return imm ? b : a; \
}
OPENCV_IMPL_VSX_ROTATE_64(v_int64x2,  right, a, b)
OPENCV_IMPL_VSX_ROTATE_64(v_uint64x2, right, a, b)
OPENCV_IMPL_VSX_ROTATE_64(v_int64x2,  left,  b, a)
OPENCV_IMPL_VSX_ROTATE_64(v_uint64x2, left,  b, a)
////////// Reduce and mask /////////
/** Reduce **/
...
...
@@ -726,7 +785,7 @@ inline int v_signmask(const v_float32x4& a)
inline int v_signmask(const v_int64x2& a)
{
    const vec_dword2 sv = vec_sr(a.val, vec_udword2_sp(63));
    VSX_UNUSED(const vec_dword2) sv = vec_sr(a.val, vec_udword2_sp(63));
    return (int)vec_extract(sv, 0) | (int)vec_extract(sv, 1) << 1;
}
inline int v_signmask(const v_uint64x2& a)
...
...
@@ -812,66 +871,47 @@ OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int64x2, v_uint64x2, vec_udword2_c, v_absdiff, v
/** Rounding **/
inline v_int32x4 v_round(const v_float32x4& a)
{ return v_int32x4(vec_cts(vec_round(a.val), 0)); }
{ return v_int32x4(vec_cts(vec_round(a.val))); }

inline v_int32x4 v_round(const v_float64x2& a)
{
    static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0};
    return v_int32x4(vec_perm(vec_int4_z, vec_ctsw(vec_round(a.val)), perm));
}
{ return v_int32x4(vec_mergesqo(vec_cts(vec_round(a.val)), vec_int4_z)); }

inline v_int32x4 v_floor(const v_float32x4& a)
{ return v_int32x4(vec_cts(vec_floor(a.val), 0)); }
{ return v_int32x4(vec_cts(vec_floor(a.val))); }

inline v_int32x4 v_floor(const v_float64x2& a)
{
    static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0};
    return v_int32x4(vec_perm(vec_int4_z, vec_ctsw(vec_floor(a.val)), perm));
}
{ return v_int32x4(vec_mergesqo(vec_cts(vec_floor(a.val)), vec_int4_z)); }

inline v_int32x4 v_ceil(const v_float32x4& a)
{ return v_int32x4(vec_cts(vec_ceil(a.val), 0)); }
{ return v_int32x4(vec_cts(vec_ceil(a.val))); }

inline v_int32x4 v_ceil(const v_float64x2& a)
{
    static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0};
    return v_int32x4(vec_perm(vec_int4_z, vec_ctsw(vec_ceil(a.val)), perm));
}
{ return v_int32x4(vec_mergesqo(vec_cts(vec_ceil(a.val)), vec_int4_z)); }

inline v_int32x4 v_trunc(const v_float32x4& a)
{ return v_int32x4(vec_cts(a.val, 0)); }
{ return v_int32x4(vec_cts(a.val)); }

inline v_int32x4 v_trunc(const v_float64x2& a)
{
    static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0};
    return v_int32x4(vec_perm(vec_int4_z, vec_ctsw(a.val), perm));
}
{ return v_int32x4(vec_mergesqo(vec_cts(a.val), vec_int4_z)); }

/** To float **/
inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{ return v_float32x4(vec_ctf(a.val, 0)); }
{ return v_float32x4(vec_ctf(a.val)); }

inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{
    static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0};
    return v_float32x4(vec_perm(vec_float4_z, vec_cvf(a.val), perm));
}
{ return v_float32x4(vec_mergesqo(vec_cvf(a.val), vec_float4_z)); }

inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{ return v_float64x2(vec_ctd(vec_mergeh(a.val, a.val), 0)); }
{ return v_float64x2(vec_ctdo(vec_mergeh(a.val, a.val))); }

inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
{ return v_float64x2(vec_ctd(vec_mergel(a.val, a.val), 0)); }
{ return v_float64x2(vec_ctdo(vec_mergel(a.val, a.val))); }

inline v_float64x2 v_cvt_f64(const v_float32x4& a)
{ return v_float64x2(vec_cvf(vec_mergeh(a.val, a.val))); }
{ return v_float64x2(vec_cvfo(vec_mergeh(a.val, a.val))); }

inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{ return v_float64x2(vec_cvf(vec_mergel(a.val, a.val))); }
{ return v_float64x2(vec_cvfo(vec_mergel(a.val, a.val))); }
/** Reinterpret **/
/** its up there with load and store operations **/
...
...
@@ -888,10 +928,20 @@ inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
    const vec_float4 v0 = vec_splat(v.val, 0);
    const vec_float4 v1 = vec_splat(v.val, 1);
    const vec_float4 v2 = vec_splat(v.val, 2);
    const vec_float4 v3 = vec_splat(v.val, 3);
    VSX_UNUSED(const vec_float4) v3 = vec_splat(v.val, 3);
    return v_float32x4(vec_madd(v0, m0.val, vec_madd(v1, m1.val, vec_madd(v2, m2.val, vec_mul(v3, m3.val)))));
}

inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    const vec_float4 v0 = vec_splat(v.val, 0);
    const vec_float4 v1 = vec_splat(v.val, 1);
    const vec_float4 v2 = vec_splat(v.val, 2);
    return v_float32x4(vec_madd(v0, m0.val, vec_madd(v1, m1.val, vec_madd(v2, m2.val, a.val))));
}
#define OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(_Tpvec, _Tpvec2) \
inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
const _Tpvec& a2, const _Tpvec& a3, \
...
...
modules/core/include/opencv2/core/vsx_utils.hpp
...
...
@@ -51,18 +51,6 @@
//! @{
#if CV_VSX
#define FORCE_INLINE(tp) extern inline tp __attribute__((always_inline))
#define VSX_REDIRECT_1RG(rt, rg, fnm, fn2) \
FORCE_INLINE(rt) fnm(const rg& a) { return fn2(a); }
#define VSX_REDIRECT_2RG(rt, rg, fnm, fn2) \
FORCE_INLINE(rt) fnm(const rg& a, const rg& b) { return fn2(a, b); }
#define VSX_IMPL_PERM(rt, fnm, ...) \
FORCE_INLINE(rt) fnm(const rt& a, const rt& b) \
{ static const vec_uchar16 perm = {__VA_ARGS__}; return vec_perm(a, b, perm); }
#define __VSX_S16__(c, v) (c){v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v}
#define __VSX_S8__(c, v) (c){v, v, v, v, v, v, v, v}
#define __VSX_S4__(c, v) (c){v, v, v, v}
...
...
@@ -172,10 +160,19 @@ typedef __vector double vec_double2;
#define vec_bdword2_f (__VSX_S2__(vec_bdword2, 0))
#define vec_bdword2_t (__VSX_S2__(vec_bdword2, 1))
#define FORCE_INLINE(tp) extern inline tp __attribute__((always_inline))
#define VSX_REDIRECT_1RG(rt, rg, fnm, fn2) \
FORCE_INLINE(rt) fnm(const rg& a) { return fn2(a); }
#define VSX_REDIRECT_2RG(rt, rg, fnm, fn2) \
FORCE_INLINE(rt) fnm(const rg& a, const rg& b) { return fn2(a, b); }
/*
* GCC VSX compatibility
**/
#if defined(__GNUG__) && !defined(__IBMCPP__) && !defined(__clang__)
#if defined(__GNUG__) && !defined(__clang__)
// inline asm helper
#define VSX_IMPL_1RG(rt, rto, rg, rgo, opc, fnm) \
...
...
@@ -193,7 +190,7 @@ FORCE_INLINE(rt) fnm(const rg& a, const rg& b) \
#define VSX_IMPL_2VRG(rt, rg, opc, fnm) VSX_IMPL_2VRG_F(rt, rg, #opc" %0,%1,%2", fnm)
#if __GNUG__ < 7
/* up to GCC 6 vec_mul only supports precisions and llong */
// up to GCC 6 vec_mul only supports precisions and llong
# ifdef vec_mul
# undef vec_mul
# endif
...
...
@@ -209,15 +206,15 @@ FORCE_INLINE(rt) fnm(const rg& a, const rg& b) \
8, 9, 24, 25, 12, 13, 28, 29}; \
return vec_perm(Tcast(vec_mule(a, b)), Tcast(vec_mulo(a, b)), even_perm); \
}
VSX_IMPL_MULH(vec_short8,  vec_short8_c)
VSX_IMPL_MULH(vec_short8,  vec_short8_c)
VSX_IMPL_MULH(vec_ushort8, vec_ushort8_c)
/* vmuluwm can be used for unsigned or signed integers, that's what they said */
VSX_IMPL_2VRG(vec_int4,  vec_int4,  vmuluwm, vec_mul)
// vmuluwm can be used for unsigned or signed integers, that's what they said
VSX_IMPL_2VRG(vec_int4,  vec_int4,  vmuluwm, vec_mul)
VSX_IMPL_2VRG(vec_uint4, vec_uint4, vmuluwm, vec_mul)
/* redirect to GCC builtin vec_mul, since it already supports precisions and llong */
VSX_REDIRECT_2RG(vec_float4,  vec_float4,  vec_mul, __builtin_vec_mul)
// redirect to GCC builtin vec_mul, since it already supports precisions and llong
VSX_REDIRECT_2RG(vec_float4,  vec_float4,  vec_mul, __builtin_vec_mul)
VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mul, __builtin_vec_mul)
VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mul, __builtin_vec_mul)
VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mul, __builtin_vec_mul)
VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mul, __builtin_vec_mul)
#endif // __GNUG__ < 7
...
...
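Illustrative aside (not part of the diff): with the wrappers above in effect, vec_mul can be used uniformly for element types that plain GCC < 7 does not cover; for example, 16-bit lanes go through the vec_mule/vec_mulo + vec_perm fallback defined by VSX_IMPL_MULH. The values below are only an example, assuming a VSX target with this header included.

vec_short8 a = vec_splats((short)3);
vec_short8 b = vec_splats((short)7);
vec_short8 c = vec_mul(a, b);   // every lane == 21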
@@ -237,74 +234,120 @@ FORCE_INLINE(rt) fnm(const rg& a, const rg& b) \
# define VSX_IMPL_CMPGE(rt, rg, opc, fnm) \
VSX_IMPL_2VRG_F(rt, rg, #opc" %0,%2,%1\n\t xxlnor %x0,%x0,%x0", fnm)
VSX_IMPL_CMPGE(vec_bchar16, vec_char16,  vcmpgtsb, vec_cmpge)
VSX_IMPL_CMPGE(vec_bchar16, vec_char16,  vcmpgtsb, vec_cmpge)
VSX_IMPL_CMPGE(vec_bchar16, vec_uchar16, vcmpgtub, vec_cmpge)
VSX_IMPL_CMPGE(vec_bshort8, vec_short8,  vcmpgtsh, vec_cmpge)
VSX_IMPL_CMPGE(vec_bshort8, vec_short8,  vcmpgtsh, vec_cmpge)
VSX_IMPL_CMPGE(vec_bshort8, vec_ushort8, vcmpgtuh, vec_cmpge)
VSX_IMPL_CMPGE(vec_bint4,   vec_int4,    vcmpgtsw, vec_cmpge)
VSX_IMPL_CMPGE(vec_bint4,   vec_uint4,   vcmpgtuw, vec_cmpge)
VSX_IMPL_CMPGE(vec_bdword2, vec_dword2,  vcmpgtsd, vec_cmpge)
VSX_IMPL_CMPGE(vec_bint4,   vec_int4,    vcmpgtsw, vec_cmpge)
VSX_IMPL_CMPGE(vec_bint4,   vec_uint4,   vcmpgtuw, vec_cmpge)
VSX_IMPL_CMPGE(vec_bdword2, vec_dword2,  vcmpgtsd, vec_cmpge)
VSX_IMPL_CMPGE(vec_bdword2, vec_udword2, vcmpgtud, vec_cmpge)
/* redirect to GCC builtin cmpge, since it already supports precisions */
VSX_REDIRECT_2RG(vec_bint4, vec_float4, vec_cmpge, __builtin_vec_cmpge)
// redirect to GCC builtin cmpge, since it already supports precisions
VSX_REDIRECT_2RG(vec_bint4,   vec_float4,  vec_cmpge, __builtin_vec_cmpge)
VSX_REDIRECT_2RG(vec_bdword2, vec_double2, vec_cmpge, __builtin_vec_cmpge)
// up to gcc5 vec_nor doesn't support bool long long
# undef vec_nor
template<typename T>
VSX_REDIRECT_2RG(T, T, vec_nor, __builtin_vec_nor)
template<typename T>
VSX_REDIRECT_2RG(T, T, vec_nor, __builtin_vec_nor)
FORCE_INLINE(vec_bdword2) vec_nor(const vec_bdword2& a, const vec_bdword2& b)
{ return vec_bdword2_c(__builtin_vec_nor(vec_dword2_c(a), vec_dword2_c(b))); }
FORCE_INLINE(vec_bdword2) vec_nor(const vec_bdword2& a, const vec_bdword2& b)
{ return vec_bdword2_c(__builtin_vec_nor(vec_dword2_c(a), vec_dword2_c(b))); }
#endif // __GNUG__ < 6
// vector population count
#ifndef vec_popcnt
VSX_IMPL_1VRG(vec_uchar16, vec_uchar16, vpopcntb, vec_popcnt)
VSX_IMPL_1VRG(vec_uchar16, vec_char16,  vpopcntb, vec_popcnt)
VSX_IMPL_1VRG(vec_ushort8, vec_ushort8, vpopcnth, vec_popcnt)
VSX_IMPL_1VRG(vec_ushort8, vec_short8,  vpopcnth, vec_popcnt)
VSX_IMPL_1VRG(vec_uint4,   vec_uint4,   vpopcntw, vec_popcnt)
VSX_IMPL_1VRG(vec_uint4,   vec_int4,    vpopcntw, vec_popcnt)
VSX_IMPL_1VRG(vec_udword2, vec_udword2, vpopcntd, vec_popcnt)
VSX_IMPL_1VRG(vec_udword2, vec_dword2,  vpopcntd, vec_popcnt)
#endif // vec_popcnt
#if __GNUG__ < 5
// vec_xxpermdi in gcc4 missing little-endian supports just like clang
# define vec_permi(a, b, c) vec_xxpermdi(b, a, (3 ^ ((c & 1) << 1 | c >> 1)))
// vec_packs doesn't support double words in gcc4
# undef vec_packs
VSX_REDIRECT_2RG(vec_char16,  vec_short8,  vec_packs, __builtin_vec_packs)
VSX_REDIRECT_2RG(vec_uchar16, vec_ushort8, vec_packs, __builtin_vec_packs)
VSX_REDIRECT_2RG(vec_short8,  vec_int4,    vec_packs, __builtin_vec_packs)
VSX_REDIRECT_2RG(vec_ushort8, vec_uint4,   vec_packs, __builtin_vec_packs)
VSX_IMPL_2VRG_F(vec_int4,  vec_dword2,  "vpksdss %0,%2,%1", vec_packs)
VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs)
# undef vec_packs
VSX_REDIRECT_2RG(vec_char16,  vec_short8,  vec_packs, __builtin_vec_packs)
VSX_REDIRECT_2RG(vec_uchar16, vec_ushort8, vec_packs, __builtin_vec_packs)
VSX_REDIRECT_2RG(vec_short8,  vec_int4,    vec_packs, __builtin_vec_packs)
VSX_REDIRECT_2RG(vec_ushort8, vec_uint4,   vec_packs, __builtin_vec_packs)
VSX_IMPL_2VRG_F(vec_int4,  vec_dword2,  "vpksdss %0,%2,%1", vec_packs)
VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs)
#else
# define vec_permi vec_xxpermdi
#endif // __GNUG__ < 5
// shift left double by word immediate
#ifndef vec_sldw
# define vec_sldw __builtin_vsx_xxsldwi
#endif
// vector population count
VSX_IMPL_1VRG(vec_uchar16, vec_uchar16, vpopcntb, vec_popcntu)
VSX_IMPL_1VRG(vec_uchar16, vec_char16,  vpopcntb, vec_popcntu)
VSX_IMPL_1VRG(vec_ushort8, vec_ushort8, vpopcnth, vec_popcntu)
VSX_IMPL_1VRG(vec_ushort8, vec_short8,  vpopcnth, vec_popcntu)
VSX_IMPL_1VRG(vec_uint4,   vec_uint4,   vpopcntw, vec_popcntu)
VSX_IMPL_1VRG(vec_uint4,   vec_int4,    vpopcntw, vec_popcntu)
VSX_IMPL_1VRG(vec_udword2, vec_udword2, vpopcntd, vec_popcntu)
VSX_IMPL_1VRG(vec_udword2, vec_dword2,  vpopcntd, vec_popcntu)
// converts between single and double-precision
#ifndef vec_cvf
VSX_REDIRECT_1RG(vec_float4, vec_double2, vec_cvf, __builtin_vsx_xvcvdpsp)
FORCE_INLINE(vec_double2) vec_cvf(const vec_float4& a)
{ return __builtin_vsx_xvcvspdp(vec_sld(a, a, 4)); }
#ifdef vec_cvf
# undef vec_cvf
#endif
VSX_REDIRECT_1RG(vec_float4,  vec_double2, vec_cvf,  __builtin_vsx_xvcvdpsp)
VSX_REDIRECT_1RG(vec_double2, vec_float4,  vec_cvfo, __builtin_vsx_xvcvspdp)
FORCE_INLINE(vec_double2) vec_cvf(const vec_float4& a)
{ return vec_cvfo(vec_sldw(a, a, 1)); }
// converts 32 and 64 bit integers to double-precision
#ifndef vec_ctd
# define vec_ctd(a, b) __vec_ctd(a)
VSX_IMPL_1RG(vec_double2, wd, vec_int4,    wa, xvcvsxwdp, __vec_ctd)
VSX_IMPL_1RG(vec_double2, wd, vec_uint4,   wa, xvcvuxwdp, __vec_ctd)
VSX_IMPL_1RG(vec_double2, wd, vec_dword2,  wi, xvcvsxddp, __vec_ctd)
VSX_IMPL_1RG(vec_double2, wd, vec_udword2, wi, xvcvuxddp, __vec_ctd)
// converts word and doubleword to double-precision
#ifdef vec_ctd
# undef vec_ctd
#endif
VSX_IMPL_1RG(vec_double2, wd, vec_int4,    wa, xvcvsxwdp, vec_ctdo)
VSX_IMPL_1RG(vec_double2, wd, vec_uint4,   wa, xvcvuxwdp, vec_ctdo)
VSX_IMPL_1RG(vec_double2, wd, vec_dword2,  wi, xvcvsxddp, vec_ctd)
VSX_IMPL_1RG(vec_double2, wd, vec_udword2, wi, xvcvuxddp, vec_ctd)
FORCE_INLINE(vec_double2) vec_ctd(const vec_int4& a)
{ return vec_ctdo(vec_sldw(a, a, 1)); }
FORCE_INLINE(vec_double2) vec_ctd(const vec_uint4& a)
{ return vec_ctdo(vec_sldw(a, a, 1)); }
// converts word and doubleword to single-precision
#undef vec_ctf
VSX_IMPL_1RG(vec_float4, wf, vec_int4,    wa, xvcvsxwsp, vec_ctf)
VSX_IMPL_1RG(vec_float4, wf, vec_uint4,   wa, xvcvuxwsp, vec_ctf)
VSX_IMPL_1RG(vec_float4, wf, vec_dword2,  wi, xvcvsxdsp, vec_ctf)
VSX_IMPL_1RG(vec_float4, wf, vec_udword2, wi, xvcvuxdsp, vec_ctf)
// converts single and double precision to signed word
#undef vec_cts
VSX_IMPL_1RG(vec_int4, wa, vec_double2, wd, xvcvdpsxws, vec_cts)
VSX_IMPL_1RG(vec_int4, wa, vec_float4,  wf, xvcvspsxws, vec_cts)
// converts single and double precision to unsigned word
#undef vec_ctu
VSX_IMPL_1RG(vec_uint4, wa, vec_double2, wd, xvcvdpuxws, vec_ctu)
VSX_IMPL_1RG(vec_uint4, wa, vec_float4,  wf, xvcvspuxws, vec_ctu)
// converts single and double precision to signed doubleword
#ifdef vec_ctsl
# undef vec_ctsl
#endif
VSX_IMPL_1RG(vec_dword2, wi, vec_double2, wd, xvcvdpsxds, vec_ctsl)
VSX_IMPL_1RG(vec_dword2, wi, vec_float4,  wf, xvcvspsxds, vec_ctslo)
// shift left double by word immediate
#ifndef vec_sldw
# define vec_sldw __builtin_vsx_xxsldwi
FORCE_INLINE(vec_dword2) vec_ctsl(const vec_float4& a)
{ return vec_ctslo(vec_sldw(a, a, 1)); }
// converts single and double precision to unsigned doubleword
#ifdef vec_ctul
# undef vec_ctul
#endif
VSX_IMPL_1RG(vec_udword2, wi, vec_double2, wd, xvcvdpuxds, vec_ctul)
VSX_IMPL_1RG(vec_udword2, wi, vec_float4,  wf, xvcvspuxds, vec_ctulo)
FORCE_INLINE(vec_udword2) vec_ctul(const vec_float4& a)
{ return vec_ctulo(vec_sldw(a, a, 1)); }
// just in case if GCC doesn't define it
#ifndef vec_xl
...
...
@@ -327,8 +370,13 @@ VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs)
* Also there's already an open bug https://bugs.llvm.org/show_bug.cgi?id=31837
*
* So we're not able to use inline asm and only use built-in functions that CLANG supports
* and use __builtin_convertvector if clang missng any of vector conversions built-in functions
*/
// convert vector helper
#define VSX_IMPL_CONVERT(rt, rg, fnm) \
FORCE_INLINE(rt) fnm(const rg& a) { return __builtin_convertvector(a, rt); }
#if __clang_major__ < 5
// implement vec_permi in a dirty way
# define VSX_IMPL_CLANG_4_PERMI(Tvec) \
...
...
@@ -362,26 +410,6 @@ VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs)
# define vec_sldw vec_xxsldwi
#endif
/* converts between single and double precision */
#ifndef vec_cvf
VSX_REDIRECT_1RG(vec_float4, vec_double2, vec_cvf, __builtin_vsx_xvcvdpsp)
FORCE_INLINE(vec_double2) vec_cvf(const vec_float4& a)
{ return __builtin_vsx_xvcvspdp(vec_sld(a, a, 4)); }
#endif
/* converts 32 and 64 bit integers to double-precision */
#ifndef vec_ctd
# define vec_ctd(a, b) __vec_ctd(a)
VSX_REDIRECT_1RG(vec_double2, vec_int4,  __vec_ctd, __builtin_vsx_xvcvsxwdp)
VSX_REDIRECT_1RG(vec_double2, vec_uint4, __vec_ctd, __builtin_vsx_xvcvuxwdp)
// implement vec_ctd for double word in a dirty way since we are missing builtin xvcvsxddp, xvcvuxddp
// please try to avoid using it for double words
FORCE_INLINE(vec_double2) __vec_ctd(const vec_dword2& a)
{ return vec_double2_set((double)vec_extract(a, 0), (double)vec_extract(a, 1)); }
FORCE_INLINE(vec_double2) __vec_ctd(const vec_udword2& a)
{ return vec_double2_set((double)vec_extract(a, 0), (double)vec_extract(a, 1)); }
#endif
// Implement vec_rsqrt since clang only supports vec_rsqrte
#ifndef vec_rsqrt
FORCE_INLINE(vec_float4) vec_rsqrt(const vec_float4& a)
...
...
@@ -391,27 +419,157 @@ VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs)
{ return vec_div(vec_double2_sp(1), vec_sqrt(a)); }
#endif
// vec_popcnt should return unsigned but clang has different thought just like gcc in vec_vpopcnt
#define VSX_IMPL_POPCNTU(Tvec, Tvec2, ucast) \
FORCE_INLINE(Tvec) vec_popcntu(const Tvec2& a) \
{ return ucast(vec_popcnt(a)); }
VSX_IMPL_POPCNTU(vec_uchar16, vec_char16, vec_uchar16_c);
VSX_IMPL_POPCNTU(vec_ushort8, vec_short8, vec_ushort8_c);
VSX_IMPL_POPCNTU(vec_uint4,   vec_int4,   vec_uint4_c);
// redirect unsigned types
VSX_REDIRECT_1RG(vec_uchar16, vec_uchar16, vec_popcntu, vec_popcnt)
VSX_REDIRECT_1RG(vec_ushort8, vec_ushort8, vec_popcntu, vec_popcnt)
VSX_REDIRECT_1RG(vec_uint4,   vec_uint4,   vec_popcntu, vec_popcnt)
// converts between single and double precision
#ifdef vec_cvf
# undef vec_cvf
#endif
VSX_REDIRECT_1RG(vec_float4,  vec_double2, vec_cvf,  __builtin_vsx_xvcvdpsp)
VSX_REDIRECT_1RG(vec_double2, vec_float4,  vec_cvfo, __builtin_vsx_xvcvspdp)
/*
 * __builtin_altivec_vctsxs in clang 5 and 6 causes ambiguous which used by vec_cts
 * so we just redefine it and cast it
*/
FORCE_INLINE(vec_double2) vec_cvf(const vec_float4& a)
{ return vec_cvfo(vec_sldw(a, a, 1)); }
// converts word and doubleword to double-precision
#ifdef vec_ctd
# undef vec_ctd
#endif
VSX_REDIRECT_1RG(vec_double2, vec_int4,  vec_ctdo, __builtin_vsx_xvcvsxwdp)
VSX_REDIRECT_1RG(vec_double2, vec_uint4, vec_ctdo, __builtin_vsx_xvcvuxwdp)
VSX_IMPL_CONVERT(vec_double2, vec_dword2,  vec_ctd)
VSX_IMPL_CONVERT(vec_double2, vec_udword2, vec_ctd)
FORCE_INLINE(vec_double2) vec_ctd(const vec_int4& a)
{ return vec_ctdo(vec_sldw(a, a, 1)); }
FORCE_INLINE(vec_double2) vec_ctd(const vec_uint4& a)
{ return vec_ctdo(vec_sldw(a, a, 1)); }
// converts word and doubleword to single-precision
#if __clang_major__ > 4
# undef vec_ctf
#endif
VSX_IMPL_CONVERT(vec_float4, vec_int4,  vec_ctf)
VSX_IMPL_CONVERT(vec_float4, vec_uint4, vec_ctf)
VSX_REDIRECT_1RG(vec_float4, vec_dword2,  vec_ctf, __builtin_vsx_xvcvsxdsp)
VSX_REDIRECT_1RG(vec_float4, vec_udword2, vec_ctf, __builtin_vsx_xvcvuxdsp)
// converts single and double precision to signed word
#if __clang_major__ > 4
# undef vec_cts
# define vec_cts(__a, __b) \
_Generic((__a), vector float \
: (vector signed int)__builtin_altivec_vctsxs((__a), (__b)), vector double \
: __extension__({ \
vector double __ret = \
(__a) * \
(vector double)(vector unsigned long long)((0x3ffULL + (__b)) \
<< 52); \
__builtin_convertvector(__ret, vector signed long long); \
}))
#endif // __clang_major__ > 4
#endif
VSX_REDIRECT_1RG(vec_int4, vec_double2, vec_cts, __builtin_vsx_xvcvdpsxws)
VSX_IMPL_CONVERT(vec_int4, vec_float4, vec_cts)
// converts single and double precision to unsigned word
#if __clang_major__ > 4
# undef vec_ctu
#endif
VSX_REDIRECT_1RG(vec_uint4, vec_double2, vec_ctu, __builtin_vsx_xvcvdpuxws)
VSX_IMPL_CONVERT(vec_uint4, vec_float4, vec_ctu)
// converts single and double precision to signed doubleword
#ifdef vec_ctsl
# undef vec_ctsl
#endif
VSX_IMPL_CONVERT(vec_dword2, vec_double2, vec_ctsl)
// __builtin_convertvector unable to convert, xvcvspsxds is missing on it
FORCE_INLINE(vec_dword2) vec_ctslo(const vec_float4& a)
{ return vec_ctsl(vec_cvfo(a)); }
FORCE_INLINE(vec_dword2) vec_ctsl(const vec_float4& a)
{ return vec_ctsl(vec_cvf(a)); }
// converts single and double precision to unsigned doubleword
#ifdef vec_ctul
# undef vec_ctul
#endif
VSX_IMPL_CONVERT(vec_udword2, vec_double2, vec_ctul)
// __builtin_convertvector unable to convert, xvcvspuxds is missing on it
FORCE_INLINE(vec_udword2) vec_ctulo(const vec_float4& a)
{ return vec_ctul(vec_cvfo(a)); }
FORCE_INLINE(vec_udword2) vec_ctul(const vec_float4& a)
{ return vec_ctul(vec_cvf(a)); }
#endif // CLANG VSX compatibility
/*
* XLC VSX compatibility
**/
#if defined(__IBMCPP__)
// vector population count
#define vec_popcntu vec_popcnt
// overload and redirect wih setting second arg to zero
// since we only support conversions without the second arg
#define VSX_IMPL_OVERLOAD_Z2(rt, rg, fnm) \
FORCE_INLINE(rt) fnm(const rg& a) { return fnm(a, 0); }
VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_int4,    vec_ctd)
VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_uint4,   vec_ctd)
VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_dword2,  vec_ctd)
VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_udword2, vec_ctd)
VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_int4,    vec_ctf)
VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_uint4,   vec_ctf)
VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_dword2,  vec_ctf)
VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_udword2, vec_ctf)
VSX_IMPL_OVERLOAD_Z2(vec_int4, vec_double2, vec_cts)
VSX_IMPL_OVERLOAD_Z2(vec_int4, vec_float4,  vec_cts)
VSX_IMPL_OVERLOAD_Z2(vec_uint4, vec_double2, vec_ctu)
VSX_IMPL_OVERLOAD_Z2(vec_uint4, vec_float4,  vec_ctu)
VSX_IMPL_OVERLOAD_Z2(vec_dword2, vec_double2, vec_ctsl)
VSX_IMPL_OVERLOAD_Z2(vec_dword2, vec_float4,  vec_ctsl)
VSX_IMPL_OVERLOAD_Z2(vec_udword2, vec_double2, vec_ctul)
VSX_IMPL_OVERLOAD_Z2(vec_udword2, vec_float4,  vec_ctul)
// fixme: implement conversions of odd-numbered elements in a dirty way
// since xlc doesn't support VSX registers operand in inline asm.
#define VSX_IMPL_DIRTY_ODD(rt, rg, fnm, fn2) \
FORCE_INLINE(rt) fnm(const rg& a) { return fn2(vec_sldw(a, a, 3)); }
VSX_IMPL_DIRTY_ODD(vec_double2, vec_float4, vec_cvfo,  vec_cvf)
VSX_IMPL_DIRTY_ODD(vec_double2, vec_int4,   vec_ctdo,  vec_ctd)
VSX_IMPL_DIRTY_ODD(vec_double2, vec_uint4,  vec_ctdo,  vec_ctd)
VSX_IMPL_DIRTY_ODD(vec_dword2,  vec_float4, vec_ctslo, vec_ctsl)
VSX_IMPL_DIRTY_ODD(vec_udword2, vec_float4, vec_ctulo, vec_ctul)
#endif // XLC VSX compatibility
// ignore GCC warning that casued by -Wunused-but-set-variable in rare cases
#if defined(__GNUG__) && !defined(__clang__)
# define VSX_UNUSED(Tvec) Tvec __attribute__((__unused__))
#else // CLANG, XLC
# define VSX_UNUSED(Tvec) Tvec
#endif
// gcc can find his way in casting log int and XLC, CLANG ambiguous
#if defined(__clang__) || defined(__IBMCPP__)
FORCE_INLINE(vec_udword2) vec_splats(uint64 v)
{ return vec_splats((unsigned long long) v); }
FORCE_INLINE(vec_dword2) vec_splats(int64 v)
{ return vec_splats((long long) v); }
#endif
/*
* implement vsx_ld(offset, pointer), vsx_st(vector, offset, pointer)
* load and set using offset depend on the pointer type
...
...
@@ -468,75 +626,6 @@ VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs)
{ vsx_stf(vec, VSX_OFFSET(o, p), (long long*)p); }
#endif
#if defined(__clang__) || defined(__IBMCPP__)
// gcc can find his way in casting log int and XLC, CLANG ambiguous
FORCE_INLINE(vec_udword2) vec_splats(uint64 v)
{ return vec_splats((unsigned long long) v); }
FORCE_INLINE(vec_dword2) vec_splats(int64 v)
{ return vec_splats((long long) v); }
#endif
// Implement store vector bool char for XLC
#if defined(__IBMCPP__) && defined(__clang__)
FORCE_INLINE(void) vec_xst(const vec_bchar16& vec, long o, uchar* p)
{ vec_xst(vec_uchar16_c(vec), VSX_OFFSET(o, p), p); }
#endif
// Working around vec_popcnt compatibility
/*
* vec_popcnt should return unsigned but clang has different thought just like gcc in vec_vpopcnt
*
* use vec_popcntu instead to deal with it
*/
#if defined(__clang__) && !defined(__IBMCPP__)
# define VSX_IMPL_CLANG_POPCNTU(Tvec, Tvec2, ucast) \
FORCE_INLINE(Tvec) vec_popcntu(const Tvec2& a) \
{ return ucast(vec_popcnt(a)); }
VSX_IMPL_CLANG_POPCNTU(vec_uchar16, vec_char16, vec_uchar16_c);
VSX_IMPL_CLANG_POPCNTU(vec_ushort8, vec_short8, vec_ushort8_c);
VSX_IMPL_CLANG_POPCNTU(vec_uint4,   vec_int4,   vec_uint4_c);
// redirect unsigned types
VSX_REDIRECT_1RG(vec_uchar16, vec_uchar16, vec_popcntu, vec_popcnt)
VSX_REDIRECT_1RG(vec_ushort8, vec_ushort8, vec_popcntu, vec_popcnt)
VSX_REDIRECT_1RG(vec_uint4,   vec_uint4,   vec_popcntu, vec_popcnt)
#else
# define vec_popcntu vec_popcnt
#endif
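Illustrative aside (not part of the diff): counting set bits per lane through the vec_popcntu wrapper, which always yields the unsigned vector type regardless of whether the compiler's own vec_popcnt returns signed or unsigned lanes. The values are examples only, assuming a VSX target with this header included.

vec_char16  bytes = vec_splats((signed char)0x0F);
vec_uchar16 bits  = vec_popcntu(bytes);   // every lane == 4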
// Working around vec_cts compatibility
/*
* vec_cts in gcc and clang converts single-precision to signed fixed-point word
* and from double-precision to signed doubleword, also there's no implement for vec_ctsl
*
* vec_cts in xlc converts single and double precision to signed fixed-point word
* and xlc has vec_ctsl which converts single and double precision to signed doubleword
*
* so to deal with this situation, use vec_cts only if you want to convert single-precision to signed fixed-point word
* and use vec_ctsl when you want to convert double-precision to signed doubleword
*
* Also we implemented vec_ctsw(a) to convert double-precision to signed fixed-point word
*/
// converts double-precision to signed doubleword for GCC and CLANG
#if !defined(vec_ctsl) && !defined(__IBMCPP__) && (defined(__GNUG__) || defined(__clang__))
// GCC4 has incorrect results in convert to signed doubleword
# if !defined(__clang__) && __GNUG__ < 5
# define vec_ctsl(a, b) __vec_ctsl(a)
VSX_IMPL_1RG(vec_dword2, wi, vec_double2, wd, xvcvdpsxds, __vec_ctsl)
# else // GCC > 4 , CLANG
# define vec_ctsl vec_cts
# endif
#endif
// converts double-precision to signed fixed-point word
#if defined(__IBMCPP__)
# define vec_ctsw(a) vec_cts(a, 0)
#else // GCC, CLANG
# define vec_ctsw(a) vec_int4_c(__builtin_vsx_xvcvdpsxws(a))
#endif
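A rough sketch (not part of the diff) of the convention the comment above describes. The one-argument calls assume the GCC code path of this header, where vec_cts and vec_ctsl are exposed as single-argument wrappers; exact argument forms differ on other compilers.

vec_float4  f4 = vec_splats(3.7f);
vec_double2 d2 = vec_splats(3.7);
vec_int4    w  = vec_cts(f4);    // single precision -> signed fixed-point word
vec_dword2  dw = vec_ctsl(d2);   // double precision -> signed doubleword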
// load 4 unsigned bytes into uint4 vector
#define vec_ld_buw(p) vec_uint4_set((p)[0], (p)[1], (p)[2], (p)[3])
...
...
@@ -566,14 +655,14 @@ FORCE_INLINE(Tvec) vec_ldz_l8(const Tp *p)
return vec_and(vec_ld_l8(p), (Tvec)mask); \
}
VSX_IMPL_LOAD_L8(vec_uchar16, uchar)
VSX_IMPL_LOAD_L8(vec_char16,  schar)
VSX_IMPL_LOAD_L8(vec_char16,  schar)
VSX_IMPL_LOAD_L8(vec_ushort8, ushort)
VSX_IMPL_LOAD_L8(vec_short8,  short)
VSX_IMPL_LOAD_L8(vec_uint4,   uint)
VSX_IMPL_LOAD_L8(vec_int4,    int)
VSX_IMPL_LOAD_L8(vec_float4,  float)
VSX_IMPL_LOAD_L8(vec_short8,  short)
VSX_IMPL_LOAD_L8(vec_uint4,   uint)
VSX_IMPL_LOAD_L8(vec_int4,    int)
VSX_IMPL_LOAD_L8(vec_float4,  float)
VSX_IMPL_LOAD_L8(vec_udword2, uint64)
VSX_IMPL_LOAD_L8(vec_dword2,  int64)
VSX_IMPL_LOAD_L8(vec_dword2,  int64)
VSX_IMPL_LOAD_L8(vec_double2, double)
// logical not
...
...
@@ -601,41 +690,45 @@ FORCE_INLINE(rt) vec_unpackhu(const rg& a) \
{ return reinterpret_cast<rt>(vec_mergeh(a, zero)); }
VSX_IMPL_UNPACKU(vec_ushort8, vec_uchar16, vec_uchar16_z)
VSX_IMPL_UNPACKU(vec_uint4,   vec_ushort8, vec_ushort8_z)
VSX_IMPL_UNPACKU(vec_udword2, vec_uint4,   vec_uint4_z)
VSX_IMPL_UNPACKU(vec_uint4,   vec_ushort8, vec_ushort8_z)
VSX_IMPL_UNPACKU(vec_udword2, vec_uint4,   vec_uint4_z)
/*
* Implement vec_mergesqe and vec_mergesqo
* Merges the sequence values of even and odd elements of two vectors
*/
#define VSX_IMPL_PERM(rt, fnm, ...) \
FORCE_INLINE(rt) fnm(const rt& a, const rt& b) \
{ static const vec_uchar16 perm = {__VA_ARGS__}; return vec_perm(a, b, perm); }
// 16
#define perm16_mergesqe 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
#define perm16_mergesqo 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
VSX_IMPL_PERM(vec_uchar16, vec_mergesqe, perm16_mergesqe)
VSX_IMPL_PERM(vec_uchar16, vec_mergesqo, perm16_mergesqo)
VSX_IMPL_PERM(vec_char16,  vec_mergesqe, perm16_mergesqe)
VSX_IMPL_PERM(vec_char16,  vec_mergesqo, perm16_mergesqo)
VSX_IMPL_PERM(vec_char16,  vec_mergesqe, perm16_mergesqe)
VSX_IMPL_PERM(vec_char16,  vec_mergesqo, perm16_mergesqo)
// 8
#define perm8_mergesqe 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
#define perm8_mergesqo 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
VSX_IMPL_PERM(vec_ushort8, vec_mergesqe, perm8_mergesqe)
VSX_IMPL_PERM(vec_ushort8, vec_mergesqo, perm8_mergesqo)
VSX_IMPL_PERM(vec_short8,  vec_mergesqe, perm8_mergesqe)
VSX_IMPL_PERM(vec_short8,  vec_mergesqo, perm8_mergesqo)
VSX_IMPL_PERM(vec_short8,  vec_mergesqe, perm8_mergesqe)
VSX_IMPL_PERM(vec_short8,  vec_mergesqo, perm8_mergesqo)
// 4
#define perm4_mergesqe 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
#define perm4_mergesqo 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
VSX_IMPL_PERM(vec_uint4,  vec_mergesqe, perm4_mergesqe)
VSX_IMPL_PERM(vec_uint4,  vec_mergesqo, perm4_mergesqo)
VSX_IMPL_PERM(vec_int4,   vec_mergesqe, perm4_mergesqe)
VSX_IMPL_PERM(vec_int4,   vec_mergesqo, perm4_mergesqo)
VSX_IMPL_PERM(vec_uint4,  vec_mergesqe, perm4_mergesqe)
VSX_IMPL_PERM(vec_uint4,  vec_mergesqo, perm4_mergesqo)
VSX_IMPL_PERM(vec_int4,   vec_mergesqe, perm4_mergesqe)
VSX_IMPL_PERM(vec_int4,   vec_mergesqo, perm4_mergesqo)
VSX_IMPL_PERM(vec_float4, vec_mergesqe, perm4_mergesqe)
VSX_IMPL_PERM(vec_float4, vec_mergesqo, perm4_mergesqo)
// 2
VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqe, vec_mergeh)
VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqo, vec_mergel)
VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mergesqe, vec_mergeh)
VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mergesqo, vec_mergel)
VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mergesqe, vec_mergeh)
VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mergesqo, vec_mergel)
VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqe, vec_mergeh)
VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqo, vec_mergel)
...
...
@@ -657,8 +750,8 @@ VSX_IMPL_MERGESQHL(vec_int4)
VSX_IMPL_MERGESQHL(vec_float4)
VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqh, vec_mergeh)
VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesql, vec_mergel)
VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mergesqh, vec_mergeh)
VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mergesql, vec_mergel)
VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mergesqh, vec_mergeh)
VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mergesql, vec_mergel)
VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqh, vec_mergeh)
VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesql, vec_mergel)
...
...
@@ -682,13 +775,13 @@ FORCE_INLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \
vsx_stf(vec_mergeh(ac, bd), 32, ptr); \
vsx_stf(vec_mergel(ac, bd), 48, ptr); \
}
VSX_IMPL_ST_INTERLEAVE(uchar,  vec_uchar16)
VSX_IMPL_ST_INTERLEAVE(schar,  vec_char16)
VSX_IMPL_ST_INTERLEAVE(uchar,  vec_uchar16)
VSX_IMPL_ST_INTERLEAVE(schar,  vec_char16)
VSX_IMPL_ST_INTERLEAVE(ushort, vec_ushort8)
VSX_IMPL_ST_INTERLEAVE(short,  vec_short8)
VSX_IMPL_ST_INTERLEAVE(uint,   vec_uint4)
VSX_IMPL_ST_INTERLEAVE(int,    vec_int4)
VSX_IMPL_ST_INTERLEAVE(float,  vec_float4)
VSX_IMPL_ST_INTERLEAVE(short,  vec_short8)
VSX_IMPL_ST_INTERLEAVE(uint,   vec_uint4)
VSX_IMPL_ST_INTERLEAVE(int,    vec_int4)
VSX_IMPL_ST_INTERLEAVE(float,  vec_float4)
// 2 and 4 channels deinterleave for 16 lanes
#define VSX_IMPL_ST_DINTERLEAVE_8(Tp, Tvec) \
...
...
@@ -748,7 +841,7 @@ FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, \
d = vec_mergesql(cd0, cd1); \
}
VSX_IMPL_ST_DINTERLEAVE_16(ushort, vec_ushort8)
VSX_IMPL_ST_DINTERLEAVE_16(short,  vec_short8)
VSX_IMPL_ST_DINTERLEAVE_16(short,  vec_short8)
// 2 and 4 channels deinterleave for 4 lanes
#define VSX_IMPL_ST_DINTERLEAVE_32(Tp, Tvec) \
...
...
@@ -777,8 +870,8 @@ FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, \
c = vec_mergeh(m0, m1); \
d = vec_mergel(m0, m1); \
}
VSX_IMPL_ST_DINTERLEAVE_32(uint,  vec_uint4)
VSX_IMPL_ST_DINTERLEAVE_32(int,   vec_int4)
VSX_IMPL_ST_DINTERLEAVE_32(uint,  vec_uint4)
VSX_IMPL_ST_DINTERLEAVE_32(int,   vec_int4)
VSX_IMPL_ST_DINTERLEAVE_32(float, vec_float4)
// 2 and 4 channels interleave and deinterleave for 2 lanes
...
...
@@ -815,9 +908,9 @@ FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, \
c = vec_mergeh(v0, v1); \
d = vec_mergel(v0, v1); \
}
VSX_IMPL_ST_D_INTERLEAVE_64(int64,  vec_dword2,  vsx_ld2, vsx_st2)
VSX_IMPL_ST_D_INTERLEAVE_64(int64,  vec_dword2,  vsx_ld2, vsx_st2)
VSX_IMPL_ST_D_INTERLEAVE_64(uint64, vec_udword2, vsx_ld2, vsx_st2)
VSX_IMPL_ST_D_INTERLEAVE_64(double, vec_double2, vsx_ld,  vsx_st)
VSX_IMPL_ST_D_INTERLEAVE_64(double, vec_double2, vsx_ld,  vsx_st)
/* 3 channels */
#define VSX_IMPL_ST_INTERLEAVE_3CH_16(Tp, Tvec) \
...
...
@@ -882,7 +975,7 @@ FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c)
c = vec_perm(vec_perm(v1, v2, c12_perm), v3, c123_perm); \
}
VSX_IMPL_ST_INTERLEAVE_3CH_8(ushort, vec_ushort8)
VSX_IMPL_ST_INTERLEAVE_3CH_8(short,  vec_short8)
VSX_IMPL_ST_INTERLEAVE_3CH_8(short,  vec_short8)
#define VSX_IMPL_ST_INTERLEAVE_3CH_4(Tp, Tvec) \
FORCE_INLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \
...
...
@@ -907,8 +1000,8 @@ FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c)
b = vec_perm(v2, vec_sld(v1, v3, 8), flp2); \
c = vec_perm(vec_sld(v2, v1, 8), v3, flp); \
}
VSX_IMPL_ST_INTERLEAVE_3CH_4(uint,  vec_uint4)
VSX_IMPL_ST_INTERLEAVE_3CH_4(int,   vec_int4)
VSX_IMPL_ST_INTERLEAVE_3CH_4(uint,  vec_uint4)
VSX_IMPL_ST_INTERLEAVE_3CH_4(int,   vec_int4)
VSX_IMPL_ST_INTERLEAVE_3CH_4(float, vec_float4)
#define VSX_IMPL_ST_INTERLEAVE_3CH_2(Tp, Tvec, ld_func, st_func) \
...
...
@@ -929,9 +1022,9 @@ FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, \
b = vec_permi(v1, v3, 2); \
c = vec_permi(v2, v3, 1); \
}
VSX_IMPL_ST_INTERLEAVE_3CH_2(int64,  vec_dword2,  vsx_ld2, vsx_st2)
VSX_IMPL_ST_INTERLEAVE_3CH_2(int64,  vec_dword2,  vsx_ld2, vsx_st2)
VSX_IMPL_ST_INTERLEAVE_3CH_2(uint64, vec_udword2, vsx_ld2, vsx_st2)
VSX_IMPL_ST_INTERLEAVE_3CH_2(double, vec_double2, vsx_ld,  vsx_st)
VSX_IMPL_ST_INTERLEAVE_3CH_2(double, vec_double2, vsx_ld,  vsx_st)
#endif // CV_VSX
...
...
modules/core/src/rand.cpp
...
...
@@ -74,6 +74,12 @@ namespace cv
#define RNG_NEXT(x) ((uint64)(unsigned)(x)*CV_RNG_COEFF + ((x) >> 32))
#ifdef __PPC64__
#define PPC_MUL_ADD(ret, tmp, p0, p1) \
asm volatile("fmuls %0,%1,%2\n\t fadds %0,%0,%3" : "=&f" (ret) \
: "f" (tmp), "f" (p0), "f" (p1))
#endif
/***************************************************************************************\
* Pseudo-Random Number Generators (PRNGs) *
\***************************************************************************************/
...
...
@@ -248,6 +254,14 @@ static void randf_32f( float* arr, int len, uint64* state, const Vec2f* p, bool
        volatile float32x4_t v0 = vmulq_f32(vld1q_f32(f), p0);
        vst1q_f32(arr + i, vaddq_f32(v0, p1));
#elif defined __PPC64__
// inline asm is required for numerical stability!
// compilers tends to use floating multiply-add single(fmadds)
// instead of separate multiply and add
        PPC_MUL_ADD(arr[i+0], f[0], p[i+0][0], p[i+0][1]);
        PPC_MUL_ADD(arr[i+1], f[1], p[i+1][0], p[i+1][1]);
        PPC_MUL_ADD(arr[i+2], f[2], p[i+2][0], p[i+2][1]);
        PPC_MUL_ADD(arr[i+3], f[3], p[i+3][0], p[i+3][1]);
#else
        arr[i+0] = f[0]*p[i+0][0] + p[i+0][1];
        arr[i+1] = f[1]*p[i+1][0] + p[i+1][1];
...
...
@@ -269,6 +283,8 @@ static void randf_32f( float* arr, int len, uint64* state, const Vec2f* p, bool
            vdup_n_f32((float)(int)temp), vdup_n_f32(p[i][0])), vdup_n_f32(p[i][1]));
        arr[i] = vget_lane_f32(t, 0);
#elif defined __PPC64__
        PPC_MUL_ADD(arr[i], (float)(int)temp, p[i][0], p[i][1]);
#else
        arr[i] = (int)temp*p[i][0] + p[i][1];
#endif
...
...
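Illustrative aside (not part of the diff): why the patch forces separate multiply and add on PPC64, as the comment in the hunk above explains. A fused multiply-add rounds only once, so its result can differ in the last bit from the two-rounding sequence that the scalar fallback (and other platforms) compute; the inline asm pins the two-rounding behaviour. A hedged sketch of the difference:

#include <cmath>
// two roundings: the product is rounded to float before the addition (matches the scalar fallback)
float mul_then_add(float f, float p0, float p1) { float m = f * p0; return m + p1; }
// one rounding: what an optimizing compiler may emit as fmadds if left to itself
float fused(float f, float p0, float p1) { return std::fmaf(f, p0, p1); }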
platforms/linux/ppc64-gnu.toolchain.cmake  0 → 100644

set(CMAKE_SYSTEM_PROCESSOR ppc64)
set(GNU_MACHINE "powerpc64-linux-gnu" CACHE STRING "GNU compiler triple")
include("${CMAKE_CURRENT_LIST_DIR}/ppcat.toolchain.cmake")
platforms/linux/ppc64le-gnu.toolchain.cmake  0 → 100644

set(CMAKE_SYSTEM_PROCESSOR ppc64le)
set(GNU_MACHINE "powerpc64le-linux-gnu" CACHE STRING "GNU compiler triple")
include("${CMAKE_CURRENT_LIST_DIR}/ppcat.toolchain.cmake")
platforms/linux/ppcat.toolchain.cmake  0 → 100644

if(COMMAND toolchain_save_config)
  return() # prevent recursive call
endif()

option(AT_PATH "Advance Toolchain directory" "")
option(AT_RPATH "Add new directories to runtime search path" "")
option(AT_HOST_LINK "Enable/disable Link against host advance toolchain runtime" OFF)
option(AT_NO_AUTOVEC "Disable/enable Auto Vectorizer optimization" OFF)

set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_VERSION 1)

include("${CMAKE_CURRENT_LIST_DIR}/gnu.toolchain.cmake")

if(NOT DEFINED CMAKE_C_COMPILER)
  string(REGEX REPLACE "/+$" "" AT_PATH "${AT_PATH}")

  if(NOT AT_PATH)
    message(FATAL_ERROR "'AT_PATH' option is required. Please set it to Advance Toolchain path to get toolchain works")
  endif()

  if(NOT EXISTS ${AT_PATH})
    message(FATAL_ERROR "'${AT_PATH}' Advance Toolchain path isn't exist")
  endif()

  set(CMAKE_C_COMPILER "${AT_PATH}/bin/${GNU_MACHINE}-gcc")

  if(NOT EXISTS ${CMAKE_C_COMPILER})
    message(FATAL_ERROR "GNU C compiler isn't exist on path '${CMAKE_C_COMPILER}'. Please install Advance Toolchain with ${CMAKE_SYSTEM_PROCESSOR} supports")
  endif()
endif()

if(NOT DEFINED CMAKE_CXX_COMPILER)
  set(CMAKE_CXX_COMPILER "${AT_PATH}/bin/${GNU_MACHINE}-g++")

  if(NOT EXISTS ${CMAKE_CXX_COMPILER})
    message(FATAL_ERROR "GNU C++ compiler isn't exist. Invalid install of Advance Toolchain")
  endif()
endif()

if(NOT DEFINED AT_GCCROOT_PATH)
  set(AT_GCCROOT_PATH "${AT_PATH}/${GNU_MACHINE}")

  if(NOT EXISTS ${AT_GCCROOT_PATH})
    message(FATAL_ERROR "GCC root path '${AT_GCCROOT_PATH}' isn't exist. Invalid install of Advance Toolchain")
  endif()
endif()

if(NOT DEFINED AT_SYSROOT_PATH)
  if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "ppc64")
    set(AT_SYSROOT_PATH "${AT_PATH}/ppc")
  else()
    set(AT_SYSROOT_PATH "${AT_PATH}/${CMAKE_SYSTEM_PROCESSOR}")
  endif()

  if(NOT EXISTS ${AT_SYSROOT_PATH})
    message(FATAL_ERROR "System root path '${AT_SYSROOT_PATH}' isn't exist. Invalid install of Advance Toolchain")
  endif()
endif()

if(NOT DEFINED CMAKE_EXE_LINKER_FLAGS)
  set(CMAKE_CXX_FLAGS           "" CACHE INTERAL "")
  set(CMAKE_C_FLAGS             "" CACHE INTERAL "")
  set(CMAKE_EXE_LINKER_FLAGS    "" CACHE INTERAL "")
  set(CMAKE_SHARED_LINKER_FLAGS "" CACHE INTERAL "")
  set(CMAKE_MODULE_LINKER_FLAGS "" CACHE INTERAL "")

  if(AT_RPATH)
    string(REPLACE "," ";" RPATH_LIST ${AT_RPATH})
  endif()

  if(AT_HOST_LINK)
    #get 64-bit dynamic linker path
    file(STRINGS "${AT_SYSROOT_PATH}/usr/bin/ldd" RTLDLIST LIMIT_COUNT 1 REGEX "^RTLDLIST=[\"*\"]")
    string(REGEX REPLACE "RTLDLIST=|\"" "" RTLDLIST "${RTLDLIST}")
    string(REPLACE " " ";" RTLDLIST "${RTLDLIST}")

    #RTLDLIST must contains 32 and 64 bit paths
    list(LENGTH RTLDLIST RTLDLIST_LEN)
    if(NOT RTLDLIST_LEN GREATER 1)
      message(FATAL_ERROR "Could not fetch dynamic linker path. Invalid install of Advance Toolchain")
    endif()

    list(GET RTLDLIST 1 LINKER_PATH)
    set(CMAKE_EXE_LINKER_FLAGS "-Wl,--dynamic-linker=${AT_SYSROOT_PATH}${LINKER_PATH}")

    list(APPEND RPATH_LIST "${AT_GCCROOT_PATH}/lib64/")
    list(APPEND RPATH_LIST "${AT_SYSROOT_PATH}/lib64/")
    list(APPEND RPATH_LIST "${AT_SYSROOT_PATH}/usr/lib64/")
    list(APPEND RPATH_LIST "${PROJECT_BINARY_DIR}/lib/")
  endif()

  list(LENGTH RPATH_LIST RPATH_LEN)
  if(RPATH_LEN GREATER 0)
    set(AT_LINKER_FLAGS "${AT_LINKER_FLAGS} -Wl")
    foreach(RPATH ${RPATH_LIST})
      set(AT_LINKER_FLAGS "${AT_LINKER_FLAGS},-rpath,${RPATH}")
    endforeach()
  endif()

  set(CMAKE_SHARED_LINKER_FLAGS "${AT_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS}")
  set(CMAKE_MODULE_LINKER_FLAGS "${AT_LINKER_FLAGS} ${CMAKE_MODULE_LINKER_FLAGS}")
  set(CMAKE_EXE_LINKER_FLAGS    "${AT_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}")

  if(AT_NO_AUTOVEC)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-tree-vectorize -fno-tree-slp-vectorize")
    set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} -fno-tree-vectorize -fno-tree-slp-vectorize")
  endif()
endif()

set(CMAKE_FIND_ROOT_PATH ${CMAKE_FIND_ROOT_PATH} ${AT_SYSROOT_PATH} ${AT_GCCROOT_PATH})
set(CMAKE_SYSROOT ${AT_SYSROOT_PATH})

# what about ld.gold?
if(NOT DEFINED CMAKE_LINKER)
  find_program(CMAKE_LINKER NAMES ld)
endif()

if(NOT DEFINED CMAKE_AR)
  find_program(CMAKE_AR NAMES ar)
endif()

set(TOOLCHAIN_CONFIG_VARS ${TOOLCHAIN_CONFIG_VARS}
    CMAKE_SYSROOT
    AT_SYSROOT_PATH
    AT_GCCROOT_PATH
)
toolchain_save_config()
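For reference, these new toolchain files are selected the usual CMake way, e.g. something like cmake -DCMAKE_TOOLCHAIN_FILE=platforms/linux/ppc64le-gnu.toolchain.cmake -DAT_PATH=<Advance Toolchain install dir> <opencv source dir>. AT_PATH, together with the optional AT_RPATH, AT_HOST_LINK, and AT_NO_AUTOVEC switches defined above, are the knobs ppcat.toolchain.cmake exposes; the install-directory value shown here is only a placeholder.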