opencv - Commit 751cee8e authored Nov 16, 2017 by Maksim Shabunin

Merge pull request #9907 from seiko2plus:vsxFixesImproves

parents 0608227e 2dc76d50

Showing 6 changed files with 555 additions and 261 deletions (+555 -261)

intrin_vsx.hpp               modules/core/include/opencv2/core/hal/intrin_vsx.hpp   +107 -57
vsx_utils.hpp                modules/core/include/opencv2/core/vsx_utils.hpp        +297 -204
rand.cpp                     modules/core/src/rand.cpp                              +16  -0
ppc64-gnu.toolchain.cmake    platforms/linux/ppc64-gnu.toolchain.cmake              +3   -0
ppc64le-gnu.toolchain.cmake  platforms/linux/ppc64le-gnu.toolchain.cmake            +3   -0
ppcat.toolchain.cmake        platforms/linux/ppcat.toolchain.cmake                  +129 -0
modules/core/include/opencv2/core/hal/intrin_vsx.hpp
...
...
@@ -523,24 +523,25 @@ OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_add_wrap, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_sub_wrap, vec_sub)
/** Bitwise shifts **/
#define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpuvec, splfunc) \
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
{ return _Tpuvec(vec_sl(a.val, splfunc(imm))); } \
inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
{ return _Tpuvec(vec_sr(a.val, splfunc(imm))); } \
template<int imm> inline _Tpuvec v_shl(const _Tpuvec& a) \
{ return _Tpuvec(vec_sl(a.val, splfunc(imm))); } \
template<int imm> inline _Tpuvec v_shr(const _Tpuvec& a) \
{ return _Tpuvec(vec_sr(a.val, splfunc(imm))); }
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint8x16, vec_uchar16_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int8x16,  vec_uchar16_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint16x8, vec_ushort8_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int16x8,  vec_ushort8_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint32x4, vec_uint4_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int32x4,  vec_uint4_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint64x2, vec_udword2_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int64x2,  vec_udword2_sp)
#define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpvec, shr, splfunc) \
inline _Tpvec operator << (const _Tpvec& a, int imm) \
{ return _Tpvec(vec_sl(a.val, splfunc(imm))); } \
inline _Tpvec operator >> (const _Tpvec& a, int imm) \
{ return _Tpvec(shr(a.val, splfunc(imm))); } \
template<int imm> inline _Tpvec v_shl(const _Tpvec& a) \
{ return _Tpvec(vec_sl(a.val, splfunc(imm))); } \
template<int imm> inline _Tpvec v_shr(const _Tpvec& a) \
{ return _Tpvec(shr(a.val, splfunc(imm))); }
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint8x16, vec_sr, vec_uchar16_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint16x8, vec_sr, vec_ushort8_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint32x4, vec_sr, vec_uint4_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint64x2, vec_sr, vec_udword2_sp)
// algebraic right shift
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int8x16,  vec_sra, vec_uchar16_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int16x8,  vec_sra, vec_ushort8_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int32x4,  vec_sra, vec_uint4_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int64x2,  vec_sra, vec_udword2_sp)
/** Bitwise logic **/
#define OPENCV_HAL_IMPL_VSX_LOGIC_OP(_Tpvec) \
...
...
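A minimal sketch (not part of the diff) of what the reworked shift macro means at the universal-intrinsics level: signed vectors now shift right through vec_sra (arithmetic), while unsigned vectors keep the logical vec_sr path. The names and values below are illustrative only and assume an OpenCV build with VSX/CV_SIMD128 enabled.

#include <opencv2/core/hal/intrin.hpp>
using namespace cv;

void shift_demo()
{
    v_int32x4  s  = v_setall_s32(-16);
    v_uint32x4 u  = v_setall_u32(0x80000000u);
    v_int32x4  sr = v_shr<2>(s);   // arithmetic shift: every lane becomes -4
    v_uint32x4 ur = v_shr<2>(u);   // logical shift: every lane becomes 0x20000000
    int      so[4]; v_store(so, sr);
    unsigned uo[4]; v_store(uo, ur);
}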
@@ -605,6 +606,64 @@ OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int64x2)
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_min, vec_min)
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_max, vec_max)
/** Rotate **/
#define OPENCV_IMPL_VSX_ROTATE(_Tpvec, suffix, shf, cast) \
template<int imm> \
inline _Tpvec v_rotate_##suffix(const _Tpvec& a) \
{ \
const int wd = imm * sizeof(typename _Tpvec::lane_type); \
if (wd > 15) \
return _Tpvec(); \
return _Tpvec((cast)shf(vec_uchar16_c(a.val), vec_uchar16_sp(wd << 3))); \
}
#define OPENCV_IMPL_VSX_ROTATE_LR(_Tpvec, cast) \
OPENCV_IMPL_VSX_ROTATE(_Tpvec, left, vec_slo, cast) \
OPENCV_IMPL_VSX_ROTATE(_Tpvec, right, vec_sro, cast)
OPENCV_IMPL_VSX_ROTATE_LR(v_uint8x16, vec_uchar16)
OPENCV_IMPL_VSX_ROTATE_LR(v_int8x16,  vec_char16)
OPENCV_IMPL_VSX_ROTATE_LR(v_uint16x8, vec_ushort8)
OPENCV_IMPL_VSX_ROTATE_LR(v_int16x8,  vec_short8)
OPENCV_IMPL_VSX_ROTATE_LR(v_uint32x4, vec_uint4)
OPENCV_IMPL_VSX_ROTATE_LR(v_int32x4,  vec_int4)
OPENCV_IMPL_VSX_ROTATE_LR(v_uint64x2, vec_udword2)
OPENCV_IMPL_VSX_ROTATE_LR(v_int64x2,  vec_dword2)
template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b)
{
    const int wd = imm * sizeof(typename _Tpvec::lane_type);
    if (wd == 0) return a;
    return _Tpvec(vec_sld(b.val, a.val, 16 - wd));
}

template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b)
{
    const int wd = imm * sizeof(typename _Tpvec::lane_type);
    if (wd == 16) return b;
    return _Tpvec(vec_sld(a.val, b.val, wd));
}
#define OPENCV_IMPL_VSX_ROTATE_64(_Tpvec, suffix, rg1, rg2) \
template<int imm> \
inline _Tpvec v_rotate_##suffix(const _Tpvec& a, const _Tpvec& b) \
{ \
if (imm == 1) \
return _Tpvec(vec_permi(rg1.val, rg2.val, 2)); \
return imm ? b : a; \
}
OPENCV_IMPL_VSX_ROTATE_64(v_int64x2,  right, a, b)
OPENCV_IMPL_VSX_ROTATE_64(v_uint64x2, right, a, b)
OPENCV_IMPL_VSX_ROTATE_64(v_int64x2,  left,  b, a)
OPENCV_IMPL_VSX_ROTATE_64(v_uint64x2, left,  b, a)
////////// Reduce and mask /////////
/** Reduce **/
...
...
@@ -726,7 +785,7 @@ inline int v_signmask(const v_float32x4& a)
inline int v_signmask(const v_int64x2& a)
{
    const vec_dword2 sv = vec_sr(a.val, vec_udword2_sp(63));
    VSX_UNUSED(const vec_dword2) sv = vec_sr(a.val, vec_udword2_sp(63));
    return (int)vec_extract(sv, 0) | (int)vec_extract(sv, 1) << 1;
}
inline int v_signmask(const v_uint64x2& a)
...
...
@@ -812,66 +871,47 @@ OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int64x2, v_uint64x2, vec_udword2_c, v_absdiff, v
/** Rounding **/
inline v_int32x4 v_round(const v_float32x4& a)
{ return v_int32x4(vec_cts(vec_round(a.val), 0)); }
{ return v_int32x4(vec_cts(vec_round(a.val))); }

inline v_int32x4 v_round(const v_float64x2& a)
{
    static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0};
    return v_int32x4(vec_perm(vec_int4_z, vec_ctsw(vec_round(a.val)), perm));
}
{ return v_int32x4(vec_mergesqo(vec_cts(vec_round(a.val)), vec_int4_z)); }

inline v_int32x4 v_floor(const v_float32x4& a)
{ return v_int32x4(vec_cts(vec_floor(a.val), 0)); }
{ return v_int32x4(vec_cts(vec_floor(a.val))); }

inline v_int32x4 v_floor(const v_float64x2& a)
{
    static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0};
    return v_int32x4(vec_perm(vec_int4_z, vec_ctsw(vec_floor(a.val)), perm));
}
{ return v_int32x4(vec_mergesqo(vec_cts(vec_floor(a.val)), vec_int4_z)); }

inline v_int32x4 v_ceil(const v_float32x4& a)
{ return v_int32x4(vec_cts(vec_ceil(a.val), 0)); }
{ return v_int32x4(vec_cts(vec_ceil(a.val))); }

inline v_int32x4 v_ceil(const v_float64x2& a)
{
    static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0};
    return v_int32x4(vec_perm(vec_int4_z, vec_ctsw(vec_ceil(a.val)), perm));
}
{ return v_int32x4(vec_mergesqo(vec_cts(vec_ceil(a.val)), vec_int4_z)); }

inline v_int32x4 v_trunc(const v_float32x4& a)
{ return v_int32x4(vec_cts(a.val, 0)); }
{ return v_int32x4(vec_cts(a.val)); }

inline v_int32x4 v_trunc(const v_float64x2& a)
{
    static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0};
    return v_int32x4(vec_perm(vec_int4_z, vec_ctsw(a.val), perm));
}
{ return v_int32x4(vec_mergesqo(vec_cts(a.val), vec_int4_z)); }

/** To float **/
inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{ return v_float32x4(vec_ctf(a.val, 0)); }
{ return v_float32x4(vec_ctf(a.val)); }

inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{
    static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0};
    return v_float32x4(vec_perm(vec_float4_z, vec_cvf(a.val), perm));
}
{ return v_float32x4(vec_mergesqo(vec_cvf(a.val), vec_float4_z)); }

inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{ return v_float64x2(vec_ctd(vec_mergeh(a.val, a.val), 0)); }
{ return v_float64x2(vec_ctdo(vec_mergeh(a.val, a.val))); }

inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
{ return v_float64x2(vec_ctd(vec_mergel(a.val, a.val), 0)); }
{ return v_float64x2(vec_ctdo(vec_mergel(a.val, a.val))); }

inline v_float64x2 v_cvt_f64(const v_float32x4& a)
{ return v_float64x2(vec_cvf(vec_mergeh(a.val, a.val))); }
{ return v_float64x2(vec_cvfo(vec_mergeh(a.val, a.val))); }

inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
{ return v_float64x2(vec_cvf(vec_mergel(a.val, a.val))); }
{ return v_float64x2(vec_cvfo(vec_mergel(a.val, a.val))); }
/** Reinterpret **/
/** its up there with load and store operations **/
...
...
@@ -888,10 +928,20 @@ inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
    const vec_float4 v0 = vec_splat(v.val, 0);
    const vec_float4 v1 = vec_splat(v.val, 1);
    const vec_float4 v2 = vec_splat(v.val, 2);
    const vec_float4 v3 = vec_splat(v.val, 3);
    VSX_UNUSED(const vec_float4) v3 = vec_splat(v.val, 3);
    return v_float32x4(vec_madd(v0, m0.val, vec_madd(v1, m1.val, vec_madd(v2, m2.val, vec_mul(v3, m3.val)))));
}

inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    const vec_float4 v0 = vec_splat(v.val, 0);
    const vec_float4 v1 = vec_splat(v.val, 1);
    const vec_float4 v2 = vec_splat(v.val, 2);
    return v_float32x4(vec_madd(v0, m0.val, vec_madd(v1, m1.val, vec_madd(v2, m2.val, a.val))));
}
#define OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(_Tpvec, _Tpvec2) \
inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
const _Tpvec& a2, const _Tpvec& a3, \
...
...
modules/core/include/opencv2/core/vsx_utils.hpp
...
...
@@ -51,18 +51,6 @@
//! @{
#if CV_VSX
#define FORCE_INLINE(tp) extern inline tp __attribute__((always_inline))
#define VSX_REDIRECT_1RG(rt, rg, fnm, fn2) \
FORCE_INLINE(rt) fnm(const rg& a) { return fn2(a); }
#define VSX_REDIRECT_2RG(rt, rg, fnm, fn2) \
FORCE_INLINE(rt) fnm(const rg& a, const rg& b) { return fn2(a, b); }
#define VSX_IMPL_PERM(rt, fnm, ...) \
FORCE_INLINE(rt) fnm(const rt& a, const rt& b) \
{ static const vec_uchar16 perm = {__VA_ARGS__}; return vec_perm(a, b, perm); }
#define __VSX_S16__(c, v) (c){v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v}
#define __VSX_S8__(c, v) (c){v, v, v, v, v, v, v, v}
#define __VSX_S4__(c, v) (c){v, v, v, v}
...
...
@@ -172,10 +160,19 @@ typedef __vector double vec_double2;
#define vec_bdword2_f (__VSX_S2__(vec_bdword2, 0))
#define vec_bdword2_t (__VSX_S2__(vec_bdword2, 1))
#define FORCE_INLINE(tp) extern inline tp __attribute__((always_inline))
#define VSX_REDIRECT_1RG(rt, rg, fnm, fn2) \
FORCE_INLINE(rt) fnm(const rg& a) { return fn2(a); }
#define VSX_REDIRECT_2RG(rt, rg, fnm, fn2) \
FORCE_INLINE(rt) fnm(const rg& a, const rg& b) { return fn2(a, b); }
/*
* GCC VSX compatibility
**/
#if defined(__GNUG__) && !defined(__IBMCPP__) && !defined(__clang__)
#if defined(__GNUG__) && !defined(__clang__)
// inline asm helper
#define VSX_IMPL_1RG(rt, rto, rg, rgo, opc, fnm) \
...
...
@@ -193,7 +190,7 @@ FORCE_INLINE(rt) fnm(const rg& a, const rg& b) \
#define VSX_IMPL_2VRG(rt, rg, opc, fnm) VSX_IMPL_2VRG_F(rt, rg, #opc" %0,%1,%2", fnm)
#if __GNUG__ < 7
/* up to GCC 6 vec_mul only supports precisions and llong */
// up to GCC 6 vec_mul only supports precisions and llong
# ifdef vec_mul
# undef vec_mul
# endif
...
...
@@ -209,15 +206,15 @@ FORCE_INLINE(rt) fnm(const rg& a, const rg& b) \
8, 9, 24, 25, 12, 13, 28, 29}; \
return vec_perm(Tcast(vec_mule(a, b)), Tcast(vec_mulo(a, b)), even_perm); \
}
VSX_IMPL_MULH(vec_short8,  vec_short8_c)
VSX_IMPL_MULH(vec_short8,  vec_short8_c)
VSX_IMPL_MULH(vec_ushort8, vec_ushort8_c)
/* vmuluwm can be used for unsigned or signed integers, that's what they said */
VSX_IMPL_2VRG(vec_int4,  vec_int4,  vmuluwm, vec_mul)
// vmuluwm can be used for unsigned or signed integers, that's what they said
VSX_IMPL_2VRG(vec_int4,  vec_int4,  vmuluwm, vec_mul)
VSX_IMPL_2VRG(vec_uint4, vec_uint4, vmuluwm, vec_mul)
/* redirect to GCC builtin vec_mul, since it already supports precisions and llong */
VSX_REDIRECT_2RG(vec_float4,  vec_float4,  vec_mul, __builtin_vec_mul)
// redirect to GCC builtin vec_mul, since it already supports precisions and llong
VSX_REDIRECT_2RG(vec_float4,  vec_float4,  vec_mul, __builtin_vec_mul)
VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mul, __builtin_vec_mul)
VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mul, __builtin_vec_mul)
VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mul, __builtin_vec_mul)
VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mul, __builtin_vec_mul)
#endif // __GNUG__ < 7
...
...
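Illustrative aside (not part of the diff): with the wrappers above in effect, vec_mul can be used uniformly for element types that plain GCC < 7 does not cover; for example, 16-bit lanes go through the vec_mule/vec_mulo + vec_perm fallback defined by VSX_IMPL_MULH. The values below are only an example, assuming a VSX target with this header included.

vec_short8 a = vec_splats((short)3);
vec_short8 b = vec_splats((short)7);
vec_short8 c = vec_mul(a, b);   // every lane == 21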
@@ -237,74 +234,120 @@ FORCE_INLINE(rt) fnm(const rg& a, const rg& b) \
# define VSX_IMPL_CMPGE(rt, rg, opc, fnm) \
VSX_IMPL_2VRG_F(rt, rg, #opc" %0,%2,%1\n\t xxlnor %x0,%x0,%x0", fnm)
VSX_IMPL_CMPGE(vec_bchar16, vec_char16,  vcmpgtsb, vec_cmpge)
VSX_IMPL_CMPGE(vec_bchar16, vec_char16,  vcmpgtsb, vec_cmpge)
VSX_IMPL_CMPGE(vec_bchar16, vec_uchar16, vcmpgtub, vec_cmpge)
VSX_IMPL_CMPGE(vec_bshort8, vec_short8,  vcmpgtsh, vec_cmpge)
VSX_IMPL_CMPGE(vec_bshort8, vec_short8,  vcmpgtsh, vec_cmpge)
VSX_IMPL_CMPGE(vec_bshort8, vec_ushort8, vcmpgtuh, vec_cmpge)
VSX_IMPL_CMPGE(vec_bint4,   vec_int4,    vcmpgtsw, vec_cmpge)
VSX_IMPL_CMPGE(vec_bint4,   vec_uint4,   vcmpgtuw, vec_cmpge)
VSX_IMPL_CMPGE(vec_bdword2, vec_dword2,  vcmpgtsd, vec_cmpge)
VSX_IMPL_CMPGE(vec_bint4,   vec_int4,    vcmpgtsw, vec_cmpge)
VSX_IMPL_CMPGE(vec_bint4,   vec_uint4,   vcmpgtuw, vec_cmpge)
VSX_IMPL_CMPGE(vec_bdword2, vec_dword2,  vcmpgtsd, vec_cmpge)
VSX_IMPL_CMPGE(vec_bdword2, vec_udword2, vcmpgtud, vec_cmpge)
/* redirect to GCC builtin cmpge, since it already supports precisions */
VSX_REDIRECT_2RG(vec_bint4, vec_float4, vec_cmpge, __builtin_vec_cmpge)
// redirect to GCC builtin cmpge, since it already supports precisions
VSX_REDIRECT_2RG(vec_bint4,   vec_float4,  vec_cmpge, __builtin_vec_cmpge)
VSX_REDIRECT_2RG(vec_bdword2, vec_double2, vec_cmpge, __builtin_vec_cmpge)
// up to gcc5 vec_nor doesn't support bool long long
# undef vec_nor
template<typename T>
VSX_REDIRECT_2RG(T, T, vec_nor, __builtin_vec_nor)
template<typename T>
VSX_REDIRECT_2RG(T, T, vec_nor, __builtin_vec_nor)
FORCE_INLINE(vec_bdword2) vec_nor(const vec_bdword2& a, const vec_bdword2& b)
{ return vec_bdword2_c(__builtin_vec_nor(vec_dword2_c(a), vec_dword2_c(b))); }
FORCE_INLINE(vec_bdword2) vec_nor(const vec_bdword2& a, const vec_bdword2& b)
{ return vec_bdword2_c(__builtin_vec_nor(vec_dword2_c(a), vec_dword2_c(b))); }
#endif // __GNUG__ < 6
// vector population count
#ifndef vec_popcnt
VSX_IMPL_1VRG(vec_uchar16, vec_uchar16, vpopcntb, vec_popcnt)
VSX_IMPL_1VRG(vec_uchar16, vec_char16,  vpopcntb, vec_popcnt)
VSX_IMPL_1VRG(vec_ushort8, vec_ushort8, vpopcnth, vec_popcnt)
VSX_IMPL_1VRG(vec_ushort8, vec_short8,  vpopcnth, vec_popcnt)
VSX_IMPL_1VRG(vec_uint4,   vec_uint4,   vpopcntw, vec_popcnt)
VSX_IMPL_1VRG(vec_uint4,   vec_int4,    vpopcntw, vec_popcnt)
VSX_IMPL_1VRG(vec_udword2, vec_udword2, vpopcntd, vec_popcnt)
VSX_IMPL_1VRG(vec_udword2, vec_dword2,  vpopcntd, vec_popcnt)
#endif // vec_popcnt
#if __GNUG__ < 5
// vec_xxpermdi in gcc4 missing little-endian supports just like clang
# define vec_permi(a, b, c) vec_xxpermdi(b, a, (3 ^ ((c & 1) << 1 | c >> 1)))
// vec_packs doesn't support double words in gcc4
# undef vec_packs
VSX_REDIRECT_2RG(vec_char16,  vec_short8,  vec_packs, __builtin_vec_packs)
VSX_REDIRECT_2RG(vec_uchar16, vec_ushort8, vec_packs, __builtin_vec_packs)
VSX_REDIRECT_2RG(vec_short8,  vec_int4,    vec_packs, __builtin_vec_packs)
VSX_REDIRECT_2RG(vec_ushort8, vec_uint4,   vec_packs, __builtin_vec_packs)
VSX_IMPL_2VRG_F(vec_int4,  vec_dword2,  "vpksdss %0,%2,%1", vec_packs)
VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs)
# undef vec_packs
VSX_REDIRECT_2RG(vec_char16,  vec_short8,  vec_packs, __builtin_vec_packs)
VSX_REDIRECT_2RG(vec_uchar16, vec_ushort8, vec_packs, __builtin_vec_packs)
VSX_REDIRECT_2RG(vec_short8,  vec_int4,    vec_packs, __builtin_vec_packs)
VSX_REDIRECT_2RG(vec_ushort8, vec_uint4,   vec_packs, __builtin_vec_packs)
VSX_IMPL_2VRG_F(vec_int4,  vec_dword2,  "vpksdss %0,%2,%1", vec_packs)
VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs)
#else
# define vec_permi vec_xxpermdi
#endif // __GNUG__ < 5
// shift left double by word immediate
#ifndef vec_sldw
# define vec_sldw __builtin_vsx_xxsldwi
#endif
// vector population count
VSX_IMPL_1VRG(vec_uchar16, vec_uchar16, vpopcntb, vec_popcntu)
VSX_IMPL_1VRG(vec_uchar16, vec_char16,  vpopcntb, vec_popcntu)
VSX_IMPL_1VRG(vec_ushort8, vec_ushort8, vpopcnth, vec_popcntu)
VSX_IMPL_1VRG(vec_ushort8, vec_short8,  vpopcnth, vec_popcntu)
VSX_IMPL_1VRG(vec_uint4,   vec_uint4,   vpopcntw, vec_popcntu)
VSX_IMPL_1VRG(vec_uint4,   vec_int4,    vpopcntw, vec_popcntu)
VSX_IMPL_1VRG(vec_udword2, vec_udword2, vpopcntd, vec_popcntu)
VSX_IMPL_1VRG(vec_udword2, vec_dword2,  vpopcntd, vec_popcntu)
// converts between single and double-precision
#ifndef vec_cvf
VSX_REDIRECT_1RG(vec_float4, vec_double2, vec_cvf, __builtin_vsx_xvcvdpsp)
FORCE_INLINE(vec_double2) vec_cvf(const vec_float4& a)
{ return __builtin_vsx_xvcvspdp(vec_sld(a, a, 4)); }
#ifdef vec_cvf
# undef vec_cvf
#endif
VSX_REDIRECT_1RG(vec_float4,  vec_double2, vec_cvf,  __builtin_vsx_xvcvdpsp)
VSX_REDIRECT_1RG(vec_double2, vec_float4,  vec_cvfo, __builtin_vsx_xvcvspdp)
FORCE_INLINE(vec_double2) vec_cvf(const vec_float4& a)
{ return vec_cvfo(vec_sldw(a, a, 1)); }
// converts 32 and 64 bit integers to double-precision
#ifndef vec_ctd
# define vec_ctd(a, b) __vec_ctd(a)
VSX_IMPL_1RG(vec_double2, wd, vec_int4,    wa, xvcvsxwdp, __vec_ctd)
VSX_IMPL_1RG(vec_double2, wd, vec_uint4,   wa, xvcvuxwdp, __vec_ctd)
VSX_IMPL_1RG(vec_double2, wd, vec_dword2,  wi, xvcvsxddp, __vec_ctd)
VSX_IMPL_1RG(vec_double2, wd, vec_udword2, wi, xvcvuxddp, __vec_ctd)
// converts word and doubleword to double-precision
#ifdef vec_ctd
# undef vec_ctd
#endif
VSX_IMPL_1RG(vec_double2, wd, vec_int4,    wa, xvcvsxwdp, vec_ctdo)
VSX_IMPL_1RG(vec_double2, wd, vec_uint4,   wa, xvcvuxwdp, vec_ctdo)
VSX_IMPL_1RG(vec_double2, wd, vec_dword2,  wi, xvcvsxddp, vec_ctd)
VSX_IMPL_1RG(vec_double2, wd, vec_udword2, wi, xvcvuxddp, vec_ctd)
FORCE_INLINE(vec_double2) vec_ctd(const vec_int4& a)
{ return vec_ctdo(vec_sldw(a, a, 1)); }
FORCE_INLINE(vec_double2) vec_ctd(const vec_uint4& a)
{ return vec_ctdo(vec_sldw(a, a, 1)); }
// converts word and doubleword to single-precision
#undef vec_ctf
VSX_IMPL_1RG(vec_float4, wf, vec_int4,    wa, xvcvsxwsp, vec_ctf)
VSX_IMPL_1RG(vec_float4, wf, vec_uint4,   wa, xvcvuxwsp, vec_ctf)
VSX_IMPL_1RG(vec_float4, wf, vec_dword2,  wi, xvcvsxdsp, vec_ctf)
VSX_IMPL_1RG(vec_float4, wf, vec_udword2, wi, xvcvuxdsp, vec_ctf)
// converts single and double precision to signed word
#undef vec_cts
VSX_IMPL_1RG(vec_int4, wa, vec_double2, wd, xvcvdpsxws, vec_cts)
VSX_IMPL_1RG(vec_int4, wa, vec_float4,  wf, xvcvspsxws, vec_cts)
// converts single and double precision to unsigned word
#undef vec_ctu
VSX_IMPL_1RG(vec_uint4, wa, vec_double2, wd, xvcvdpuxws, vec_ctu)
VSX_IMPL_1RG(vec_uint4, wa, vec_float4,  wf, xvcvspuxws, vec_ctu)
// converts single and double precision to signed doubleword
#ifdef vec_ctsl
# undef vec_ctsl
#endif
VSX_IMPL_1RG(vec_dword2, wi, vec_double2, wd, xvcvdpsxds, vec_ctsl)
VSX_IMPL_1RG(vec_dword2, wi, vec_float4,  wf, xvcvspsxds, vec_ctslo)
// shift left double by word immediate
#ifndef vec_sldw
# define vec_sldw __builtin_vsx_xxsldwi
FORCE_INLINE(vec_dword2) vec_ctsl(const vec_float4& a)
{ return vec_ctslo(vec_sldw(a, a, 1)); }
// converts single and double precision to unsigned doubleword
#ifdef vec_ctul
# undef vec_ctul
#endif
VSX_IMPL_1RG(vec_udword2, wi, vec_double2, wd, xvcvdpuxds, vec_ctul)
VSX_IMPL_1RG(vec_udword2, wi, vec_float4,  wf, xvcvspuxds, vec_ctulo)
FORCE_INLINE(vec_udword2) vec_ctul(const vec_float4& a)
{ return vec_ctulo(vec_sldw(a, a, 1)); }
// just in case if GCC doesn't define it
#ifndef vec_xl
...
...
@@ -327,8 +370,13 @@ VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs)
* Also there's already an open bug https://bugs.llvm.org/show_bug.cgi?id=31837
*
* So we're not able to use inline asm and only use built-in functions that CLANG supports
* and use __builtin_convertvector if clang missng any of vector conversions built-in functions
*/
// convert vector helper
#define VSX_IMPL_CONVERT(rt, rg, fnm) \
FORCE_INLINE(rt) fnm(const rg& a) { return __builtin_convertvector(a, rt); }
#if __clang_major__ < 5
// implement vec_permi in a dirty way
# define VSX_IMPL_CLANG_4_PERMI(Tvec) \
...
...
@@ -362,26 +410,6 @@ VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs)
# define vec_sldw vec_xxsldwi
#endif
/* converts between single and double precision */
#ifndef vec_cvf
VSX_REDIRECT_1RG(vec_float4, vec_double2, vec_cvf, __builtin_vsx_xvcvdpsp)
FORCE_INLINE(vec_double2) vec_cvf(const vec_float4& a)
{ return __builtin_vsx_xvcvspdp(vec_sld(a, a, 4)); }
#endif
/* converts 32 and 64 bit integers to double-precision */
#ifndef vec_ctd
# define vec_ctd(a, b) __vec_ctd(a)
VSX_REDIRECT_1RG(vec_double2, vec_int4,  __vec_ctd, __builtin_vsx_xvcvsxwdp)
VSX_REDIRECT_1RG(vec_double2, vec_uint4, __vec_ctd, __builtin_vsx_xvcvuxwdp)
// implement vec_ctd for double word in a dirty way since we are missing builtin xvcvsxddp, xvcvuxddp
// please try to avoid using it for double words
FORCE_INLINE(vec_double2) __vec_ctd(const vec_dword2& a)
{ return vec_double2_set((double)vec_extract(a, 0), (double)vec_extract(a, 1)); }
FORCE_INLINE(vec_double2) __vec_ctd(const vec_udword2& a)
{ return vec_double2_set((double)vec_extract(a, 0), (double)vec_extract(a, 1)); }
#endif
// Implement vec_rsqrt since clang only supports vec_rsqrte
#ifndef vec_rsqrt
FORCE_INLINE(vec_float4) vec_rsqrt(const vec_float4& a)
...
...
@@ -391,27 +419,157 @@ VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs)
{ return vec_div(vec_double2_sp(1), vec_sqrt(a)); }
#endif
// vec_popcnt should return unsigned but clang has different thought just like gcc in vec_vpopcnt
#define VSX_IMPL_POPCNTU(Tvec, Tvec2, ucast) \
FORCE_INLINE(Tvec) vec_popcntu(const Tvec2& a) \
{ return ucast(vec_popcnt(a)); }
VSX_IMPL_POPCNTU(vec_uchar16, vec_char16, vec_uchar16_c);
VSX_IMPL_POPCNTU(vec_ushort8, vec_short8, vec_ushort8_c);
VSX_IMPL_POPCNTU(vec_uint4,   vec_int4,   vec_uint4_c);
// redirect unsigned types
VSX_REDIRECT_1RG(vec_uchar16, vec_uchar16, vec_popcntu, vec_popcnt)
VSX_REDIRECT_1RG(vec_ushort8, vec_ushort8, vec_popcntu, vec_popcnt)
VSX_REDIRECT_1RG(vec_uint4,   vec_uint4,   vec_popcntu, vec_popcnt)
// converts between single and double precision
#ifdef vec_cvf
# undef vec_cvf
#endif
VSX_REDIRECT_1RG(vec_float4,  vec_double2, vec_cvf,  __builtin_vsx_xvcvdpsp)
VSX_REDIRECT_1RG(vec_double2, vec_float4,  vec_cvfo, __builtin_vsx_xvcvspdp)
/*
 * __builtin_altivec_vctsxs in clang 5 and 6 causes ambiguous which used by vec_cts
 * so we just redefine it and cast it
*/
FORCE_INLINE(vec_double2) vec_cvf(const vec_float4& a)
{ return vec_cvfo(vec_sldw(a, a, 1)); }
// converts word and doubleword to double-precision
#ifdef vec_ctd
# undef vec_ctd
#endif
VSX_REDIRECT_1RG(vec_double2, vec_int4,  vec_ctdo, __builtin_vsx_xvcvsxwdp)
VSX_REDIRECT_1RG(vec_double2, vec_uint4, vec_ctdo, __builtin_vsx_xvcvuxwdp)
VSX_IMPL_CONVERT(vec_double2, vec_dword2,  vec_ctd)
VSX_IMPL_CONVERT(vec_double2, vec_udword2, vec_ctd)
FORCE_INLINE(vec_double2) vec_ctd(const vec_int4& a)
{ return vec_ctdo(vec_sldw(a, a, 1)); }
FORCE_INLINE(vec_double2) vec_ctd(const vec_uint4& a)
{ return vec_ctdo(vec_sldw(a, a, 1)); }
// converts word and doubleword to single-precision
#if __clang_major__ > 4
# undef vec_ctf
#endif
VSX_IMPL_CONVERT(vec_float4, vec_int4,  vec_ctf)
VSX_IMPL_CONVERT(vec_float4, vec_uint4, vec_ctf)
VSX_REDIRECT_1RG(vec_float4, vec_dword2,  vec_ctf, __builtin_vsx_xvcvsxdsp)
VSX_REDIRECT_1RG(vec_float4, vec_udword2, vec_ctf, __builtin_vsx_xvcvuxdsp)
// converts single and double precision to signed word
#if __clang_major__ > 4
# undef vec_cts
# define vec_cts(__a, __b) \
_Generic((__a), vector float \
: (vector signed int)__builtin_altivec_vctsxs((__a), (__b)), vector double \
: __extension__({ \
vector double __ret = \
(__a) * \
(vector double)(vector unsigned long long)((0x3ffULL + (__b)) \
<< 52); \
__builtin_convertvector(__ret, vector signed long long); \
}))
#endif // __clang_major__ > 4
#endif
VSX_REDIRECT_1RG(vec_int4, vec_double2, vec_cts, __builtin_vsx_xvcvdpsxws)
VSX_IMPL_CONVERT(vec_int4, vec_float4, vec_cts)
// converts single and double precision to unsigned word
#if __clang_major__ > 4
# undef vec_ctu
#endif
VSX_REDIRECT_1RG(vec_uint4, vec_double2, vec_ctu, __builtin_vsx_xvcvdpuxws)
VSX_IMPL_CONVERT(vec_uint4, vec_float4, vec_ctu)
// converts single and double precision to signed doubleword
#ifdef vec_ctsl
# undef vec_ctsl
#endif
VSX_IMPL_CONVERT(vec_dword2, vec_double2, vec_ctsl)
// __builtin_convertvector unable to convert, xvcvspsxds is missing on it
FORCE_INLINE(vec_dword2) vec_ctslo(const vec_float4& a)
{ return vec_ctsl(vec_cvfo(a)); }
FORCE_INLINE(vec_dword2) vec_ctsl(const vec_float4& a)
{ return vec_ctsl(vec_cvf(a)); }
// converts single and double precision to unsigned doubleword
#ifdef vec_ctul
# undef vec_ctul
#endif
VSX_IMPL_CONVERT(vec_udword2, vec_double2, vec_ctul)
// __builtin_convertvector unable to convert, xvcvspuxds is missing on it
FORCE_INLINE(vec_udword2) vec_ctulo(const vec_float4& a)
{ return vec_ctul(vec_cvfo(a)); }
FORCE_INLINE(vec_udword2) vec_ctul(const vec_float4& a)
{ return vec_ctul(vec_cvf(a)); }
#endif // CLANG VSX compatibility
/*
* XLC VSX compatibility
**/
#if defined(__IBMCPP__)
// vector population count
#define vec_popcntu vec_popcnt
// overload and redirect wih setting second arg to zero
// since we only support conversions without the second arg
#define VSX_IMPL_OVERLOAD_Z2(rt, rg, fnm) \
FORCE_INLINE(rt) fnm(const rg& a) { return fnm(a, 0); }
VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_int4,    vec_ctd)
VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_uint4,   vec_ctd)
VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_dword2,  vec_ctd)
VSX_IMPL_OVERLOAD_Z2(vec_double2, vec_udword2, vec_ctd)
VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_int4,    vec_ctf)
VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_uint4,   vec_ctf)
VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_dword2,  vec_ctf)
VSX_IMPL_OVERLOAD_Z2(vec_float4, vec_udword2, vec_ctf)
VSX_IMPL_OVERLOAD_Z2(vec_int4, vec_double2, vec_cts)
VSX_IMPL_OVERLOAD_Z2(vec_int4, vec_float4,  vec_cts)
VSX_IMPL_OVERLOAD_Z2(vec_uint4, vec_double2, vec_ctu)
VSX_IMPL_OVERLOAD_Z2(vec_uint4, vec_float4,  vec_ctu)
VSX_IMPL_OVERLOAD_Z2(vec_dword2, vec_double2, vec_ctsl)
VSX_IMPL_OVERLOAD_Z2(vec_dword2, vec_float4,  vec_ctsl)
VSX_IMPL_OVERLOAD_Z2(vec_udword2, vec_double2, vec_ctul)
VSX_IMPL_OVERLOAD_Z2(vec_udword2, vec_float4,  vec_ctul)
// fixme: implement conversions of odd-numbered elements in a dirty way
// since xlc doesn't support VSX registers operand in inline asm.
#define VSX_IMPL_DIRTY_ODD(rt, rg, fnm, fn2) \
FORCE_INLINE(rt) fnm(const rg& a) { return fn2(vec_sldw(a, a, 3)); }
VSX_IMPL_DIRTY_ODD(vec_double2, vec_float4, vec_cvfo,  vec_cvf)
VSX_IMPL_DIRTY_ODD(vec_double2, vec_int4,   vec_ctdo,  vec_ctd)
VSX_IMPL_DIRTY_ODD(vec_double2, vec_uint4,  vec_ctdo,  vec_ctd)
VSX_IMPL_DIRTY_ODD(vec_dword2,  vec_float4, vec_ctslo, vec_ctsl)
VSX_IMPL_DIRTY_ODD(vec_udword2, vec_float4, vec_ctulo, vec_ctul)
#endif // XLC VSX compatibility
// ignore GCC warning that casued by -Wunused-but-set-variable in rare cases
#if defined(__GNUG__) && !defined(__clang__)
# define VSX_UNUSED(Tvec) Tvec __attribute__((__unused__))
#else // CLANG, XLC
# define VSX_UNUSED(Tvec) Tvec
#endif
// gcc can find his way in casting log int and XLC, CLANG ambiguous
#if defined(__clang__) || defined(__IBMCPP__)
FORCE_INLINE(vec_udword2) vec_splats(uint64 v)
{ return vec_splats((unsigned long long) v); }
FORCE_INLINE(vec_dword2) vec_splats(int64 v)
{ return vec_splats((long long) v); }
#endif
/*
* implement vsx_ld(offset, pointer), vsx_st(vector, offset, pointer)
* load and set using offset depend on the pointer type
...
...
@@ -468,75 +626,6 @@ VSX_IMPL_2VRG_F(vec_uint4, vec_udword2, "vpkudus %0,%2,%1", vec_packs)
{ vsx_stf(vec, VSX_OFFSET(o, p), (long long*)p); }
#endif
#if defined(__clang__) || defined(__IBMCPP__)
// gcc can find his way in casting log int and XLC, CLANG ambiguous
FORCE_INLINE(vec_udword2) vec_splats(uint64 v)
{ return vec_splats((unsigned long long) v); }
FORCE_INLINE(vec_dword2) vec_splats(int64 v)
{ return vec_splats((long long) v); }
#endif
// Implement store vector bool char for XLC
#if defined(__IBMCPP__) && defined(__clang__)
FORCE_INLINE(void) vec_xst(const vec_bchar16& vec, long o, uchar* p)
{ vec_xst(vec_uchar16_c(vec), VSX_OFFSET(o, p), p); }
#endif
// Working around vec_popcnt compatibility
/*
* vec_popcnt should return unsigned but clang has different thought just like gcc in vec_vpopcnt
*
* use vec_popcntu instead to deal with it
*/
#if defined(__clang__) && !defined(__IBMCPP__)
# define VSX_IMPL_CLANG_POPCNTU(Tvec, Tvec2, ucast) \
FORCE_INLINE(Tvec) vec_popcntu(const Tvec2& a) \
{ return ucast(vec_popcnt(a)); }
VSX_IMPL_CLANG_POPCNTU(vec_uchar16, vec_char16, vec_uchar16_c);
VSX_IMPL_CLANG_POPCNTU(vec_ushort8, vec_short8, vec_ushort8_c);
VSX_IMPL_CLANG_POPCNTU(vec_uint4,   vec_int4,   vec_uint4_c);
// redirect unsigned types
VSX_REDIRECT_1RG(vec_uchar16, vec_uchar16, vec_popcntu, vec_popcnt)
VSX_REDIRECT_1RG(vec_ushort8, vec_ushort8, vec_popcntu, vec_popcnt)
VSX_REDIRECT_1RG(vec_uint4,   vec_uint4,   vec_popcntu, vec_popcnt)
#else
# define vec_popcntu vec_popcnt
#endif
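Illustrative aside (not part of the diff): counting set bits per lane through the vec_popcntu wrapper, which always yields the unsigned vector type regardless of whether the compiler's own vec_popcnt returns signed or unsigned lanes. The values are examples only, assuming a VSX target with this header included.

vec_char16  bytes = vec_splats((signed char)0x0F);
vec_uchar16 bits  = vec_popcntu(bytes);   // every lane == 4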
// Working around vec_cts compatibility
/*
* vec_cts in gcc and clang converts single-precision to signed fixed-point word
* and from double-precision to signed doubleword, also there's no implement for vec_ctsl
*
* vec_cts in xlc converts single and double precision to signed fixed-point word
* and xlc has vec_ctsl which converts single and double precision to signed doubleword
*
* so to deal with this situation, use vec_cts only if you want to convert single-precision to signed fixed-point word
* and use vec_ctsl when you want to convert double-precision to signed doubleword
*
* Also we implemented vec_ctsw(a) to convert double-precision to signed fixed-point word
*/
// converts double-precision to signed doubleword for GCC and CLANG
#if !defined(vec_ctsl) && !defined(__IBMCPP__) && (defined(__GNUG__) || defined(__clang__))
// GCC4 has incorrect results in convert to signed doubleword
# if !defined(__clang__) && __GNUG__ < 5
# define vec_ctsl(a, b) __vec_ctsl(a)
VSX_IMPL_1RG(vec_dword2, wi, vec_double2, wd, xvcvdpsxds, __vec_ctsl)
# else // GCC > 4 , CLANG
# define vec_ctsl vec_cts
# endif
#endif
// converts double-precision to signed fixed-point word
#if defined(__IBMCPP__)
# define vec_ctsw(a) vec_cts(a, 0)
#else // GCC, CLANG
# define vec_ctsw(a) vec_int4_c(__builtin_vsx_xvcvdpsxws(a))
#endif
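A rough sketch (not part of the diff) of the convention the comment above describes. The one-argument calls assume the GCC code path of this header, where vec_cts and vec_ctsl are exposed as single-argument wrappers; exact argument forms differ on other compilers.

vec_float4  f4 = vec_splats(3.7f);
vec_double2 d2 = vec_splats(3.7);
vec_int4    w  = vec_cts(f4);    // single precision -> signed fixed-point word
vec_dword2  dw = vec_ctsl(d2);   // double precision -> signed doubleword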
// load 4 unsigned bytes into uint4 vector
#define vec_ld_buw(p) vec_uint4_set((p)[0], (p)[1], (p)[2], (p)[3])
...
...
@@ -566,14 +655,14 @@ FORCE_INLINE(Tvec) vec_ldz_l8(const Tp *p)
return vec_and(vec_ld_l8(p), (Tvec)mask); \
}
VSX_IMPL_LOAD_L8(vec_uchar16, uchar)
VSX_IMPL_LOAD_L8(vec_char16,  schar)
VSX_IMPL_LOAD_L8(vec_char16,  schar)
VSX_IMPL_LOAD_L8(vec_ushort8, ushort)
VSX_IMPL_LOAD_L8(vec_short8,  short)
VSX_IMPL_LOAD_L8(vec_uint4,   uint)
VSX_IMPL_LOAD_L8(vec_int4,    int)
VSX_IMPL_LOAD_L8(vec_float4,  float)
VSX_IMPL_LOAD_L8(vec_short8,  short)
VSX_IMPL_LOAD_L8(vec_uint4,   uint)
VSX_IMPL_LOAD_L8(vec_int4,    int)
VSX_IMPL_LOAD_L8(vec_float4,  float)
VSX_IMPL_LOAD_L8(vec_udword2, uint64)
VSX_IMPL_LOAD_L8(vec_dword2,  int64)
VSX_IMPL_LOAD_L8(vec_dword2,  int64)
VSX_IMPL_LOAD_L8(vec_double2, double)
// logical not
...
...
@@ -601,41 +690,45 @@ FORCE_INLINE(rt) vec_unpackhu(const rg& a) \
{ return reinterpret_cast<rt>(vec_mergeh(a, zero)); }
VSX_IMPL_UNPACKU(vec_ushort8, vec_uchar16, vec_uchar16_z)
VSX_IMPL_UNPACKU(vec_uint4,   vec_ushort8, vec_ushort8_z)
VSX_IMPL_UNPACKU(vec_udword2, vec_uint4,   vec_uint4_z)
VSX_IMPL_UNPACKU(vec_uint4,   vec_ushort8, vec_ushort8_z)
VSX_IMPL_UNPACKU(vec_udword2, vec_uint4,   vec_uint4_z)
/*
* Implement vec_mergesqe and vec_mergesqo
* Merges the sequence values of even and odd elements of two vectors
*/
#define VSX_IMPL_PERM(rt, fnm, ...) \
FORCE_INLINE(rt) fnm(const rt& a, const rt& b) \
{ static const vec_uchar16 perm = {__VA_ARGS__}; return vec_perm(a, b, perm); }
// 16
#define perm16_mergesqe 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
#define perm16_mergesqo 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
VSX_IMPL_PERM(vec_uchar16, vec_mergesqe, perm16_mergesqe)
VSX_IMPL_PERM(vec_uchar16, vec_mergesqo, perm16_mergesqo)
VSX_IMPL_PERM(vec_char16,  vec_mergesqe, perm16_mergesqe)
VSX_IMPL_PERM(vec_char16,  vec_mergesqo, perm16_mergesqo)
VSX_IMPL_PERM(vec_char16,  vec_mergesqe, perm16_mergesqe)
VSX_IMPL_PERM(vec_char16,  vec_mergesqo, perm16_mergesqo)
// 8
#define perm8_mergesqe 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29
#define perm8_mergesqo 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31
VSX_IMPL_PERM(vec_ushort8, vec_mergesqe, perm8_mergesqe)
VSX_IMPL_PERM(vec_ushort8, vec_mergesqo, perm8_mergesqo)
VSX_IMPL_PERM(vec_short8,  vec_mergesqe, perm8_mergesqe)
VSX_IMPL_PERM(vec_short8,  vec_mergesqo, perm8_mergesqo)
VSX_IMPL_PERM(vec_short8,  vec_mergesqe, perm8_mergesqe)
VSX_IMPL_PERM(vec_short8,  vec_mergesqo, perm8_mergesqo)
// 4
#define perm4_mergesqe 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
#define perm4_mergesqo 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
VSX_IMPL_PERM(vec_uint4,  vec_mergesqe, perm4_mergesqe)
VSX_IMPL_PERM(vec_uint4,  vec_mergesqo, perm4_mergesqo)
VSX_IMPL_PERM(vec_int4,   vec_mergesqe, perm4_mergesqe)
VSX_IMPL_PERM(vec_int4,   vec_mergesqo, perm4_mergesqo)
VSX_IMPL_PERM(vec_uint4,  vec_mergesqe, perm4_mergesqe)
VSX_IMPL_PERM(vec_uint4,  vec_mergesqo, perm4_mergesqo)
VSX_IMPL_PERM(vec_int4,   vec_mergesqe, perm4_mergesqe)
VSX_IMPL_PERM(vec_int4,   vec_mergesqo, perm4_mergesqo)
VSX_IMPL_PERM(vec_float4, vec_mergesqe, perm4_mergesqe)
VSX_IMPL_PERM(vec_float4, vec_mergesqo, perm4_mergesqo)
// 2
VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqe, vec_mergeh)
VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqo, vec_mergel)
VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mergesqe, vec_mergeh)
VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mergesqo, vec_mergel)
VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mergesqe, vec_mergeh)
VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mergesqo, vec_mergel)
VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqe, vec_mergeh)
VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqo, vec_mergel)
...
...
@@ -657,8 +750,8 @@ VSX_IMPL_MERGESQHL(vec_int4)
VSX_IMPL_MERGESQHL(vec_float4)
VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesqh, vec_mergeh)
VSX_REDIRECT_2RG(vec_udword2, vec_udword2, vec_mergesql, vec_mergel)
VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mergesqh, vec_mergeh)
VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mergesql, vec_mergel)
VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mergesqh, vec_mergeh)
VSX_REDIRECT_2RG(vec_dword2,  vec_dword2,  vec_mergesql, vec_mergel)
VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesqh, vec_mergeh)
VSX_REDIRECT_2RG(vec_double2, vec_double2, vec_mergesql, vec_mergel)
...
...
@@ -682,13 +775,13 @@ FORCE_INLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \
vsx_stf(vec_mergeh(ac, bd), 32, ptr); \
vsx_stf(vec_mergel(ac, bd), 48, ptr); \
}
VSX_IMPL_ST_INTERLEAVE(uchar,  vec_uchar16)
VSX_IMPL_ST_INTERLEAVE(schar,  vec_char16)
VSX_IMPL_ST_INTERLEAVE(uchar,  vec_uchar16)
VSX_IMPL_ST_INTERLEAVE(schar,  vec_char16)
VSX_IMPL_ST_INTERLEAVE(ushort, vec_ushort8)
VSX_IMPL_ST_INTERLEAVE(short,  vec_short8)
VSX_IMPL_ST_INTERLEAVE(uint,   vec_uint4)
VSX_IMPL_ST_INTERLEAVE(int,    vec_int4)
VSX_IMPL_ST_INTERLEAVE(float,  vec_float4)
VSX_IMPL_ST_INTERLEAVE(short,  vec_short8)
VSX_IMPL_ST_INTERLEAVE(uint,   vec_uint4)
VSX_IMPL_ST_INTERLEAVE(int,    vec_int4)
VSX_IMPL_ST_INTERLEAVE(float,  vec_float4)
// 2 and 4 channels deinterleave for 16 lanes
#define VSX_IMPL_ST_DINTERLEAVE_8(Tp, Tvec) \
...
...
@@ -748,7 +841,7 @@ FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, \
d = vec_mergesql(cd0, cd1); \
}
VSX_IMPL_ST_DINTERLEAVE_16(ushort, vec_ushort8)
VSX_IMPL_ST_DINTERLEAVE_16(short,  vec_short8)
VSX_IMPL_ST_DINTERLEAVE_16(short,  vec_short8)
// 2 and 4 channels deinterleave for 4 lanes
#define VSX_IMPL_ST_DINTERLEAVE_32(Tp, Tvec) \
...
...
@@ -777,8 +870,8 @@ FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, \
c = vec_mergeh(m0, m1); \
d = vec_mergel(m0, m1); \
}
VSX_IMPL_ST_DINTERLEAVE_32(uint,  vec_uint4)
VSX_IMPL_ST_DINTERLEAVE_32(int,   vec_int4)
VSX_IMPL_ST_DINTERLEAVE_32(uint,  vec_uint4)
VSX_IMPL_ST_DINTERLEAVE_32(int,   vec_int4)
VSX_IMPL_ST_DINTERLEAVE_32(float, vec_float4)
// 2 and 4 channels interleave and deinterleave for 2 lanes
...
...
@@ -815,9 +908,9 @@ FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, \
c = vec_mergeh(v0, v1); \
d = vec_mergel(v0, v1); \
}
VSX_IMPL_ST_D_INTERLEAVE_64(int64,  vec_dword2,  vsx_ld2, vsx_st2)
VSX_IMPL_ST_D_INTERLEAVE_64(int64,  vec_dword2,  vsx_ld2, vsx_st2)
VSX_IMPL_ST_D_INTERLEAVE_64(uint64, vec_udword2, vsx_ld2, vsx_st2)
VSX_IMPL_ST_D_INTERLEAVE_64(double, vec_double2, vsx_ld,  vsx_st)
VSX_IMPL_ST_D_INTERLEAVE_64(double, vec_double2, vsx_ld,  vsx_st)
/* 3 channels */
#define VSX_IMPL_ST_INTERLEAVE_3CH_16(Tp, Tvec) \
...
...
@@ -882,7 +975,7 @@ FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c)
c = vec_perm(vec_perm(v1, v2, c12_perm), v3, c123_perm); \
}
VSX_IMPL_ST_INTERLEAVE_3CH_8(ushort, vec_ushort8)
VSX_IMPL_ST_INTERLEAVE_3CH_8(short,  vec_short8)
VSX_IMPL_ST_INTERLEAVE_3CH_8(short,  vec_short8)
#define VSX_IMPL_ST_INTERLEAVE_3CH_4(Tp, Tvec) \
FORCE_INLINE(void) vec_st_interleave(const Tvec& a, const Tvec& b, \
...
...
@@ -907,8 +1000,8 @@ FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, Tvec& b, Tvec& c)
b = vec_perm(v2, vec_sld(v1, v3, 8), flp2); \
c = vec_perm(vec_sld(v2, v1, 8), v3, flp); \
}
VSX_IMPL_ST_INTERLEAVE_3CH_4(uint,  vec_uint4)
VSX_IMPL_ST_INTERLEAVE_3CH_4(int,   vec_int4)
VSX_IMPL_ST_INTERLEAVE_3CH_4(uint,  vec_uint4)
VSX_IMPL_ST_INTERLEAVE_3CH_4(int,   vec_int4)
VSX_IMPL_ST_INTERLEAVE_3CH_4(float, vec_float4)
#define VSX_IMPL_ST_INTERLEAVE_3CH_2(Tp, Tvec, ld_func, st_func) \
...
...
@@ -929,9 +1022,9 @@ FORCE_INLINE(void) vec_ld_deinterleave(const Tp* ptr, Tvec& a, \
b = vec_permi(v1, v3, 2); \
c = vec_permi(v2, v3, 1); \
}
VSX_IMPL_ST_INTERLEAVE_3CH_2(int64,  vec_dword2,  vsx_ld2, vsx_st2)
VSX_IMPL_ST_INTERLEAVE_3CH_2(int64,  vec_dword2,  vsx_ld2, vsx_st2)
VSX_IMPL_ST_INTERLEAVE_3CH_2(uint64, vec_udword2, vsx_ld2, vsx_st2)
VSX_IMPL_ST_INTERLEAVE_3CH_2(double, vec_double2, vsx_ld,  vsx_st)
VSX_IMPL_ST_INTERLEAVE_3CH_2(double, vec_double2, vsx_ld,  vsx_st)
#endif // CV_VSX
...
...
modules/core/src/rand.cpp
...
...
@@ -74,6 +74,12 @@ namespace cv
#define RNG_NEXT(x) ((uint64)(unsigned)(x)*CV_RNG_COEFF + ((x) >> 32))
#ifdef __PPC64__
#define PPC_MUL_ADD(ret, tmp, p0, p1) \
asm volatile("fmuls %0,%1,%2\n\t fadds %0,%0,%3" : "=&f" (ret) \
: "f" (tmp), "f" (p0), "f" (p1))
#endif
/***************************************************************************************\
* Pseudo-Random Number Generators (PRNGs) *
\***************************************************************************************/
...
...
@@ -248,6 +254,14 @@ static void randf_32f( float* arr, int len, uint64* state, const Vec2f* p, bool
        volatile float32x4_t v0 = vmulq_f32(vld1q_f32(f), p0);
        vst1q_f32(arr + i, vaddq_f32(v0, p1));
#elif defined __PPC64__
// inline asm is required for numerical stability!
// compilers tends to use floating multiply-add single(fmadds)
// instead of separate multiply and add
        PPC_MUL_ADD(arr[i+0], f[0], p[i+0][0], p[i+0][1]);
        PPC_MUL_ADD(arr[i+1], f[1], p[i+1][0], p[i+1][1]);
        PPC_MUL_ADD(arr[i+2], f[2], p[i+2][0], p[i+2][1]);
        PPC_MUL_ADD(arr[i+3], f[3], p[i+3][0], p[i+3][1]);
#else
        arr[i+0] = f[0]*p[i+0][0] + p[i+0][1];
        arr[i+1] = f[1]*p[i+1][0] + p[i+1][1];
...
...
@@ -269,6 +283,8 @@ static void randf_32f( float* arr, int len, uint64* state, const Vec2f* p, bool
            vdup_n_f32((float)(int)temp), vdup_n_f32(p[i][0])), vdup_n_f32(p[i][1]));
        arr[i] = vget_lane_f32(t, 0);
#elif defined __PPC64__
        PPC_MUL_ADD(arr[i], (float)(int)temp, p[i][0], p[i][1]);
#else
        arr[i] = (int)temp*p[i][0] + p[i][1];
#endif
...
...
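Illustrative aside (not part of the diff): why the patch forces separate multiply and add on PPC64, as the comment in the hunk above explains. A fused multiply-add rounds only once, so its result can differ in the last bit from the two-rounding sequence that the scalar fallback (and other platforms) compute; the inline asm pins the two-rounding behaviour. A hedged sketch of the difference:

#include <cmath>
// two roundings: the product is rounded to float before the addition (matches the scalar fallback)
float mul_then_add(float f, float p0, float p1) { float m = f * p0; return m + p1; }
// one rounding: what an optimizing compiler may emit as fmadds if left to itself
float fused(float f, float p0, float p1) { return std::fmaf(f, p0, p1); }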
platforms/linux/ppc64-gnu.toolchain.cmake  0 → 100644

set(CMAKE_SYSTEM_PROCESSOR ppc64)
set(GNU_MACHINE "powerpc64-linux-gnu" CACHE STRING "GNU compiler triple")
include("${CMAKE_CURRENT_LIST_DIR}/ppcat.toolchain.cmake")
platforms/linux/ppc64le-gnu.toolchain.cmake  0 → 100644

set(CMAKE_SYSTEM_PROCESSOR ppc64le)
set(GNU_MACHINE "powerpc64le-linux-gnu" CACHE STRING "GNU compiler triple")
include("${CMAKE_CURRENT_LIST_DIR}/ppcat.toolchain.cmake")
platforms/linux/ppcat.toolchain.cmake  0 → 100644

if(COMMAND toolchain_save_config)
  return() # prevent recursive call
endif()

option(AT_PATH "Advance Toolchain directory" "")
option(AT_RPATH "Add new directories to runtime search path" "")
option(AT_HOST_LINK "Enable/disable Link against host advance toolchain runtime" OFF)
option(AT_NO_AUTOVEC "Disable/enable Auto Vectorizer optimization" OFF)

set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_VERSION 1)

include("${CMAKE_CURRENT_LIST_DIR}/gnu.toolchain.cmake")

if(NOT DEFINED CMAKE_C_COMPILER)
  string(REGEX REPLACE "/+$" "" AT_PATH "${AT_PATH}")

  if(NOT AT_PATH)
    message(FATAL_ERROR "'AT_PATH' option is required. Please set it to Advance Toolchain path to get toolchain works")
  endif()

  if(NOT EXISTS ${AT_PATH})
    message(FATAL_ERROR "'${AT_PATH}' Advance Toolchain path isn't exist")
  endif()

  set(CMAKE_C_COMPILER "${AT_PATH}/bin/${GNU_MACHINE}-gcc")

  if(NOT EXISTS ${CMAKE_C_COMPILER})
    message(FATAL_ERROR "GNU C compiler isn't exist on path '${CMAKE_C_COMPILER}'. Please install Advance Toolchain with ${CMAKE_SYSTEM_PROCESSOR} supports")
  endif()
endif()

if(NOT DEFINED CMAKE_CXX_COMPILER)
  set(CMAKE_CXX_COMPILER "${AT_PATH}/bin/${GNU_MACHINE}-g++")

  if(NOT EXISTS ${CMAKE_CXX_COMPILER})
    message(FATAL_ERROR "GNU C++ compiler isn't exist. Invalid install of Advance Toolchain")
  endif()
endif()

if(NOT DEFINED AT_GCCROOT_PATH)
  set(AT_GCCROOT_PATH "${AT_PATH}/${GNU_MACHINE}")

  if(NOT EXISTS ${AT_GCCROOT_PATH})
    message(FATAL_ERROR "GCC root path '${AT_GCCROOT_PATH}' isn't exist. Invalid install of Advance Toolchain")
  endif()
endif()

if(NOT DEFINED AT_SYSROOT_PATH)
  if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "ppc64")
    set(AT_SYSROOT_PATH "${AT_PATH}/ppc")
  else()
    set(AT_SYSROOT_PATH "${AT_PATH}/${CMAKE_SYSTEM_PROCESSOR}")
  endif()

  if(NOT EXISTS ${AT_SYSROOT_PATH})
    message(FATAL_ERROR "System root path '${AT_SYSROOT_PATH}' isn't exist. Invalid install of Advance Toolchain")
  endif()
endif()

if(NOT DEFINED CMAKE_EXE_LINKER_FLAGS)
  set(CMAKE_CXX_FLAGS           "" CACHE INTERAL "")
  set(CMAKE_C_FLAGS             "" CACHE INTERAL "")
  set(CMAKE_EXE_LINKER_FLAGS    "" CACHE INTERAL "")
  set(CMAKE_SHARED_LINKER_FLAGS "" CACHE INTERAL "")
  set(CMAKE_MODULE_LINKER_FLAGS "" CACHE INTERAL "")

  if(AT_RPATH)
    string(REPLACE "," ";" RPATH_LIST ${AT_RPATH})
  endif()

  if(AT_HOST_LINK)
    #get 64-bit dynamic linker path
    file(STRINGS "${AT_SYSROOT_PATH}/usr/bin/ldd" RTLDLIST LIMIT_COUNT 1 REGEX "^RTLDLIST=[\"*\"]")
    string(REGEX REPLACE "RTLDLIST=|\"" "" RTLDLIST "${RTLDLIST}")
    string(REPLACE " " ";" RTLDLIST "${RTLDLIST}")

    #RTLDLIST must contains 32 and 64 bit paths
    list(LENGTH RTLDLIST RTLDLIST_LEN)
    if(NOT RTLDLIST_LEN GREATER 1)
      message(FATAL_ERROR "Could not fetch dynamic linker path. Invalid install of Advance Toolchain")
    endif()

    list(GET RTLDLIST 1 LINKER_PATH)
    set(CMAKE_EXE_LINKER_FLAGS "-Wl,--dynamic-linker=${AT_SYSROOT_PATH}${LINKER_PATH}")

    list(APPEND RPATH_LIST "${AT_GCCROOT_PATH}/lib64/")
    list(APPEND RPATH_LIST "${AT_SYSROOT_PATH}/lib64/")
    list(APPEND RPATH_LIST "${AT_SYSROOT_PATH}/usr/lib64/")
    list(APPEND RPATH_LIST "${PROJECT_BINARY_DIR}/lib/")
  endif()

  list(LENGTH RPATH_LIST RPATH_LEN)
  if(RPATH_LEN GREATER 0)
    set(AT_LINKER_FLAGS "${AT_LINKER_FLAGS} -Wl")
    foreach(RPATH ${RPATH_LIST})
      set(AT_LINKER_FLAGS "${AT_LINKER_FLAGS},-rpath,${RPATH}")
    endforeach()
  endif()

  set(CMAKE_SHARED_LINKER_FLAGS "${AT_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS}")
  set(CMAKE_MODULE_LINKER_FLAGS "${AT_LINKER_FLAGS} ${CMAKE_MODULE_LINKER_FLAGS}")
  set(CMAKE_EXE_LINKER_FLAGS    "${AT_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}")

  if(AT_NO_AUTOVEC)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-tree-vectorize -fno-tree-slp-vectorize")
    set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} -fno-tree-vectorize -fno-tree-slp-vectorize")
  endif()
endif()

set(CMAKE_FIND_ROOT_PATH ${CMAKE_FIND_ROOT_PATH} ${AT_SYSROOT_PATH} ${AT_GCCROOT_PATH})
set(CMAKE_SYSROOT ${AT_SYSROOT_PATH})

# what about ld.gold?
if(NOT DEFINED CMAKE_LINKER)
  find_program(CMAKE_LINKER NAMES ld)
endif()

if(NOT DEFINED CMAKE_AR)
  find_program(CMAKE_AR NAMES ar)
endif()

set(TOOLCHAIN_CONFIG_VARS ${TOOLCHAIN_CONFIG_VARS}
    CMAKE_SYSROOT
    AT_SYSROOT_PATH
    AT_GCCROOT_PATH
)
toolchain_save_config()
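For reference, these new toolchain files are selected the usual CMake way, e.g. something like cmake -DCMAKE_TOOLCHAIN_FILE=platforms/linux/ppc64le-gnu.toolchain.cmake -DAT_PATH=<Advance Toolchain install dir> <opencv source dir>. AT_PATH, together with the optional AT_RPATH, AT_HOST_LINK, and AT_NO_AUTOVEC switches defined above, are the knobs ppcat.toolchain.cmake exposes; the install-directory value shown here is only a placeholder.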