Commit 751cee8e authored by Maksim Shabunin's avatar Maksim Shabunin

Merge pull request #9907 from seiko2plus:vsxFixesImproves

parents 0608227e 2dc76d50
......@@ -523,24 +523,25 @@ OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_add_wrap, vec_add)
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_sub_wrap, vec_sub)
/** Bitwise shifts **/
// NOTE(review): this is the pre-change revision of OPENCV_HAL_IMPL_VSX_SHIFT_OP
// retained by the diff view -- it is redefined below with an extra 'shr'
// parameter. It uses vec_sr (logical shift) for operator>>/v_shr on every
// type, including the signed ones; only one of the two definitions may remain
// in the merged file.
#define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpuvec, splfunc) \
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
{ return _Tpuvec(vec_sl(a.val, splfunc(imm))); } \
inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
{ return _Tpuvec(vec_sr(a.val, splfunc(imm))); } \
template<int imm> inline _Tpuvec v_shl(const _Tpuvec& a) \
{ return _Tpuvec(vec_sl(a.val, splfunc(imm))); } \
template<int imm> inline _Tpuvec v_shr(const _Tpuvec& a) \
{ return _Tpuvec(vec_sr(a.val, splfunc(imm))); }
// instantiations: signed and unsigned types both get the logical right shift here
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint8x16, vec_uchar16_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int8x16, vec_uchar16_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint16x8, vec_ushort8_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int16x8, vec_ushort8_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint32x4, vec_uint4_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int32x4, vec_uint4_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint64x2, vec_udword2_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int64x2, vec_udword2_sp)
// Implements operator<<, operator>>, v_shl<imm> and v_shr<imm> for one
// vector type.
//   shr     - intrinsic used for the right shift: vec_sr (logical, zero fill)
//             for unsigned lanes, vec_sra (algebraic, sign extending) for
//             signed lanes
//   splfunc - splats the scalar shift count into a vector of shift amounts
#define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpvec, shr, splfunc) \
inline _Tpvec operator << (const _Tpvec& a, int imm) \
{ return _Tpvec(vec_sl(a.val, splfunc(imm))); } \
inline _Tpvec operator >> (const _Tpvec& a, int imm) \
{ return _Tpvec(shr(a.val, splfunc(imm))); } \
template<int imm> inline _Tpvec v_shl(const _Tpvec& a) \
{ return _Tpvec(vec_sl(a.val, splfunc(imm))); } \
template<int imm> inline _Tpvec v_shr(const _Tpvec& a) \
{ return _Tpvec(shr(a.val, splfunc(imm))); }
// logical right shift for unsigned types
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint8x16, vec_sr, vec_uchar16_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint16x8, vec_sr, vec_ushort8_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint32x4, vec_sr, vec_uint4_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_uint64x2, vec_sr, vec_udword2_sp)
// algebraic right shift
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int8x16, vec_sra, vec_uchar16_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int16x8, vec_sra, vec_ushort8_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int32x4, vec_sra, vec_uint4_sp)
OPENCV_HAL_IMPL_VSX_SHIFT_OP(v_int64x2, vec_sra, vec_udword2_sp)
/** Bitwise logic **/
#define OPENCV_HAL_IMPL_VSX_LOGIC_OP(_Tpvec) \
......@@ -605,6 +606,64 @@ OPENCV_HAL_IMPL_VSX_INT_CMP_OP(v_int64x2)
// element-wise minimum / maximum
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_min, vec_min)
OPENCV_HAL_IMPL_VSX_BIN_FUNC(v_max, vec_max)
/** Rotate **/
// Single-operand lane "rotate" in the universal-intrinsics sense: the vector
// is shifted by imm lanes and vacated lanes become zero (vec_slo/vec_sro
// shift whole octets in with zero fill).
//   wd      - shift amount converted from lanes to bytes; > 15 means the
//             whole register is shifted out, so a zero vector is returned
//   wd << 3 - converts the byte count to the bit encoding vec_slo/vec_sro
//             read from the splatted shift-control vector
#define OPENCV_IMPL_VSX_ROTATE(_Tpvec, suffix, shf, cast) \
template<int imm> \
inline _Tpvec v_rotate_##suffix(const _Tpvec& a) \
{ \
const int wd = imm * sizeof(typename _Tpvec::lane_type); \
if (wd > 15) \
return _Tpvec(); \
return _Tpvec((cast)shf(vec_uchar16_c(a.val), vec_uchar16_sp(wd << 3))); \
}
// instantiates both directions (left uses vec_slo, right uses vec_sro)
#define OPENCV_IMPL_VSX_ROTATE_LR(_Tpvec, cast) \
OPENCV_IMPL_VSX_ROTATE(_Tpvec, left, vec_slo, cast) \
OPENCV_IMPL_VSX_ROTATE(_Tpvec, right, vec_sro, cast)
OPENCV_IMPL_VSX_ROTATE_LR(v_uint8x16, vec_uchar16)
OPENCV_IMPL_VSX_ROTATE_LR(v_int8x16, vec_char16)
OPENCV_IMPL_VSX_ROTATE_LR(v_uint16x8, vec_ushort8)
OPENCV_IMPL_VSX_ROTATE_LR(v_int16x8, vec_short8)
OPENCV_IMPL_VSX_ROTATE_LR(v_uint32x4, vec_uint4)
OPENCV_IMPL_VSX_ROTATE_LR(v_int32x4, vec_int4)
OPENCV_IMPL_VSX_ROTATE_LR(v_uint64x2, vec_udword2)
OPENCV_IMPL_VSX_ROTATE_LR(v_int64x2, vec_dword2)
// Two-operand rotate right by imm lanes: the result takes the high lanes of
// 'a' followed by the low lanes of 'b' (treating b:a as one wide register).
template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b)
{
// shift amount in bytes
const int wd = imm * sizeof(typename _Tpvec::lane_type);
if (wd == 0)
// vec_sld takes a literal byte count in 0..15; wd == 0 would make
// 16 - wd == 16, which is out of range, so handle it explicitly
return a;
return _Tpvec(vec_sld(b.val, a.val, 16 - wd));
}
// Two-operand rotate left by imm lanes: low lanes come from 'a', lanes
// shifted in come from 'b' (treating b:a as one wide register).
template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b)
{
// shift amount in bytes
const int wd = imm * sizeof(typename _Tpvec::lane_type);
if (wd == 16)
// full-register shift: vec_sld's byte count must stay below 16
return b;
return _Tpvec(vec_sld(a.val, b.val, wd));
}
// Specialization of the two-operand rotate for 64-bit-lane vectors, where the
// only interesting case is imm == 1 (one lane from each operand, combined via
// vec_permi; rg1/rg2 select the operand order, swapped between left/right).
// imm == 0 returns 'a' unchanged and any larger imm returns 'b'.
// NOTE(review): the vec_permi selector '2' is assumed to pick the lanes that
// realize the one-lane rotate -- verify against the vec_permi definition in
// vsx_utils.
#define OPENCV_IMPL_VSX_ROTATE_64(_Tpvec, suffix, rg1, rg2) \
template<int imm> \
inline _Tpvec v_rotate_##suffix(const _Tpvec& a, const _Tpvec& b) \
{ \
if (imm == 1) \
return _Tpvec(vec_permi(rg1.val, rg2.val, 2)); \
return imm ? b : a; \
}
OPENCV_IMPL_VSX_ROTATE_64(v_int64x2, right, a, b)
OPENCV_IMPL_VSX_ROTATE_64(v_uint64x2, right, a, b)
OPENCV_IMPL_VSX_ROTATE_64(v_int64x2, left, b, a)
OPENCV_IMPL_VSX_ROTATE_64(v_uint64x2, left, b, a)
////////// Reduce and mask /////////
/** Reduce **/
......@@ -726,7 +785,7 @@ inline int v_signmask(const v_float32x4& a)
// Collect the sign bits of the two 64-bit lanes into bits 0 and 1 of an int.
inline int v_signmask(const v_int64x2& a)
{
// NOTE(review): the two declarations below are overlapping old/new revisions
// from the diff view (the VSX_UNUSED-wrapped one is the replacement, added to
// silence an unused-variable warning on some builds); exactly one of them
// must remain in the merged file.
const vec_dword2 sv = vec_sr(a.val, vec_udword2_sp(63));
VSX_UNUSED(const vec_dword2) sv = vec_sr(a.val, vec_udword2_sp(63));
// after the shift each lane holds its sign bit in bit 0
return (int)vec_extract(sv, 0) | (int)vec_extract(sv, 1) << 1;
}
inline int v_signmask(const v_uint64x2& a)
......@@ -812,66 +871,47 @@ OPENCV_HAL_IMPL_VSX_BIN_FUNC2(v_int64x2, v_uint64x2, vec_udword2_c, v_absdiff, v
/** Rounding **/
// NOTE(review): this diff-extracted section shows the old and the new body of
// each conversion back to back without +/- markers. The bodies passing an
// explicit ", 0" scale argument or building a vec_perm table appear to be the
// old revision; the compact bodies using vec_cts()/vec_mergesqo() are the new
// one. Exactly one body per function must remain in the merged file.
// Round each float32 lane to the nearest integer.
inline v_int32x4 v_round(const v_float32x4& a)
// old revision:
{ return v_int32x4(vec_cts(vec_round(a.val), 0)); }
// new revision:
{ return v_int32x4(vec_cts(vec_round(a.val))); }
// Round the two float64 lanes to int32, packing the results against zeros.
inline v_int32x4 v_round(const v_float64x2& a)
// old revision:
{
static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0};
return v_int32x4(vec_perm(vec_int4_z, vec_ctsw(vec_round(a.val)), perm));
}
// new revision:
{ return v_int32x4(vec_mergesqo(vec_cts(vec_round(a.val)), vec_int4_z)); }
// Round each float32 lane toward negative infinity.
inline v_int32x4 v_floor(const v_float32x4& a)
// old revision:
{ return v_int32x4(vec_cts(vec_floor(a.val), 0)); }
// new revision:
{ return v_int32x4(vec_cts(vec_floor(a.val))); }
inline v_int32x4 v_floor(const v_float64x2& a)
// old revision:
{
static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0};
return v_int32x4(vec_perm(vec_int4_z, vec_ctsw(vec_floor(a.val)), perm));
}
// new revision:
{ return v_int32x4(vec_mergesqo(vec_cts(vec_floor(a.val)), vec_int4_z)); }
// Round each float32 lane toward positive infinity.
inline v_int32x4 v_ceil(const v_float32x4& a)
// old revision:
{ return v_int32x4(vec_cts(vec_ceil(a.val), 0)); }
// new revision:
{ return v_int32x4(vec_cts(vec_ceil(a.val))); }
inline v_int32x4 v_ceil(const v_float64x2& a)
// old revision:
{
static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0};
return v_int32x4(vec_perm(vec_int4_z, vec_ctsw(vec_ceil(a.val)), perm));
}
// new revision:
{ return v_int32x4(vec_mergesqo(vec_cts(vec_ceil(a.val)), vec_int4_z)); }
// Truncate each float32 lane toward zero.
inline v_int32x4 v_trunc(const v_float32x4& a)
// old revision:
{ return v_int32x4(vec_cts(a.val, 0)); }
// new revision:
{ return v_int32x4(vec_cts(a.val)); }
inline v_int32x4 v_trunc(const v_float64x2& a)
// old revision:
{
static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0};
return v_int32x4(vec_perm(vec_int4_z, vec_ctsw(a.val), perm));
}
// new revision:
{ return v_int32x4(vec_mergesqo(vec_cts(a.val), vec_int4_z)); }
/** To float **/
// NOTE(review): as in the rounding section above, each function below carries
// both its old and its new diff revision; only one body per function belongs
// in the merged file. The vec_ctdo/vec_cvfo helpers are assumed to be
// "convert odd elements" wrappers, fed the desired lanes by the
// vec_mergeh/vec_mergel(a, a) duplication -- verify against vsx_utils.
// int32 lanes -> float32 lanes
inline v_float32x4 v_cvt_f32(const v_int32x4& a)
// old revision:
{ return v_float32x4(vec_ctf(a.val, 0)); }
// new revision:
{ return v_float32x4(vec_ctf(a.val)); }
// two float64 lanes -> two float32 lanes packed against zeros
inline v_float32x4 v_cvt_f32(const v_float64x2& a)
// old revision:
{
static const vec_uchar16 perm = {16, 17, 18, 19, 24, 25, 26, 27, 0, 0, 0, 0, 0, 0, 0, 0};
return v_float32x4(vec_perm(vec_float4_z, vec_cvf(a.val), perm));
}
// new revision:
{ return v_float32x4(vec_mergesqo(vec_cvf(a.val), vec_float4_z)); }
// low two int32 lanes -> float64
inline v_float64x2 v_cvt_f64(const v_int32x4& a)
// old revision:
{
return v_float64x2(vec_ctd(vec_mergeh(a.val, a.val), 0));
}
// new revision:
{ return v_float64x2(vec_ctdo(vec_mergeh(a.val, a.val))); }
// high two int32 lanes -> float64
inline v_float64x2 v_cvt_f64_high(const v_int32x4& a)
// old revision:
{
return v_float64x2(vec_ctd(vec_mergel(a.val, a.val), 0));
}
// new revision:
{ return v_float64x2(vec_ctdo(vec_mergel(a.val, a.val))); }
// low two float32 lanes -> float64
inline v_float64x2 v_cvt_f64(const v_float32x4& a)
// old revision:
{
return v_float64x2(vec_cvf(vec_mergeh(a.val, a.val)));
}
// new revision:
{ return v_float64x2(vec_cvfo(vec_mergeh(a.val, a.val))); }
// high two float32 lanes -> float64
inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
// old revision:
{
return v_float64x2(vec_cvf(vec_mergel(a.val, a.val)));
}
// new revision:
{ return v_float64x2(vec_cvfo(vec_mergel(a.val, a.val))); }
/** Reinterpret **/
/** its up there with load and store operations **/
......@@ -888,10 +928,20 @@ inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
const vec_float4 v0 = vec_splat(v.val, 0);
const vec_float4 v1 = vec_splat(v.val, 1);
const vec_float4 v2 = vec_splat(v.val, 2);
const vec_float4 v3 = vec_splat(v.val, 3);
VSX_UNUSED(const vec_float4) v3 = vec_splat(v.val, 3);
return v_float32x4(vec_madd(v0, m0.val, vec_madd(v1, m1.val, vec_madd(v2, m2.val, vec_mul(v3, m3.val)))));
}
// 3x3 matrix transform of 'v' with an additive term:
//   result = v[0]*m0 + v[1]*m1 + v[2]*m2 + a
// Built from fused multiply-adds, accumulating innermost-out in the same
// order as the nested vec_madd expression used by v_matmul.
inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    vec_float4 acc = vec_madd(vec_splat(v.val, 2), m2.val, a.val);
    acc = vec_madd(vec_splat(v.val, 1), m1.val, acc);
    acc = vec_madd(vec_splat(v.val, 0), m0.val, acc);
    return v_float32x4(acc);
}
#define OPENCV_HAL_IMPL_VSX_TRANSPOSE4x4(_Tpvec, _Tpvec2) \
inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
const _Tpvec& a2, const _Tpvec& a3, \
......
......@@ -74,6 +74,12 @@ namespace cv
// Advance the 64-bit RNG state (MWC-style: low word times CV_RNG_COEFF plus
// the previous carry from the high word).
#define RNG_NEXT(x) ((uint64)(unsigned)(x)*CV_RNG_COEFF + ((x) >> 32))
#ifdef __PPC64__
// ret = tmp * p0 + p1 computed with a separate single-precision multiply
// (fmuls) and add (fadds). Written as inline asm so the compiler cannot
// contract the pair into a fused multiply-add (fmadds), which rounds
// differently -- see the numerical-stability comment at the use site.
#define PPC_MUL_ADD(ret, tmp, p0, p1) \
asm volatile("fmuls %0,%1,%2\n\t fadds %0,%0,%3" : "=&f" (ret) \
: "f" (tmp), "f" (p0), "f" (p1))
#endif
/***************************************************************************************\
* Pseudo-Random Number Generators (PRNGs) *
\***************************************************************************************/
......@@ -248,6 +254,14 @@ static void randf_32f( float* arr, int len, uint64* state, const Vec2f* p, bool
volatile float32x4_t v0 = vmulq_f32(vld1q_f32(f), p0);
vst1q_f32(arr+i, vaddq_f32(v0, p1));
#elif defined __PPC64__
// inline asm is required for numerical stability!
// compilers tends to use floating multiply-add single(fmadds)
// instead of separate multiply and add
PPC_MUL_ADD(arr[i+0], f[0], p[i+0][0], p[i+0][1]);
PPC_MUL_ADD(arr[i+1], f[1], p[i+1][0], p[i+1][1]);
PPC_MUL_ADD(arr[i+2], f[2], p[i+2][0], p[i+2][1]);
PPC_MUL_ADD(arr[i+3], f[3], p[i+3][0], p[i+3][1]);
#else
arr[i+0] = f[0]*p[i+0][0] + p[i+0][1];
arr[i+1] = f[1]*p[i+1][0] + p[i+1][1];
......@@ -269,6 +283,8 @@ static void randf_32f( float* arr, int len, uint64* state, const Vec2f* p, bool
vdup_n_f32((float)(int)temp), vdup_n_f32(p[i][0])),
vdup_n_f32(p[i][1]));
arr[i] = vget_lane_f32(t, 0);
#elif defined __PPC64__
PPC_MUL_ADD(arr[i], (float)(int)temp, p[i][0], p[i][1]);
#else
arr[i] = (int)temp*p[i][0] + p[i][1];
#endif
......
# Toolchain entry point for 64-bit big-endian PowerPC (powerpc64-linux-gnu)
# cross builds; the shared Advance Toolchain logic is in ppcat.toolchain.cmake.
set(CMAKE_SYSTEM_PROCESSOR ppc64)
set(GNU_MACHINE "powerpc64-linux-gnu" CACHE STRING "GNU compiler triple")
include("${CMAKE_CURRENT_LIST_DIR}/ppcat.toolchain.cmake")
# Toolchain entry point for 64-bit little-endian PowerPC (powerpc64le-linux-gnu)
# cross builds; the shared Advance Toolchain logic is in ppcat.toolchain.cmake.
set(CMAKE_SYSTEM_PROCESSOR ppc64le)
set(GNU_MACHINE "powerpc64le-linux-gnu" CACHE STRING "GNU compiler triple")
include("${CMAKE_CURRENT_LIST_DIR}/ppcat.toolchain.cmake")
# Toolchain configuration for cross-compiling with the IBM Advance Toolchain
# (AT). Included from the ppc64/ppc64le toolchain files, which set
# CMAKE_SYSTEM_PROCESSOR and GNU_MACHINE beforehand.
if(COMMAND toolchain_save_config)
  return() # prevent recursive call
endif()

# User-settable configuration. AT_PATH and AT_RPATH hold strings, so declare
# them as STRING cache entries instead of option(), which is boolean-only.
set(AT_PATH "" CACHE STRING "Advance Toolchain directory")
set(AT_RPATH "" CACHE STRING "Add new directories to runtime search path (comma-separated)")
option(AT_HOST_LINK "Enable/disable Link against host advance toolchain runtime" OFF)
option(AT_NO_AUTOVEC "Disable/enable Auto Vectorizer optimization" OFF)

set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_VERSION 1)

include("${CMAKE_CURRENT_LIST_DIR}/gnu.toolchain.cmake")

if(NOT DEFINED CMAKE_C_COMPILER)
  # strip trailing slashes so the path concatenations below stay clean
  string(REGEX REPLACE "/+$" "" AT_PATH "${AT_PATH}")
  if(NOT AT_PATH)
    message(FATAL_ERROR "'AT_PATH' option is required. Please set it to the Advance Toolchain path")
  endif()
  if(NOT EXISTS "${AT_PATH}")
    message(FATAL_ERROR "Advance Toolchain path '${AT_PATH}' does not exist")
  endif()
  set(CMAKE_C_COMPILER "${AT_PATH}/bin/${GNU_MACHINE}-gcc")
  if(NOT EXISTS "${CMAKE_C_COMPILER}")
    message(FATAL_ERROR "GNU C compiler not found at '${CMAKE_C_COMPILER}'. Please install an Advance Toolchain build with ${CMAKE_SYSTEM_PROCESSOR} support")
  endif()
endif()

if(NOT DEFINED CMAKE_CXX_COMPILER)
  set(CMAKE_CXX_COMPILER "${AT_PATH}/bin/${GNU_MACHINE}-g++")
  if(NOT EXISTS "${CMAKE_CXX_COMPILER}")
    message(FATAL_ERROR "GNU C++ compiler not found at '${CMAKE_CXX_COMPILER}'. Invalid install of Advance Toolchain")
  endif()
endif()

if(NOT DEFINED AT_GCCROOT_PATH)
  set(AT_GCCROOT_PATH "${AT_PATH}/${GNU_MACHINE}")
  if(NOT EXISTS "${AT_GCCROOT_PATH}")
    message(FATAL_ERROR "GCC root path '${AT_GCCROOT_PATH}' does not exist. Invalid install of Advance Toolchain")
  endif()
endif()

if(NOT DEFINED AT_SYSROOT_PATH)
  # the big-endian sysroot lives under 'ppc'; others match the processor name
  if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "ppc64")
    set(AT_SYSROOT_PATH "${AT_PATH}/ppc")
  else()
    set(AT_SYSROOT_PATH "${AT_PATH}/${CMAKE_SYSTEM_PROCESSOR}")
  endif()
  if(NOT EXISTS "${AT_SYSROOT_PATH}")
    message(FATAL_ERROR "System root path '${AT_SYSROOT_PATH}' does not exist. Invalid install of Advance Toolchain")
  endif()
endif()

if(NOT DEFINED CMAKE_EXE_LINKER_FLAGS)
  # INTERNAL (was misspelled "INTERAL") hides these entries from the GUI;
  # note INTERNAL implies FORCE, but this branch only runs when the flags
  # are not yet defined.
  set(CMAKE_CXX_FLAGS           "" CACHE INTERNAL "")
  set(CMAKE_C_FLAGS             "" CACHE INTERNAL "")
  set(CMAKE_EXE_LINKER_FLAGS    "" CACHE INTERNAL "")
  set(CMAKE_SHARED_LINKER_FLAGS "" CACHE INTERNAL "")
  set(CMAKE_MODULE_LINKER_FLAGS "" CACHE INTERNAL "")

  if(AT_RPATH)
    string(REPLACE "," ";" RPATH_LIST "${AT_RPATH}")
  endif()

  if(AT_HOST_LINK)
    # fetch the 64-bit dynamic linker path from the sysroot's ldd script
    file(STRINGS "${AT_SYSROOT_PATH}/usr/bin/ldd" RTLDLIST LIMIT_COUNT 1 REGEX "^RTLDLIST=[\"*\"]")
    string(REGEX REPLACE "RTLDLIST=|\"" "" RTLDLIST "${RTLDLIST}")
    string(REPLACE " " ";" RTLDLIST "${RTLDLIST}")
    # RTLDLIST must contain both the 32-bit and the 64-bit paths
    list(LENGTH RTLDLIST RTLDLIST_LEN)
    if(NOT RTLDLIST_LEN GREATER 1)
      message(FATAL_ERROR "Could not fetch dynamic linker path. Invalid install of Advance Toolchain")
    endif()
    list(GET RTLDLIST 1 LINKER_PATH)
    set(CMAKE_EXE_LINKER_FLAGS "-Wl,--dynamic-linker=${AT_SYSROOT_PATH}${LINKER_PATH}")
    list(APPEND RPATH_LIST "${AT_GCCROOT_PATH}/lib64/")
    list(APPEND RPATH_LIST "${AT_SYSROOT_PATH}/lib64/")
    list(APPEND RPATH_LIST "${AT_SYSROOT_PATH}/usr/lib64/")
    list(APPEND RPATH_LIST "${PROJECT_BINARY_DIR}/lib/")
  endif()

  # translate the collected rpath directories into a single -Wl,-rpath,... flag
  list(LENGTH RPATH_LIST RPATH_LEN)
  if(RPATH_LEN GREATER 0)
    set(AT_LINKER_FLAGS "${AT_LINKER_FLAGS} -Wl")
    foreach(RPATH ${RPATH_LIST})
      set(AT_LINKER_FLAGS "${AT_LINKER_FLAGS},-rpath,${RPATH}")
    endforeach()
  endif()

  set(CMAKE_SHARED_LINKER_FLAGS "${AT_LINKER_FLAGS} ${CMAKE_SHARED_LINKER_FLAGS}")
  set(CMAKE_MODULE_LINKER_FLAGS "${AT_LINKER_FLAGS} ${CMAKE_MODULE_LINKER_FLAGS}")
  set(CMAKE_EXE_LINKER_FLAGS    "${AT_LINKER_FLAGS} ${CMAKE_EXE_LINKER_FLAGS}")

  if(AT_NO_AUTOVEC)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-tree-vectorize -fno-tree-slp-vectorize")
    set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} -fno-tree-vectorize -fno-tree-slp-vectorize")
  endif()
endif()

set(CMAKE_FIND_ROOT_PATH ${CMAKE_FIND_ROOT_PATH} ${AT_SYSROOT_PATH} ${AT_GCCROOT_PATH})
set(CMAKE_SYSROOT ${AT_SYSROOT_PATH})

# what about ld.gold?
if(NOT DEFINED CMAKE_LINKER)
  find_program(CMAKE_LINKER NAMES ld)
endif()
if(NOT DEFINED CMAKE_AR)
  find_program(CMAKE_AR NAMES ar)
endif()

set(TOOLCHAIN_CONFIG_VARS ${TOOLCHAIN_CONFIG_VARS}
    CMAKE_SYSROOT
    AT_SYSROOT_PATH
    AT_GCCROOT_PATH
)
toolchain_save_config()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment