Commit 4e60db90 authored by Alexander Alekhin

Merge pull request #14110 from seiko2plus:core_vsx_fp16

parents a8e635f1 f4135968
@@ -294,11 +294,18 @@ endif()
 # workaround gcc bug for aligned ld/st
 # https://github.com/opencv/opencv/issues/13211
 if((PPC64LE AND NOT CMAKE_CROSSCOMPILING) OR OPENCV_FORCE_COMPILER_CHECK_VSX_ALIGNED)
-  ocv_check_runtime_flag("${CPU_BASELINE_FLAGS}" "OPENCV_CHECK_VSX_ALIGNED" "${OpenCV_SOURCE_DIR}/cmake/checks/runtime/cpu_vsx_aligned.cpp")
+  ocv_check_runtime_flag("${CPU_BASELINE_FLAGS}" OPENCV_CHECK_VSX_ALIGNED "${OpenCV_SOURCE_DIR}/cmake/checks/runtime/cpu_vsx_aligned.cpp")
   if(NOT OPENCV_CHECK_VSX_ALIGNED)
     add_extra_compiler_option_force(-DCV_COMPILER_VSX_BROKEN_ALIGNED)
   endif()
 endif()
+# validate inline asm with fixed register numbers and the constraints wa, wd, wf
+if(PPC64LE)
+  ocv_check_compiler_flag(CXX "${CPU_BASELINE_FLAGS}" OPENCV_CHECK_VSX_ASM "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_vsx_asm.cpp")
+  if(NOT OPENCV_CHECK_VSX_ASM)
+    add_extra_compiler_option_force(-DCV_COMPILER_VSX_BROKEN_ASM)
+  endif()
+endif()
 # combine all "extra" options
 if(NOT OPENCV_SKIP_EXTRA_COMPILER_FLAGS)
...
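The define forced here is what the guarded code paths check at compile time: when the probe fails, the inline-asm variants are skipped in favor of plain built-ins (the real consumers appear in intrin_vsx.hpp further down). A minimal sketch of that pattern, assuming VSX is enabled; the function name is illustrative and not part of the patch:

    #include <altivec.h>

    // Same statement as the probe in cpu_vsx_asm.cpp: convert signed words to single precision.
    static inline __vector float cvt_words_to_sp(__vector signed int vi)
    {
    #ifndef CV_COMPILER_VSX_BROKEN_ASM
        // the compiler accepted %x<n> with the wf/wa constraints, so the asm path is usable
        __vector float vf;
        __asm__ __volatile__ ("xvcvsxwsp %x0,%x1" : "=wf" (vf) : "wa" (vi));
        return vf;
    #else
        // -DCV_COMPILER_VSX_BROKEN_ASM was forced: fall back to the portable AltiVec built-in
        return vec_ctf(vi, 0);
    #endif
    }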
#if defined(__VSX__)
    #if defined(__PPC64__) && defined(__LITTLE_ENDIAN__)
        #include <altivec.h>
    #else
        #error "OpenCV only supports little-endian mode"
    #endif
#else
    #error "VSX is not supported"
#endif
/*
 * xlc and some versions of clang don't support %x<n> in the inline asm template,
 * which fixes the register number when any of the register constraints wa, wd, wf is used
 */
int main()
{
    __vector float vf;
    __vector signed int vi;
    __asm__ __volatile__ ("xvcvsxwsp %x0,%x1" : "=wf" (vf) : "wa" (vi));
    return 0;
}
\ No newline at end of file
@@ -3,7 +3,7 @@ set(the_description "The Core Functionality")
 ocv_add_dispatched_file(mathfuncs_core SSE2 AVX AVX2)
 ocv_add_dispatched_file(stat SSE4_2 AVX2)
 ocv_add_dispatched_file(arithm SSE2 SSE4_1 AVX2 VSX3)
-ocv_add_dispatched_file(convert SSE2 AVX2)
+ocv_add_dispatched_file(convert SSE2 AVX2 VSX3)
 ocv_add_dispatched_file(convert_scale SSE2 AVX2)
 ocv_add_dispatched_file(count_non_zero SSE2 AVX2)
 ocv_add_dispatched_file(matmul SSE2 AVX2)
...
@@ -11,11 +11,6 @@
 #define CV_SIMD128 1
 #define CV_SIMD128_64F 1

-/**
- * todo: supporting half precision for power9
- * conversion instructions xvcvhpsp, xvcvsphp
- **/
-
 namespace cv
 {
@@ -1203,20 +1198,62 @@ inline v_float32x4 v_pack_triplets(const v_float32x4& vec)

 /////// FP16 support ////////

-// [TODO] implement these 2 using VSX or universal intrinsics (copy from intrin_sse.cpp and adapt)
 inline v_float32x4 v_load_expand(const float16_t* ptr)
 {
-    return v_float32x4((float)ptr[0], (float)ptr[1], (float)ptr[2], (float)ptr[3]);
+    vec_ushort8 vf16 = vec_ld_l8((const ushort*)ptr);
+#if CV_VSX3 && defined(vec_extract_fp_from_shorth)
+    return v_float32x4(vec_extract_fp_from_shorth(vf16));
+#elif CV_VSX3 && !defined(CV_COMPILER_VSX_BROKEN_ASM)
+    vec_float4 vf32;
+    __asm__ __volatile__ ("xvcvhpsp %x0,%x1" : "=wf" (vf32) : "wa" (vec_mergeh(vf16, vf16)));
+    return v_float32x4(vf32);
+#else
+    const vec_int4 z = vec_int4_z, delta = vec_int4_sp(0x38000000);
+    const vec_int4 signmask = vec_int4_sp(0x80000000);
+    const vec_int4 maxexp = vec_int4_sp(0x7c000000);
+    const vec_float4 deltaf = vec_float4_c(vec_int4_sp(0x38800000));
+
+    vec_int4 bits = vec_int4_c(vec_mergeh(vec_short8_c(z), vec_short8_c(vf16)));
+    vec_int4 e = vec_and(bits, maxexp), sign = vec_and(bits, signmask);
+    vec_int4 t = vec_add(vec_sr(vec_xor(bits, sign), vec_uint4_sp(3)), delta); // ((h & 0x7fff) << 13) + delta
+    vec_int4 zt = vec_int4_c(vec_sub(vec_float4_c(vec_add(t, vec_int4_sp(1 << 23))), deltaf));
+
+    t = vec_add(t, vec_and(delta, vec_cmpeq(maxexp, e)));
+    vec_bint4 zmask = vec_cmpeq(e, z);
+    vec_int4 ft = vec_sel(t, zt, zmask);
+    return v_float32x4(vec_float4_c(vec_or(ft, sign)));
+#endif
 }

 inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
 {
-    float CV_DECL_ALIGNED(32) f[4];
-    v_store_aligned(f, v);
-    ptr[0] = float16_t(f[0]);
-    ptr[1] = float16_t(f[1]);
-    ptr[2] = float16_t(f[2]);
-    ptr[3] = float16_t(f[3]);
+    // fixme: is there any built-in op or intrinsic that covers "xvcvsphp"?
+#if CV_VSX3 && !defined(CV_COMPILER_VSX_BROKEN_ASM)
+    vec_ushort8 vf16;
+    __asm__ __volatile__ ("xvcvsphp %x0,%x1" : "=wa" (vf16) : "wf" (v.val));
+    vec_st_l8(vec_mergesqe(vf16, vf16), ptr);
+#else
+    const vec_int4 signmask = vec_int4_sp(0x80000000);
+    const vec_int4 rval = vec_int4_sp(0x3f000000);
+
+    vec_int4 t = vec_int4_c(v.val);
+    vec_int4 sign = vec_sra(vec_and(t, signmask), vec_uint4_sp(16));
+    t = vec_and(vec_nor(signmask, signmask), t);
+
+    vec_bint4 finitemask = vec_cmpgt(vec_int4_sp(0x47800000), t);
+    vec_bint4 isnan = vec_cmpgt(t, vec_int4_sp(0x7f800000));
+    vec_int4 naninf = vec_sel(vec_int4_sp(0x7c00), vec_int4_sp(0x7e00), isnan);
+    vec_bint4 tinymask = vec_cmpgt(vec_int4_sp(0x38800000), t);
+
+    vec_int4 tt = vec_int4_c(vec_add(vec_float4_c(t), vec_float4_c(rval)));
+    tt = vec_sub(tt, rval);
+    vec_int4 odd = vec_and(vec_sr(t, vec_uint4_sp(13)), vec_int4_sp(1));
+    vec_int4 nt = vec_add(t, vec_int4_sp(0xc8000fff));
+    nt = vec_sr(vec_add(nt, odd), vec_uint4_sp(13));
+
+    t = vec_sel(nt, tt, tinymask);
+    t = vec_sel(naninf, t, finitemask);
+    t = vec_or(t, sign);
+    vec_st_l8(vec_packs(t, t), ptr);
+#endif
 }

 inline void v_cleanup() {}
...
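The #else branches above vectorize the standard IEEE 754 binary16/binary32 bit manipulation. As a reference for what each lane computes, here is a scalar sketch of the same logic in plain C++; the helper names are illustrative and the code is not part of the patch:

    #include <cstdint>
    #include <cstring>

    static inline uint32_t f2u(float x)    { uint32_t u; std::memcpy(&u, &x, 4); return u; }
    static inline float    u2f(uint32_t u) { float x; std::memcpy(&x, &u, 4); return x; }

    static inline float half_to_float(uint16_t h)
    {
        uint32_t sign = (uint32_t)(h & 0x8000) << 16;
        uint32_t e    = h & 0x7c00;
        // rebias the exponent by (127 - 15): ((h & 0x7fff) << 13) + delta, delta = 0x38000000
        uint32_t t    = ((uint32_t)(h & 0x7fff) << 13) + 0x38000000u;

        uint32_t bits;
        if (e == 0x7c00)        // inf/NaN: add delta again to push the exponent field to 255
            bits = t + 0x38000000u;
        else if (e == 0)        // zero/subnormal: fix up with float arithmetic, gives m * 2^-24
            bits = f2u(u2f(t + (1u << 23)) - u2f(0x38800000u));
        else                    // normal numbers are already correct
            bits = t;
        return u2f(bits | sign);
    }

    static inline uint16_t float_to_half(float x)
    {
        uint32_t u    = f2u(x);
        uint32_t sign = (u >> 16) & 0x8000u;
        uint32_t t    = u & 0x7fffffffu;   // |x| as bits
        uint32_t h;

        if (t >= 0x47800000u)              // |x| >= 65536: overflow to inf, NaN stays NaN
            h = t > 0x7f800000u ? 0x7e00u : 0x7c00u;
        else if (t < 0x38800000u)          // |x| < 2^-14: result is a subnormal half (or zero)
            h = f2u(u2f(t) + 0.5f) - 0x3f000000u;   // the float addition performs the rounding
        else                               // normal range: rebias the exponent, round to nearest-even
            h = (t + 0xc8000fffu + ((t >> 13) & 1u)) >> 13;
        return (uint16_t)(h | sign);
    }

The vector fallbacks follow the same branches, replacing the scalar conditionals with vec_cmpeq/vec_cmpgt masks and vec_sel selections.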
@@ -291,6 +291,8 @@ VSX_IMPL_1RG(vec_udword2, wi, vec_float4, wf, xvcvspuxds, vec_ctulo)
  *
  * So we're not able to use inline asm and only use built-in functions that CLANG supports
  * and use __builtin_convertvector if clang is missing any of the vector-conversion built-in functions
+ *
+ * todo: the clang asm template bug is fixed; the current workarounds need to be reconsidered.
  */

 // convert vector helper
...
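For compilers where neither a dedicated built-in nor the inline-asm template is usable, the comment above points at clang's generic __builtin_convertvector. A minimal sketch of that kind of fallback, assuming clang on a VSX-enabled target; the typedef and function names are illustrative, not the ones vsx_utils.hpp defines:

    #include <altivec.h>

    typedef __vector signed int vec_int4_t;
    typedef __vector float      vec_float4_t;

    static inline vec_float4_t cvt_int4_to_float4(vec_int4_t a)
    {
    #if defined(__clang__)
        // element-wise conversion lowered by clang itself (no asm, no dedicated built-in needed)
        return __builtin_convertvector(a, vec_float4_t);
    #else
        return vec_ctf(a, 0);   // AltiVec built-in covering the same conversion (xvcvsxwsp)
    #endif
    }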