Commit 4e60db90 authored Mar 20, 2019 by Alexander Alekhin

Merge pull request #14110 from seiko2plus:core_vsx_fp16

parents a8e635f1 f4135968

Showing 5 changed files with 83 additions and 15 deletions
cmake/OpenCVCompilerOptions.cmake                       +8  -1
cmake/checks/cpu_vsx_asm.cpp                            +22 -0
modules/core/CMakeLists.txt                             +1  -1
modules/core/include/opencv2/core/hal/intrin_vsx.hpp    +50 -13
modules/core/include/opencv2/core/vsx_utils.hpp         +2  -0
cmake/OpenCVCompilerOptions.cmake

@@ -294,11 +294,18 @@ endif()
 # workaround gcc bug for aligned ld/st
 # https://github.com/opencv/opencv/issues/13211
 if((PPC64LE AND NOT CMAKE_CROSSCOMPILING) OR OPENCV_FORCE_COMPILER_CHECK_VSX_ALIGNED)
-  ocv_check_runtime_flag("${CPU_BASELINE_FLAGS}" "OPENCV_CHECK_VSX_ALIGNED" "${OpenCV_SOURCE_DIR}/cmake/checks/runtime/cpu_vsx_aligned.cpp")
+  ocv_check_runtime_flag("${CPU_BASELINE_FLAGS}" OPENCV_CHECK_VSX_ALIGNED "${OpenCV_SOURCE_DIR}/cmake/checks/runtime/cpu_vsx_aligned.cpp")
   if(NOT OPENCV_CHECK_VSX_ALIGNED)
     add_extra_compiler_option_force(-DCV_COMPILER_VSX_BROKEN_ALIGNED)
   endif()
 endif()
+
+# validate inline asm with fixes register number and constraints wa, wd, wf
+if(PPC64LE)
+  ocv_check_compiler_flag(CXX "${CPU_BASELINE_FLAGS}" OPENCV_CHECK_VSX_ASM "${OpenCV_SOURCE_DIR}/cmake/checks/cpu_vsx_asm.cpp")
+  if(NOT OPENCV_CHECK_VSX_ASM)
+    add_extra_compiler_option_force(-DCV_COMPILER_VSX_BROKEN_ASM)
+  endif()
+endif()
 
 # combine all "extra" options
 if(NOT OPENCV_SKIP_EXTRA_COMPILER_FLAGS)
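When the new compile check fails, the branch above forces -DCV_COMPILER_VSX_BROKEN_ASM into the build flags. A minimal sketch of how source code is then expected to key off that define (it mirrors the guards added to intrin_vsx.hpp further down; the comment bodies here are illustrative, not part of the commit):

    // Illustrative only: pick the FP16 code path from the macro this CMake branch forces.
    #if CV_VSX3 && !defined(CV_COMPILER_VSX_BROKEN_ASM)
        // fast path: the xvcvhpsp / xvcvsphp inline asm is known to assemble
    #else
        // fallback: portable bit-manipulation conversion, no inline asm
    #endif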
cmake/checks/cpu_vsx_asm.cpp  (new file, 0 → 100644)

+#if defined(__VSX__)
+    #if defined(__PPC64__) && defined(__LITTLE_ENDIAN__)
+        #include <altivec.h>
+    #else
+        #error "OpenCV only supports little-endian mode"
+    #endif
+#else
+    #error "VSX is not supported"
+#endif
+
+/*
+ * xlc and wide versions of clang don't support %x<n> in the inline asm template which fixes register number
+ * when using any of the register constraints wa, wd, wf
+ */
+
+int main()
+{
+    __vector float vf;
+    __vector signed int vi;
+    __asm__ __volatile__ ("xvcvsxwsp %x0,%x1" : "=wf" (vf) : "wa" (vi));
+    return 0;
+}
\ No newline at end of file
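For context (not part of the commit): xvcvsxwsp converts four signed 32-bit words to single precision, and the %x operand modifier asks the compiler to print the full 6-bit VSX register number that such instructions encode. Compilers that reject %x can still reach the same conversion through the portable AltiVec intrinsic, roughly:

    // Hypothetical illustration, assuming a VSX/AltiVec-enabled compiler.
    #include <altivec.h>

    __vector float int32_to_float32(__vector signed int vi)
    {
        return vec_ctf(vi, 0);   // signed int32 -> float32, scale factor 0
    }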
modules/core/CMakeLists.txt

@@ -3,7 +3,7 @@ set(the_description "The Core Functionality")
 ocv_add_dispatched_file(mathfuncs_core SSE2 AVX AVX2)
 ocv_add_dispatched_file(stat SSE4_2 AVX2)
 ocv_add_dispatched_file(arithm SSE2 SSE4_1 AVX2 VSX3)
-ocv_add_dispatched_file(convert SSE2 AVX2)
+ocv_add_dispatched_file(convert SSE2 AVX2 VSX3)
 ocv_add_dispatched_file(convert_scale SSE2 AVX2)
 ocv_add_dispatched_file(count_non_zero SSE2 AVX2)
 ocv_add_dispatched_file(matmul SSE2 AVX2)
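This adds a VSX3 build of the dispatched convert file, so the new FP16 kernels can be compiled with VSX3 flags and selected at run time. A generic sketch of the per-file dispatch idea, for illustration only (this is not OpenCV's generated dispatcher; the function names are made up):

    #include <cstdio>

    // Each listed ISA gets its own compiled variant of convert.cpp; one of them
    // is picked at run time from the detected CPU features.
    static void convert_baseline() { std::puts("baseline convert"); }
    static void convert_vsx3()     { std::puts("VSX3-optimized convert"); }

    void convert(bool cpu_has_vsx3)
    {
        if (cpu_has_vsx3) convert_vsx3();
        else              convert_baseline();
    }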
modules/core/include/opencv2/core/hal/intrin_vsx.hpp

@@ -11,11 +11,6 @@
 
 #define CV_SIMD128 1
 #define CV_SIMD128_64F 1
 
-/**
- * todo: supporting half precision for power9
- * convert instractions xvcvhpsp, xvcvsphp
- **/
-
 namespace cv
 {
@@ -1203,20 +1198,62 @@ inline v_float32x4 v_pack_triplets(const v_float32x4& vec)
 
 /////// FP16 support ////////
-// [TODO] implement these 2 using VSX or universal intrinsics (copy from intrin_sse.cpp and adopt)
 
 inline v_float32x4 v_load_expand(const float16_t* ptr)
 {
-    return v_float32x4((float)ptr[0], (float)ptr[1], (float)ptr[2], (float)ptr[3]);
+    vec_ushort8 vf16 = vec_ld_l8((const ushort*)ptr);
+#if CV_VSX3 && defined(vec_extract_fp_from_shorth)
+    return v_float32x4(vec_extract_fp_from_shorth(vf16));
+#elif CV_VSX3 && !defined(CV_COMPILER_VSX_BROKEN_ASM)
+    vec_float4 vf32;
+    __asm__ __volatile__ ("xvcvhpsp %x0,%x1" : "=wf" (vf32) : "wa" (vec_mergeh(vf16, vf16)));
+    return v_float32x4(vf32);
+#else
+    const vec_int4 z = vec_int4_z, delta = vec_int4_sp(0x38000000);
+    const vec_int4 signmask = vec_int4_sp(0x80000000);
+    const vec_int4 maxexp = vec_int4_sp(0x7c000000);
+    const vec_float4 deltaf = vec_float4_c(vec_int4_sp(0x38800000));
+
+    vec_int4 bits = vec_int4_c(vec_mergeh(vec_short8_c(z), vec_short8_c(vf16)));
+    vec_int4 e = vec_and(bits, maxexp), sign = vec_and(bits, signmask);
+    vec_int4 t = vec_add(vec_sr(vec_xor(bits, sign), vec_uint4_sp(3)), delta); // ((h & 0x7fff) << 13) + delta
+    vec_int4 zt = vec_int4_c(vec_sub(vec_float4_c(vec_add(t, vec_int4_sp(1 << 23))), deltaf));
+
+    t = vec_add(t, vec_and(delta, vec_cmpeq(maxexp, e)));
+    vec_bint4 zmask = vec_cmpeq(e, z);
+    vec_int4 ft = vec_sel(t, zt, zmask);
+    return v_float32x4(vec_float4_c(vec_or(ft, sign)));
+#endif
 }
 
 inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
 {
-    float CV_DECL_ALIGNED(32) f[4];
-    v_store_aligned(f, v);
-    ptr[0] = float16_t(f[0]);
-    ptr[1] = float16_t(f[1]);
-    ptr[2] = float16_t(f[2]);
-    ptr[3] = float16_t(f[3]);
+// fixme: Is there any buitin op or intrinsic that cover "xvcvsphp"?
+#if CV_VSX3 && !defined(CV_COMPILER_VSX_BROKEN_ASM)
+    vec_ushort8 vf16;
+    __asm__ __volatile__ ("xvcvsphp %x0,%x1" : "=wa" (vf16) : "wf" (v.val));
+    vec_st_l8(vec_mergesqe(vf16, vf16), ptr);
+#else
+    const vec_int4 signmask = vec_int4_sp(0x80000000);
+    const vec_int4 rval = vec_int4_sp(0x3f000000);
+
+    vec_int4 t = vec_int4_c(v.val);
+    vec_int4 sign = vec_sra(vec_and(t, signmask), vec_uint4_sp(16));
+    t = vec_and(vec_nor(signmask, signmask), t);
+
+    vec_bint4 finitemask = vec_cmpgt(vec_int4_sp(0x47800000), t);
+    vec_bint4 isnan = vec_cmpgt(t, vec_int4_sp(0x7f800000));
+    vec_int4 naninf = vec_sel(vec_int4_sp(0x7c00), vec_int4_sp(0x7e00), isnan);
+    vec_bint4 tinymask = vec_cmpgt(vec_int4_sp(0x38800000), t);
+    vec_int4 tt = vec_int4_c(vec_add(vec_float4_c(t), vec_float4_c(rval)));
+    tt = vec_sub(tt, rval);
+    vec_int4 odd = vec_and(vec_sr(t, vec_uint4_sp(13)), vec_int4_sp(1));
+    vec_int4 nt = vec_add(t, vec_int4_sp(0xc8000fff));
+    nt = vec_sr(vec_add(nt, odd), vec_uint4_sp(13));
+    t = vec_sel(nt, tt, tinymask);
+    t = vec_sel(naninf, t, finitemask);
+    t = vec_or(t, sign);
+    vec_st_l8(vec_packs(t, t), ptr);
+#endif
 }
 
 inline void v_cleanup() {}
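The final #else branch of v_load_expand above is the portable bit-manipulation path for compilers without usable VSX3 intrinsics or inline asm. A scalar sketch of the same trick, for illustration only (it is not part of the commit; the vector code applies this lane-wise):

    #include <cstdint>
    #include <cstring>

    // Illustrative scalar equivalent of the vector fallback above; memcpy is used
    // for the bit reinterpretations that the vector code does with vec_*_c casts.
    static float fp16_bits_to_fp32(uint16_t h)
    {
        uint32_t bits = (uint32_t)h << 16;
        uint32_t sign = bits & 0x80000000u;
        uint32_t e    = bits & 0x7c000000u;                  // half exponent field, pre-shifted
        uint32_t t    = ((bits ^ sign) >> 3) + 0x38000000u;  // ((h & 0x7fff) << 13) + delta: rebias 15 -> 127

        // zero/subnormal path: let float arithmetic renormalize the mantissa
        uint32_t u = t + (1u << 23);
        float f;  std::memcpy(&f, &u, sizeof f);
        f -= 6.103515625e-05f;                               // the float whose bits are 0x38800000 (deltaf)
        uint32_t zt;  std::memcpy(&zt, &f, sizeof zt);

        if (e == 0x7c000000u) t += 0x38000000u;              // Inf/NaN: push exponent to all-ones
        uint32_t r = ((e == 0) ? zt : t) | sign;

        float out;  std::memcpy(&out, &r, sizeof out);
        return out;
    }

The #else branch of v_pack_store is the reverse direction, float to half with rounding and Inf/NaN handling, built from the same kind of masks and selects.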
modules/core/include/opencv2/core/vsx_utils.hpp

@@ -291,6 +291,8 @@ VSX_IMPL_1RG(vec_udword2, wi, vec_float4, wf, xvcvspuxds, vec_ctulo)
  *
  * So we're not able to use inline asm and only use built-in functions that CLANG supports
  * and use __builtin_convertvector if clang missng any of vector conversions built-in functions
+ *
+ * todo: clang asm template bug is fixed, need to reconsider the current workarounds.
  */
 
 // convert vector helper
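For readers unfamiliar with the builtin named in that comment, a minimal standalone illustration of __builtin_convertvector, a clang (and recent gcc) vector extension; this snippet is not from the OpenCV sources:

    // Element-wise conversion between GCC/clang vector-extension types.
    typedef float v4f __attribute__((vector_size(16)));
    typedef int   v4i __attribute__((vector_size(16)));

    v4f int_to_float_lanes(v4i v)
    {
        return __builtin_convertvector(v, v4f);  // converts each int32 lane to float
    }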