Commit f38a61c6 authored by Paul E. Murphy

fast_math: implement optimized PPC routines

Implement cvRound using inline asm, since no compiler support
exists today to properly optimize this. This results in
about a 4x speedup over the default rounding. Also
simplify the growing number of rounding function overloads.

For P9-enabled targets, use the data class test instruction
(scalar_test_data_class) to test for Inf/NaN values. The
speedup is about 1.2x for FP32 and 1.5x for FP64 operands.

For P8 targets, fall back to the GCC isnan builtin. It provides
a 1.1x/1.4x improvement for FP32/FP64 arguments.
parent 3f92bcc1
...@@ -74,7 +74,15 @@ ...@@ -74,7 +74,15 @@
# include "tegra_round.hpp" # include "tegra_round.hpp"
#endif #endif
#if defined __GNUC__ && defined __arm__ && (defined __ARM_PCS_VFP || defined __ARM_VFPV3__ || defined __ARM_NEON__) && !defined __SOFTFP__ && !defined(__CUDACC__) #if defined __PPC64__ && defined __GNUC__ && defined _ARCH_PWR8 && !defined (__CUDACC__)
# include <altivec.h>
#endif
#if ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || \
defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION
#define CV_INLINE_ROUND_DBL(value) TEGRA_ROUND_DBL(value);
#define CV_INLINE_ROUND_FLT(value) TEGRA_ROUND_FLT(value);
#elif defined __GNUC__ && defined __arm__ && (defined __ARM_PCS_VFP || defined __ARM_VFPV3__ || defined __ARM_NEON__) && !defined __SOFTFP__ && !defined(__CUDACC__)
// 1. general scheme // 1. general scheme
#define ARM_ROUND(_value, _asm_string) \ #define ARM_ROUND(_value, _asm_string) \
int res; \ int res; \
...@@ -84,12 +92,32 @@ ...@@ -84,12 +92,32 @@
return res return res
// 2. version for double // 2. version for double
#ifdef __clang__ #ifdef __clang__
#define ARM_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %[value] \n vmov %[res], %[temp]") #define CV_INLINE_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %[value] \n vmov %[res], %[temp]")
#else #else
#define ARM_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %P[value] \n vmov %[res], %[temp]") #define CV_INLINE_ROUND_DBL(value) ARM_ROUND(value, "vcvtr.s32.f64 %[temp], %P[value] \n vmov %[res], %[temp]")
#endif #endif
// 3. version for float // 3. version for float
#define ARM_ROUND_FLT(value) ARM_ROUND(value, "vcvtr.s32.f32 %[temp], %[value]\n vmov %[res], %[temp]") #define CV_INLINE_ROUND_FLT(value) ARM_ROUND(value, "vcvtr.s32.f32 %[temp], %[value]\n vmov %[res], %[temp]")
#elif defined __PPC64__ && defined __GNUC__ && defined _ARCH_PWR8 && !defined (__CUDACC__)
// P8 and newer machines can convert fp32/64 to int quickly.
#define CV_INLINE_ROUND_DBL(value) \
int out; \
double temp; \
__asm__( "fctiw %[temp],%[in]\n\tmffprwz %[out],%[temp]\n\t" : [out] "=r" (out), [temp] "=d" (temp) : [in] "d" ((double)(value)) : ); \
return out;
// FP32 also works with FP64 routine above
#define CV_INLINE_ROUND_FLT(value) CV_INLINE_ROUND_DBL(value)
#ifdef _ARCH_PWR9
#define CV_INLINE_ISINF_DBL(value) return scalar_test_data_class(value, 0x30);
#define CV_INLINE_ISNAN_DBL(value) return scalar_test_data_class(value, 0x40);
#define CV_INLINE_ISINF_FLT(value) CV_INLINE_ISINF_DBL(value)
#define CV_INLINE_ISNAN_FLT(value) CV_INLINE_ISNAN_DBL(value)
#endif
#elif defined CV_ICC || defined __GNUC__
#define CV_INLINE_ROUND_DBL(value) return (int)(lrint(value));
#define CV_INLINE_ROUND_FLT(value) return (int)(lrintf(value));
#endif #endif
#if defined __PPC64__ && !defined OPENCV_USE_FASTMATH_GCC_BUILTINS #if defined __PPC64__ && !defined OPENCV_USE_FASTMATH_GCC_BUILTINS
...@@ -105,6 +133,16 @@ ...@@ -105,6 +133,16 @@
#define _OPENCV_FASTMATH_ENABLE_GCC_MATH_BUILTINS #define _OPENCV_FASTMATH_ENABLE_GCC_MATH_BUILTINS
#endif #endif
/* Allow overrides for some functions which may benefit from tuning. Note
that the isinf builtin is not used, because its return value is signed. */
#if defined _OPENCV_FASTMATH_ENABLE_GCC_MATH_BUILTINS && !defined CV_INLINE_ISNAN_DBL
#define CV_INLINE_ISNAN_DBL(value) return __builtin_isnan(value);
#endif
#if defined _OPENCV_FASTMATH_ENABLE_GCC_MATH_BUILTINS && !defined CV_INLINE_ISNAN_FLT
#define CV_INLINE_ISNAN_FLT(value) return __builtin_isnanf(value);
#endif
/** @brief Rounds floating-point number to the nearest integer /** @brief Rounds floating-point number to the nearest integer
@param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the @param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the
...@@ -125,15 +163,8 @@ cvRound( double value ) ...@@ -125,15 +163,8 @@ cvRound( double value )
fistp t; fistp t;
} }
return t; return t;
#elif ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || \ #elif defined CV_INLINE_ROUND_DBL
defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION CV_INLINE_ROUND_DBL(value);
TEGRA_ROUND_DBL(value);
#elif defined CV_ICC || defined __GNUC__
# if defined ARM_ROUND_DBL
ARM_ROUND_DBL(value);
# else
return (int)lrint(value);
# endif
#else #else
/* it's ok if round does not comply with IEEE754 standard; /* it's ok if round does not comply with IEEE754 standard;
the tests should allow +/-1 difference when the tested functions use round */ the tests should allow +/-1 difference when the tested functions use round */
...@@ -184,10 +215,14 @@ CV_INLINE int cvCeil( double value ) ...@@ -184,10 +215,14 @@ CV_INLINE int cvCeil( double value )
otherwise. */ otherwise. */
CV_INLINE int cvIsNaN( double value ) CV_INLINE int cvIsNaN( double value )
{ {
#if defined CV_INLINE_ISNAN_DBL
CV_INLINE_ISNAN_DBL(value);
#else
Cv64suf ieee754; Cv64suf ieee754;
ieee754.f = value; ieee754.f = value;
return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) + return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) +
((unsigned)ieee754.u != 0) > 0x7ff00000; ((unsigned)ieee754.u != 0) > 0x7ff00000;
#endif
} }
/** @brief Determines if the argument is Infinity. /** @brief Determines if the argument is Infinity.
...@@ -198,10 +233,14 @@ CV_INLINE int cvIsNaN( double value ) ...@@ -198,10 +233,14 @@ CV_INLINE int cvIsNaN( double value )
and 0 otherwise. */ and 0 otherwise. */
CV_INLINE int cvIsInf( double value ) CV_INLINE int cvIsInf( double value )
{ {
#if defined CV_INLINE_ISINF_DBL
CV_INLINE_ISINF_DBL(value);
#else
Cv64suf ieee754; Cv64suf ieee754;
ieee754.f = value; ieee754.f = value;
return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) == 0x7ff00000 && return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) == 0x7ff00000 &&
(unsigned)ieee754.u == 0; (unsigned)ieee754.u == 0;
#endif
} }
#ifdef __cplusplus #ifdef __cplusplus
...@@ -221,15 +260,8 @@ CV_INLINE int cvRound(float value) ...@@ -221,15 +260,8 @@ CV_INLINE int cvRound(float value)
fistp t; fistp t;
} }
return t; return t;
#elif ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || \ #elif defined CV_INLINE_ROUND_FLT
defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION CV_INLINE_ROUND_FLT(value);
TEGRA_ROUND_FLT(value);
#elif defined CV_ICC || defined __GNUC__
# if defined ARM_ROUND_FLT
ARM_ROUND_FLT(value);
# else
return (int)lrintf(value);
# endif
#else #else
/* it's ok if round does not comply with IEEE754 standard; /* it's ok if round does not comply with IEEE754 standard;
the tests should allow +/-1 difference when the tested functions use round */ the tests should allow +/-1 difference when the tested functions use round */
...@@ -280,17 +312,25 @@ CV_INLINE int cvCeil( int value ) ...@@ -280,17 +312,25 @@ CV_INLINE int cvCeil( int value )
/** @overload */ /** @overload */
CV_INLINE int cvIsNaN( float value ) CV_INLINE int cvIsNaN( float value )
{ {
#if defined CV_INLINE_ISNAN_FLT
CV_INLINE_ISNAN_FLT(value);
#else
Cv32suf ieee754; Cv32suf ieee754;
ieee754.f = value; ieee754.f = value;
return (ieee754.u & 0x7fffffff) > 0x7f800000; return (ieee754.u & 0x7fffffff) > 0x7f800000;
#endif
} }
/** @overload */ /** @overload */
CV_INLINE int cvIsInf( float value ) CV_INLINE int cvIsInf( float value )
{ {
#if defined CV_INLINE_ISINF_FLT
CV_INLINE_ISINF_FLT(value);
#else
Cv32suf ieee754; Cv32suf ieee754;
ieee754.f = value; ieee754.f = value;
return (ieee754.u & 0x7fffffff) == 0x7f800000; return (ieee754.u & 0x7fffffff) == 0x7f800000;
#endif
} }
#endif // __cplusplus #endif // __cplusplus
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment