Commit 80b62a41 authored by Vadim Pisarevsky, committed by Alexander Alekhin

Merge pull request #12411 from vpisarev:wide_convert

* rewrote Mat::convertTo() and convertScaleAbs() using wide universal intrinsics; added an always-available scalar and a SIMD-optimized FP16<=>FP32 conversion (see the usage sketch below)

* fixed compile warnings

* fix some more compile errors

* slightly relaxed the accuracy threshold for int->float conversion (since it is now done using single-precision arithmetic, not double precision)

* fixed compile errors on iOS, Android and in the baseline C++ version (intrin_cpp.hpp)

* trying to fix ARM-neon builds

* trying to fix ARM-neon builds

* trying to fix ARM-neon builds

* trying to fix ARM-neon builds
parent 54279523
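A usage sketch of the API this commit adds (not part of the patch; helper names are illustrative): the scalar cv::float16_t class is always available, and the wide universal intrinsics gain float16_t overloads of vx_load_expand and v_pack_store, so conversion loops no longer need the removed v_float16x* register types.

#include "opencv2/core.hpp"
#include "opencv2/core/hal/intrin.hpp"

static void fp32_to_fp16(const float* src, cv::float16_t* dst, int n)
{
    using namespace cv;
    int i = 0;
#if CV_SIMD
    for (; i <= n - v_float32::nlanes; i += v_float32::nlanes)
        v_pack_store(dst + i, vx_load(src + i));    // FP32 -> FP16, round to nearest
    vx_cleanup();
#endif
    for (; i < n; i++)
        dst[i] = float16_t(src[i]);                 // always-available scalar path
}

static void fp16_to_fp32(const cv::float16_t* src, float* dst, int n)
{
    using namespace cv;
    int i = 0;
#if CV_SIMD
    for (; i <= n - v_float32::nlanes; i += v_float32::nlanes)
        v_store(dst + i, vx_load_expand(src + i));  // FP16 -> FP32
    vx_cleanup();
#endif
    for (; i < n; i++)
        dst[i] = (float)src[i];                     // scalar fallback
}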
......@@ -219,15 +219,10 @@ enum CpuFeatures {
typedef union Cv16suf
{
short i;
ushort u;
#if CV_FP16_TYPE
__fp16 h;
#endif
struct _fp16Format
{
unsigned int significand : 10;
unsigned int exponent : 5;
unsigned int sign : 1;
} fmt;
}
Cv16suf;
......@@ -236,12 +231,6 @@ typedef union Cv32suf
int i;
unsigned u;
float f;
struct _fp32Format
{
unsigned int significand : 23;
unsigned int exponent : 8;
unsigned int sign : 1;
} fmt;
}
Cv32suf;
......@@ -548,6 +537,115 @@ typedef ::uint64_t uint64_t;
#include <stdint.h>
#endif
#ifdef __cplusplus
namespace cv
{
class float16_t
{
public:
#if CV_FP16_TYPE
float16_t() {}
explicit float16_t(float x) { h = (__fp16)x; }
operator float() const { return (float)h; }
static float16_t fromBits(ushort w)
{
Cv16suf u;
u.u = w;
float16_t result;
result.h = u.h;
return result;
}
static float16_t zero()
{
float16_t result;
result.h = (__fp16)0;
return result;
}
ushort bits() const
{
Cv16suf u;
u.h = h;
return u.u;
}
protected:
__fp16 h;
#else
float16_t() {}
explicit float16_t(float x)
{
#if CV_AVX2
__m128 v = _mm_load_ss(&x);
w = (ushort)_mm_cvtsi128_si32(_mm_cvtps_ph(v, 0));
#else
Cv32suf in;
in.f = x;
unsigned sign = in.u & 0x80000000;
in.u ^= sign;
if( in.u >= 0x47800000 )
w = (ushort)(in.u > 0x7f800000 ? 0x7e00 : 0x7c00);
else
{
if (in.u < 0x38800000)
{
in.f += 0.5f;
w = (ushort)(in.u - 0x3f000000);
}
else
{
unsigned t = in.u + 0xc8000fff;
w = (ushort)((t + ((in.u >> 13) & 1)) >> 13);
}
}
w = (ushort)(w | (sign >> 16));
#endif
}
operator float() const
{
#if CV_AVX2
float f;
_mm_store_ss(&f, _mm_cvtph_ps(_mm_cvtsi32_si128(w)));
return f;
#else
Cv32suf out;
unsigned t = ((w & 0x7fff) << 13) + 0x38000000;
unsigned sign = (w & 0x8000) << 16;
unsigned e = w & 0x7c00;
out.u = t + (1 << 23);
out.u = (e >= 0x7c00 ? t + 0x38000000 :
e == 0 ? (out.f -= 6.103515625e-05f, out.u) : t) | sign;
return out.f;
#endif
}
static float16_t fromBits(ushort b)
{
float16_t result;
result.w = b;
return result;
}
static float16_t zero()
{
float16_t result;
result.w = (ushort)0;
return result;
}
ushort bits() const { return w; }
protected:
ushort w;
#endif
};
}
#endif
//! @}
......
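The software branch of float16_t above packs IEEE 754 binary16 with round-to-nearest-even, while the CV_FP16_TYPE branch defers to the compiler's __fp16. A tiny self-check of the encoding, assuming only the class added above (the expected bit patterns are standard binary16 values):

#include <cassert>
#include "opencv2/core.hpp"

int main()
{
    using cv::float16_t;
    assert(float16_t(1.0f).bits() == 0x3C00);    // 1.0: sign 0, biased exponent 15, mantissa 0
    assert(float16_t(-2.0f).bits() == 0xC000);   // -2.0
    assert(float16_t(65504.f).bits() == 0x7BFF); // largest finite binary16 value
    assert(float16_t::zero().bits() == 0x0000);
    assert((float)float16_t::fromBits(0x3C00) == 1.0f);  // decode round-trip
    return 0;
}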
......@@ -252,7 +252,8 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(unsigned, v_uint64, prefix) \
CV_INTRIN_DEFINE_WIDE_INTRIN(float, v_float32, f32, prefix, load) \
CV_INTRIN_DEFINE_WIDE_INTRIN(int64, v_int64, s64, prefix, load) \
CV_INTRIN_DEFINE_WIDE_INTRIN(uint64, v_uint64, u64, prefix, load)
CV_INTRIN_DEFINE_WIDE_INTRIN(uint64, v_uint64, u64, prefix, load) \
CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(float16_t, v_float32, prefix)
template<typename _Tp> struct V_RegTraits
{
......@@ -286,9 +287,6 @@ template<typename _Tp> struct V_RegTraits
#if CV_SIMD128_64F
CV_DEF_REG_TRAITS(v, v_float64x2, double, f64, v_float64x2, void, void, v_int64x2, v_int32x4);
#endif
#if CV_SIMD128_FP16
CV_DEF_REG_TRAITS(v, v_float16x8, short, f16, v_float16x8, void, void, v_int16x8, v_int16x8);
#endif
#endif
#if CV_SIMD256
......@@ -302,9 +300,6 @@ template<typename _Tp> struct V_RegTraits
CV_DEF_REG_TRAITS(v256, v_uint64x4, uint64, u64, v_uint64x4, void, void, v_int64x4, void);
CV_DEF_REG_TRAITS(v256, v_int64x4, int64, s64, v_uint64x4, void, void, v_int64x4, void);
CV_DEF_REG_TRAITS(v256, v_float64x4, double, f64, v_float64x4, void, void, v_int64x4, v_int32x8);
#if CV_SIMD256_FP16
CV_DEF_REG_TRAITS(v256, v_float16x16, short, f16, v_float16x16, void, void, v_int16x16, void);
#endif
#endif
#if CV_SIMD512 && (!defined(CV__SIMD_FORCE_WIDTH) || CV__SIMD_FORCE_WIDTH == 512)
......@@ -335,14 +330,6 @@ namespace CV__SIMD_NAMESPACE {
#if CV_SIMD256_64F
typedef v_float64x4 v_float64;
#endif
#if CV_FP16
#define vx_load_fp16_f32 v256_load_fp16_f32
#define vx_store_fp16 v_store_fp16
#endif
#if CV_SIMD256_FP16
typedef v_float16x16 v_float16;
CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_float16, f16, v256, load_f16)
#endif
CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v256)
CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v256, load)
inline void vx_cleanup() { v256_cleanup(); }
......@@ -353,7 +340,6 @@ using namespace CV__SIMD_NAMESPACE;
namespace CV__SIMD_NAMESPACE {
#define CV_SIMD CV_SIMD128
#define CV_SIMD_64F CV_SIMD128_64F
#define CV_SIMD_FP16 CV_SIMD128_FP16
#define CV_SIMD_WIDTH 16
typedef v_uint8x16 v_uint8;
typedef v_int8x16 v_int8;
......@@ -367,14 +353,6 @@ namespace CV__SIMD_NAMESPACE {
#if CV_SIMD128_64F
typedef v_float64x2 v_float64;
#endif
#if CV_FP16
#define vx_load_fp16_f32 v128_load_fp16_f32
#define vx_store_fp16 v_store_fp16
#endif
#if CV_SIMD128_FP16
typedef v_float16x8 v_float16;
CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_float16, f16, v, load_f16)
#endif
CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v)
#if CV_SIMD128_64F
CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v, load)
......
......@@ -1414,10 +1414,17 @@ inline v_int8x32 v_pack(const v_int16x16& a, const v_int16x16& b)
{ return v_int8x32(_v256_shuffle_odd_64(_mm256_packs_epi16(a.val, b.val))); }
inline v_uint8x32 v_pack(const v_uint16x16& a, const v_uint16x16& b)
{ return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a.val, b.val))); }
{
__m256i t = _mm256_set1_epi16(255);
__m256i a1 = _mm256_min_epu16(a.val, t);
__m256i b1 = _mm256_min_epu16(b.val, t);
return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a1, b1)));
}
inline v_uint8x32 v_pack_u(const v_int16x16& a, const v_int16x16& b)
{ return v_pack(v_reinterpret_as_u16(a), v_reinterpret_as_u16(b)); }
{
return v_uint8x32(_v256_shuffle_odd_64(_mm256_packus_epi16(a.val, b.val)));
}
inline void v_pack_store(schar* ptr, const v_int16x16& a)
{ v_store_low(ptr, v_pack(a, a)); }
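The v_pack/v_pack_u change above fixes unsigned saturation: _mm256_packus_epi16 treats its inputs as signed 16-bit lanes, so an unsigned value above 0x7FFF would previously be packed to 0 instead of being clamped to 255; taking min_epu16 with 255 first restores the documented unsigned-saturation semantics. A small check of the intended behaviour (illustrative, written with the 128-bit types for brevity):

#include <cassert>
#include "opencv2/core/hal/intrin.hpp"

int main()
{
    using namespace cv;
    // 0x8001 (= 32769) looks negative when misread as int16; an unsigned pack
    // must saturate it to 255, not to 0.
    v_uint16x8 a = v_setall_u16((ushort)0x8001);
    uchar out[16];
    v_store(out, v_pack(a, a));
    assert(out[0] == 255);
    return 0;
}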
......@@ -2390,6 +2397,18 @@ OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float32x8, float, f32, v_uint32x8, un
OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_int64x4, int64, s64, v_uint64x4, uint64, u64)
OPENCV_HAL_IMPL_AVX_LOADSTORE_INTERLEAVE(v_float64x4, double, f64, v_uint64x4, uint64, u64)
// FP16
inline v_float32x8 v256_load_expand(const float16_t* ptr)
{
return v_float32x8(_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr)));
}
inline void v_pack_store(float16_t* ptr, const v_float32x8& a)
{
__m128i ah = _mm256_cvtps_ph(a.val, 0); // imm8 = 0: round to nearest even
_mm_storeu_si128((__m128i*)ptr, ah);
}
inline void v256_cleanup() { _mm256_zeroupper(); }
//! @name Check SIMD256 support
......
......@@ -2062,6 +2062,28 @@ inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + m3.s[3]);
}
////// FP16 support ///////
inline v_reg<float, V_TypeTraits<float>::nlanes128>
v_load_expand(const float16_t* ptr)
{
v_reg<float, V_TypeTraits<float>::nlanes128> v;
for( int i = 0; i < v.nlanes; i++ )
{
v.s[i] = ptr[i];
}
return v;
}
inline void
v_pack_store(float16_t* ptr, const v_reg<float, V_TypeTraits<float>::nlanes128>& v)
{
for( int i = 0; i < v.nlanes; i++ )
{
ptr[i] = float16_t(v.s[i]);
}
}
inline void v_cleanup() {}
//! @}
......
......@@ -62,15 +62,6 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#define CV_SIMD128_64F 0
#endif
#ifndef CV_SIMD128_FP16
# if CV_FP16 && (defined(__GNUC__) && __GNUC__ >= 5) // #12027: float16x8_t is missing in GCC 4.8.2
# define CV_SIMD128_FP16 1
# endif
#endif
#ifndef CV_SIMD128_FP16
# define CV_SIMD128_FP16 0
#endif
#if CV_SIMD128_64F
#define OPENCV_HAL_IMPL_NEON_REINTERPRET(_Tpv, suffix) \
template <typename T> static inline \
......@@ -329,53 +320,6 @@ inline void v_store_fp16(short* ptr, const v_float32x4& a)
}
#endif
#if CV_SIMD128_FP16
// Workaround for old compilers
static inline int16x8_t vreinterpretq_s16_f16(float16x8_t a) { return (int16x8_t)a; }
static inline float16x8_t vreinterpretq_f16_s16(int16x8_t a) { return (float16x8_t)a; }
static inline float16x8_t cv_vld1q_f16(const void* ptr)
{
#ifndef vld1q_f16 // APPLE compiler defines vld1_f16 as macro
return vreinterpretq_f16_s16(vld1q_s16((const short*)ptr));
#else
return vld1q_f16((const __fp16*)ptr);
#endif
}
static inline void cv_vst1q_f16(void* ptr, float16x8_t a)
{
#ifndef vst1q_f16 // APPLE compiler defines vst1_f16 as macro
vst1q_s16((short*)ptr, vreinterpretq_s16_f16(a));
#else
vst1q_f16((__fp16*)ptr, a);
#endif
}
struct v_float16x8
{
typedef short lane_type;
enum { nlanes = 8 };
v_float16x8() {}
explicit v_float16x8(float16x8_t v) : val(v) {}
v_float16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
{
short v[] = {v0, v1, v2, v3, v4, v5, v6, v7};
val = cv_vld1q_f16(v);
}
short get0() const
{
return vgetq_lane_s16(vreinterpretq_s16_f16(val), 0);
}
float16x8_t val;
};
inline v_float16x8 v_setzero_f16() { return v_float16x8(vreinterpretq_f16_s16(vdupq_n_s16((short)0))); }
inline v_float16x8 v_setall_f16(short v) { return v_float16x8(vreinterpretq_f16_s16(vdupq_n_s16(v))); }
#endif // CV_SIMD128_FP16
#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
......@@ -934,24 +878,6 @@ OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float32x4, float, f32)
OPENCV_HAL_IMPL_NEON_LOADSTORE_OP(v_float64x2, double, f64)
#endif
#if CV_SIMD128_FP16
// Workaround for old compilers
inline v_float16x8 v_load_f16(const short* ptr)
{ return v_float16x8(cv_vld1q_f16(ptr)); }
inline v_float16x8 v_load_f16_aligned(const short* ptr)
{ return v_float16x8(cv_vld1q_f16(ptr)); }
inline v_float16x8 v_load_f16_low(const short* ptr)
{ return v_float16x8(vcombine_f16(cv_vld1_f16(ptr), vdup_n_f16((float16_t)0))); }
inline v_float16x8 v_load_f16_halves(const short* ptr0, const short* ptr1)
{ return v_float16x8(vcombine_f16(cv_vld1_f16(ptr0), cv_vld1_f16(ptr1))); }
inline void v_store(short* ptr, const v_float16x8& a)
{ cv_vst1q_f16(ptr, a.val); }
inline void v_store_aligned(short* ptr, const v_float16x8& a)
{ cv_vst1q_f16(ptr, a.val); }
#endif
#define OPENCV_HAL_IMPL_NEON_REDUCE_OP_8(_Tpvec, _Tpnvec, scalartype, func, vectorfunc, suffix) \
inline scalartype v_reduce_##func(const _Tpvec& a) \
{ \
......@@ -1507,22 +1433,6 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
}
#endif
#if CV_SIMD128_FP16
inline v_float32x4 v_cvt_f32(const v_float16x8& a)
{
return v_float32x4(vcvt_f32_f16(vget_low_f16(a.val)));
}
inline v_float32x4 v_cvt_f32_high(const v_float16x8& a)
{
return v_float32x4(vcvt_f32_f16(vget_high_f16(a.val)));
}
inline v_float16x8 v_cvt_f16(const v_float32x4& a, const v_float32x4& b)
{
return v_float16x8(vcombine_f16(vcvt_f16_f32(a.val), vcvt_f16_f32(b.val)));
}
#endif
////////////// Lookup table access ////////////////////
inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
......@@ -1588,6 +1498,47 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo
}
#endif
////// FP16 support ///////
#if CV_FP16
inline v_float32x4 v_load_expand(const float16_t* ptr)
{
float16x4_t v =
#ifndef vld1_f16 // APPLE compiler defines vld1_f16 as macro
(float16x4_t)vld1_s16((const short*)ptr);
#else
vld1_f16((const __fp16*)ptr);
#endif
return v_float32x4(vcvt_f32_f16(v));
}
inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
{
float16x4_t hv = vcvt_f16_f32(v.val);
#ifndef vst1_f16 // APPLE compiler defines vst1_f16 as macro
vst1_s16((short*)ptr, (int16x4_t)hv);
#else
vst1_f16((__fp16*)ptr, hv);
#endif
}
#else
inline v_float32x4 v_load_expand(const float16_t* ptr)
{
const int N = 4;
float buf[N];
for( int i = 0; i < N; i++ ) buf[i] = (float)ptr[i];
return v_load(buf);
}
inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
{
const int N = 4;
float buf[N];
v_store(buf, v);
for( int i = 0; i < N; i++ ) ptr[i] = float16_t(buf[i]);
}
#endif
inline void v_cleanup() {}
//! @name Check SIMD support
......
......@@ -404,7 +404,7 @@ void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)
inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b)
{ return v_int8x16(_mm_packs_epi16(a.val, b.val)); }
inline void v_pack_store(schar* ptr, v_int16x8& a)
inline void v_pack_store(schar* ptr, const v_int16x8& a)
{ _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a.val, a.val)); }
template<int n> inline
......@@ -2655,6 +2655,50 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo
y = v_float64x2(_mm_unpackhi_pd(xy0, xy1));
}
////////////// FP16 support ///////////////////////////
inline v_float32x4 v_load_expand(const float16_t* ptr)
{
const __m128i z = _mm_setzero_si128(), delta = _mm_set1_epi32(0x38000000);
const __m128i signmask = _mm_set1_epi32(0x80000000), maxexp = _mm_set1_epi32(0x7c000000);
const __m128 deltaf = _mm_castsi128_ps(_mm_set1_epi32(0x38800000));
__m128i bits = _mm_unpacklo_epi16(z, _mm_loadl_epi64((const __m128i*)ptr)); // h << 16
__m128i e = _mm_and_si128(bits, maxexp), sign = _mm_and_si128(bits, signmask);
__m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_xor_si128(bits, sign), 3), delta); // ((h & 0x7fff) << 13) + delta
__m128i zt = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_add_epi32(t, _mm_set1_epi32(1 << 23))), deltaf));
t = _mm_add_epi32(t, _mm_and_si128(delta, _mm_cmpeq_epi32(maxexp, e)));
__m128i zmask = _mm_cmpeq_epi32(e, z);
__m128i ft = v_select_si128(zmask, zt, t);
return v_float32x4(_mm_castsi128_ps(_mm_or_si128(ft, sign)));
}
inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
{
const __m128i signmask = _mm_set1_epi32(0x80000000);
const __m128i rval = _mm_set1_epi32(0x3f000000);
__m128i t = _mm_castps_si128(v.val);
__m128i sign = _mm_srai_epi32(_mm_and_si128(t, signmask), 16);
t = _mm_andnot_si128(signmask, t);
__m128i finitemask = _mm_cmpgt_epi32(_mm_set1_epi32(0x47800000), t);
__m128i isnan = _mm_cmpgt_epi32(t, _mm_set1_epi32(0x7f800000));
__m128i naninf = v_select_si128(isnan, _mm_set1_epi32(0x7e00), _mm_set1_epi32(0x7c00));
__m128i tinymask = _mm_cmpgt_epi32(_mm_set1_epi32(0x38800000), t);
__m128i tt = _mm_castps_si128(_mm_add_ps(_mm_castsi128_ps(t), _mm_castsi128_ps(rval)));
tt = _mm_sub_epi32(tt, rval);
__m128i odd = _mm_and_si128(_mm_srli_epi32(t, 13), _mm_set1_epi32(1));
__m128i nt = _mm_add_epi32(t, _mm_set1_epi32(0xc8000fff));
nt = _mm_srli_epi32(_mm_add_epi32(nt, odd), 13);
t = v_select_si128(tinymask, tt, nt);
t = v_select_si128(finitemask, t, naninf);
t = _mm_or_si128(t, sign);
t = _mm_packs_epi32(t, t);
_mm_storel_epi64((__m128i*)ptr, t);
}
inline void v_cleanup() {}
//! @name Check SIMD support
......
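Both the scalar float16_t constructor and the SSE v_pack_store above use the same trick for the subnormal range (|x| < 2^-14): adding 0.5f makes the FPU shift and round the value into the low bits of 0.5's significand, so subtracting the bit pattern of 0.5 (0x3f000000) leaves exactly the binary16 subnormal bits. A standalone scalar sketch of that step (helper name is illustrative):

#include <cassert>
#include <cstring>

// Pack a non-negative float below 2^-14 into binary16 subnormal bits by letting
// float addition perform the shift and round-to-nearest-even.
static unsigned short fp16_subnormal_bits(float x)
{
    float y = x + 0.5f;
    unsigned u;
    std::memcpy(&u, &y, sizeof(u));
    return (unsigned short)(u - 0x3f000000u);
}

int main()
{
    assert(fp16_subnormal_bits(0.0f) == 0x0000);
    // 2^-24 is the smallest positive binary16 subnormal -> bit pattern 0x0001
    assert(fp16_subnormal_bits(5.9604644775390625e-08f) == 0x0001);
    // 2^-15 is still subnormal in binary16 -> 0x0200
    assert(fp16_subnormal_bits(3.0517578125e-05f) == 0x0200);
    return 0;
}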
......@@ -916,6 +916,24 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo
y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]);
}
/////// FP16 support ////////
// [TODO] implement these two using VSX or universal intrinsics (copy from intrin_sse.cpp and adapt)
inline v_float32x4 v_load_expand(const float16_t* ptr)
{
return v_float32x4((float)ptr[0], (float)ptr[1], (float)ptr[2], (float)ptr[3]);
}
inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
{
float CV_DECL_ALIGNED(32) f[4];
v_store_aligned(f, v);
ptr[0] = float16_t(f[0]);
ptr[1] = float16_t(f[1]);
ptr[2] = float16_t(f[2]);
ptr[3] = float16_t(f[3]);
}
inline void v_cleanup() {}
......
......@@ -11,6 +11,7 @@ PERF_TEST_P(Size_MatType, addWeighted, TYPICAL_MATS_ADWEIGHTED)
{
Size size = get<0>(GetParam());
int type = get<1>(GetParam());
int depth = CV_MAT_DEPTH(type);
Mat src1(size, type);
Mat src2(size, type);
double alpha = 3.75;
......@@ -21,7 +22,7 @@ PERF_TEST_P(Size_MatType, addWeighted, TYPICAL_MATS_ADWEIGHTED)
declare.in(src1, src2, dst, WARMUP_RNG).out(dst);
if (CV_MAT_DEPTH(type) == CV_32S)
if (depth == CV_32S)
{
// there might be not enough precision for integers
src1 /= 2048;
......@@ -30,7 +31,7 @@ PERF_TEST_P(Size_MatType, addWeighted, TYPICAL_MATS_ADWEIGHTED)
TEST_CYCLE() cv::addWeighted( src1, alpha, src2, beta, gamma, dst, dst.type() );
SANITY_CHECK(dst, 1);
SANITY_CHECK(dst, depth == CV_32S ? 4 : 1);
}
} // namespace
......@@ -33,7 +33,7 @@ PERF_TEST_P( Size_DepthSrc_DepthDst_Channels_alpha, convertTo,
int runs = (sz.width <= 640) ? 8 : 1;
TEST_CYCLE_MULTIRUN(runs) src.convertTo(dst, depthDst, alpha);
double eps = depthSrc <= CV_32S ? 1e-12 : (FLT_EPSILON * maxValue);
double eps = depthSrc <= CV_32S && (depthDst <= CV_32S || depthDst == CV_64F) ? 1e-12 : (FLT_EPSILON * maxValue);
eps = eps * std::max(1.0, fabs(alpha));
SANITY_CHECK(dst, eps);
}
......
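The relaxed eps above matches the commit note about int->float conversion now going through single precision: once a 32-bit float is involved, the result can be off by roughly |x| * FLT_EPSILON. A small illustration (values are illustrative, not taken from the test):

#include <cfloat>
#include <cstdio>

int main()
{
    // 2^24 + 1 is the first integer a 32-bit float cannot represent exactly.
    int x = (1 << 24) + 1;
    float xf = (float)x;                    // rounds to 2^24
    double err = (double)x - (double)xf;    // = 1
    double bound = (double)x * FLT_EPSILON; // ~ 2
    printf("error = %g, relative bound = %g\n", err, bound);
    return 0;
}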
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
#include "precomp.hpp"
#include "convert.hpp"
namespace cv
{
namespace opt_AVX2
{
void cvtScale_s16s32f32Line_AVX2(const short* src, int* dst, float scale, float shift, int width)
{
int x = 0;
__m256 scale256 = _mm256_set1_ps(scale);
__m256 shift256 = _mm256_set1_ps(shift);
const int shuffle = 0xD8;
for (; x <= width - 16; x += 16)
{
__m256i v_src = _mm256_loadu_si256((const __m256i *)(src + x));
v_src = _mm256_permute4x64_epi64(v_src, shuffle);
__m256i v_src_lo = _mm256_srai_epi32(_mm256_unpacklo_epi16(v_src, v_src), 16);
__m256i v_src_hi = _mm256_srai_epi32(_mm256_unpackhi_epi16(v_src, v_src), 16);
__m256 v_dst0 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(v_src_lo), scale256), shift256);
__m256 v_dst1 = _mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(v_src_hi), scale256), shift256);
_mm256_storeu_si256((__m256i *)(dst + x), _mm256_cvtps_epi32(v_dst0));
_mm256_storeu_si256((__m256i *)(dst + x + 8), _mm256_cvtps_epi32(v_dst1));
}
for (; x < width; x++)
dst[x] = saturate_cast<int>(src[x] * scale + shift);
}
}
} // cv::
/* End of file. */
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
#include "precomp.hpp"
#include "convert.hpp"
namespace cv
{
namespace opt_FP16
{
#if !defined(CV_NEON) || !CV_NEON
const static int cVectorWidth = 8;
void cvtScaleHalf_SIMD32f16f( const float* src, size_t sstep, short* dst, size_t dstep, cv::Size size )
{
CV_INSTRUMENT_REGION()
sstep /= sizeof(src[0]);
dstep /= sizeof(dst[0]);
for( ; size.height--; src += sstep, dst += dstep )
{
int x = 0;
for ( ; x <= size.width - cVectorWidth ; x += cVectorWidth )
{
__m256 v_src = _mm256_loadu_ps(src + x);
// round to nearest even
__m128i v_dst = _mm256_cvtps_ph(v_src, 0);
_mm_storeu_si128((__m128i*)(dst + x), v_dst);
}
for ( ; x < size.width; x++ )
{
dst[x] = convertFp16SW(src[x]);
}
}
}
void cvtScaleHalf_SIMD16f32f( const short* src, size_t sstep, float* dst, size_t dstep, cv::Size size )
{
CV_INSTRUMENT_REGION()
sstep /= sizeof(src[0]);
dstep /= sizeof(dst[0]);
for( ; size.height--; src += sstep, dst += dstep )
{
int x = 0;
for ( ; x <= size.width - cVectorWidth ; x += cVectorWidth )
{
__m128i v_src = _mm_loadu_si128((__m128i*)(src + x));
__m256 v_dst = _mm256_cvtph_ps(v_src);
_mm256_storeu_ps(dst + x, v_dst);
}
for ( ; x < size.width; x++ )
{
dst[x] = convertFp16SW(src[x]);
}
}
}
#elif CV_NEON
const static int cVectorWidth = 4;
void cvtScaleHalf_SIMD32f16f( const float* src, size_t sstep, short* dst, size_t dstep, cv::Size size )
{
CV_INSTRUMENT_REGION()
sstep /= sizeof(src[0]);
dstep /= sizeof(dst[0]);
for( ; size.height--; src += sstep, dst += dstep )
{
int x = 0;
for ( ; x <= size.width - cVectorWidth ; x += cVectorWidth)
{
float32x4_t v_src = vld1q_f32(src + x);
float16x4_t v_dst = vcvt_f16_f32(v_src);
cv_vst1_f16(dst + x, v_dst);
}
for ( ; x < size.width; x++ )
{
dst[x] = convertFp16SW(src[x]);
}
}
}
void cvtScaleHalf_SIMD16f32f( const short* src, size_t sstep, float* dst, size_t dstep, cv::Size size )
{
CV_INSTRUMENT_REGION()
sstep /= sizeof(src[0]);
dstep /= sizeof(dst[0]);
for( ; size.height--; src += sstep, dst += dstep )
{
int x = 0;
for ( ; x <= size.width - cVectorWidth ; x += cVectorWidth )
{
float16x4_t v_src = cv_vld1_f16((__fp16*)src + x);
float32x4_t v_dst = vcvt_f32_f16(v_src);
vst1q_f32(dst + x, v_dst);
}
for ( ; x < size.width; x++ )
{
dst[x] = convertFp16SW(src[x]);
}
}
}
#else
#error "Unsupported build configuration"
#endif
}
} // cv::
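These cvtScaleHalf_* kernels back the public cv::convertFp16() entry point. A hedged usage sketch (in this OpenCV version the packed FP16 data is carried in a CV_16S matrix):

#include "opencv2/core.hpp"

int main()
{
    cv::Mat src(4, 4, CV_32F), half, back;
    cv::randu(src, cv::Scalar::all(0), cv::Scalar::all(1));

    cv::convertFp16(src, half);   // CV_32F -> FP16 bits stored as CV_16S
    cv::convertFp16(half, back);  // FP16 -> CV_32F
    return 0;
}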
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
#include "precomp.hpp"
#include "convert.hpp"
namespace cv
{
namespace opt_SSE4_1
{
int cvtScale_SIMD_u8u16f32_SSE41(const uchar * src, ushort * dst, int width, float scale, float shift)
{
int x = 0;
__m128i v_zero = _mm_setzero_si128();
__m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
for ( ; x <= width - 8; x += 8)
{
__m128i v_src = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i const *)(src + x)), v_zero);
__m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
__m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
__m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
__m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
_mm_cvtps_epi32(v_dst_1));
_mm_storeu_si128((__m128i *)(dst + x), v_dst);
}
return x;
}
int cvtScale_SIMD_s8u16f32_SSE41(const schar * src, ushort * dst, int width, float scale, float shift)
{
int x = 0;
__m128i v_zero = _mm_setzero_si128();
__m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
for ( ; x <= width - 8; x += 8)
{
__m128i v_src = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, _mm_loadl_epi64((__m128i const *)(src + x))), 8);
__m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
__m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
__m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
__m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
_mm_cvtps_epi32(v_dst_1));
_mm_storeu_si128((__m128i *)(dst + x), v_dst);
}
return x;
}
int cvtScale_SIMD_u16u16f32_SSE41(const ushort * src, ushort * dst, int width, float scale, float shift)
{
int x = 0;
__m128i v_zero = _mm_setzero_si128();
__m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
for ( ; x <= width - 8; x += 8)
{
__m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
__m128 v_src_f = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero));
__m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
v_src_f = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero));
__m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
__m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
_mm_cvtps_epi32(v_dst_1));
_mm_storeu_si128((__m128i *)(dst + x), v_dst);
}
return x;
}
int cvtScale_SIMD_s16u16f32_SSE41(const short * src, ushort * dst, int width, float scale, float shift)
{
int x = 0;
__m128i v_zero = _mm_setzero_si128();
__m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
for ( ; x <= width - 8; x += 8)
{
__m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
__m128 v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16));
__m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
v_src_f = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16));
__m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src_f, v_scale), v_shift);
__m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
_mm_cvtps_epi32(v_dst_1));
_mm_storeu_si128((__m128i *)(dst + x), v_dst);
}
return x;
}
int cvtScale_SIMD_s32u16f32_SSE41(const int * src, ushort * dst, int width, float scale, float shift)
{
int x = 0;
__m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
for ( ; x <= width - 8; x += 8)
{
__m128i v_src = _mm_loadu_si128((__m128i const *)(src + x));
__m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
v_src = _mm_loadu_si128((__m128i const *)(src + x + 4));
__m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(v_src), v_scale), v_shift);
__m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
_mm_cvtps_epi32(v_dst_1));
_mm_storeu_si128((__m128i *)(dst + x), v_dst);
}
return x;
}
int cvtScale_SIMD_f32u16f32_SSE41(const float * src, ushort * dst, int width, float scale, float shift)
{
int x = 0;
__m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
for ( ; x <= width - 8; x += 8)
{
__m128 v_src = _mm_loadu_ps(src + x);
__m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
v_src = _mm_loadu_ps(src + x + 4);
__m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
__m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
_mm_cvtps_epi32(v_dst_1));
_mm_storeu_si128((__m128i *)(dst + x), v_dst);
}
return x;
}
int cvtScale_SIMD_f64u16f32_SSE41(const double * src, ushort * dst, int width, float scale, float shift)
{
int x = 0;
__m128 v_scale = _mm_set1_ps(scale), v_shift = _mm_set1_ps(shift);
for ( ; x <= width - 8; x += 8)
{
__m128 v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x)),
_mm_cvtpd_ps(_mm_loadu_pd(src + x + 2)));
__m128 v_dst_0 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
v_src = _mm_movelh_ps(_mm_cvtpd_ps(_mm_loadu_pd(src + x + 4)),
_mm_cvtpd_ps(_mm_loadu_pd(src + x + 6)));
__m128 v_dst_1 = _mm_add_ps(_mm_mul_ps(v_src, v_scale), v_shift);
__m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_dst_0),
_mm_cvtps_epi32(v_dst_1));
_mm_storeu_si128((__m128i *)(dst + x), v_dst);
}
return x;
}
int Cvt_SIMD_f64u16_SSE41(const double * src, ushort * dst, int width)
{
int x = 0;
for ( ; x <= width - 8; x += 8)
{
__m128 v_src0 = _mm_cvtpd_ps(_mm_loadu_pd(src + x));
__m128 v_src1 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 2));
__m128 v_src2 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 4));
__m128 v_src3 = _mm_cvtpd_ps(_mm_loadu_pd(src + x + 6));
v_src0 = _mm_movelh_ps(v_src0, v_src1);
v_src1 = _mm_movelh_ps(v_src2, v_src3);
__m128i v_dst = _mm_packus_epi32(_mm_cvtps_epi32(v_src0),
_mm_cvtps_epi32(v_src1));
_mm_storeu_si128((__m128i *)(dst + x), v_dst);
}
return x;
}
}
} // cv::
/* End of file. */
......@@ -183,7 +183,7 @@ class Builder:
cmakecmd = self.getCMakeArgs(arch, target) + \
(["-DCMAKE_TOOLCHAIN_FILE=%s" % toolchain] if toolchain is not None else [])
if target.lower().startswith("iphoneos"):
cmakecmd.append("-DENABLE_NEON=ON")
cmakecmd.append("-DCPU_BASELINE=NEON;FP16")
cmakecmd.append(self.opencv)
cmakecmd.extend(cmakeargs)
execute(cmakecmd, cwd = builddir)
......