Commit 4a54aa3f authored by Vitaly Tuzov

Cleared up deprecated intrinsics for FP16

parent 5c0a98cf
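In short, the patch deletes the backend-specific FP16 helpers (v256_load_fp16_f32 / v_store_fp16 in the AVX implementation, the cv_vld1_f16 / cv_vst1_f16 workarounds plus v128_load_fp16_f32 / v_store_fp16 in the NEON implementation, and v128_load_fp16_f32 / v_store_fp16 in the SSE implementation) and, in the SSE hunks shown below, folds the hardware F16C path into the generic v_load_expand(const float16_t*) / v_pack_store(float16_t*, ...) intrinsics under CV_FP16, keeping the existing bit-manipulation fallback under #else. A minimal usage sketch of those generic intrinsics follows; it is not part of the patch, the function and buffer names are illustrative, and it assumes the usual opencv2/core/hal/intrin.hpp include:

#include "opencv2/core/hal/intrin.hpp"

// Scale an FP16 buffer into dst via FP32 arithmetic. v_load_expand() widens
// four half-precision values to float32 (F16C/NEON when CV_FP16 is set,
// otherwise the bit-manipulation fallback retained in the SSE hunks below),
// and v_pack_store() narrows the result back to FP16.
static void scale_fp16_buffer(const cv::float16_t* src, cv::float16_t* dst,
                              int n, float scale)
{
    cv::v_float32x4 vscale = cv::v_setall_f32(scale);
    int i = 0;
    for (; i <= n - 4; i += 4)
    {
        cv::v_float32x4 v = cv::v_load_expand(src + i);
        cv::v_pack_store(dst + i, v * vscale);
    }
    for (; i < n; i++)                      // scalar tail
        dst[i] = cv::float16_t((float)src[i] * scale);
}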
@@ -431,19 +431,6 @@ inline v_float64x4 v_reinterpret_as_f64(const v_float64x4& a)
inline v_float64x4 v_reinterpret_as_f64(const v_float32x8& a)
{ return v_float64x4(_mm256_castps_pd(a.val)); }
#if CV_FP16
inline v_float32x8 v256_load_fp16_f32(const short* ptr)
{
return v_float32x8(_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr)));
}
inline void v_store_fp16(short* ptr, const v_float32x8& a)
{
__m128i fp16_value = _mm256_cvtps_ph(a.val, 0);
_mm_store_si128((__m128i*)ptr, fp16_value);
}
#endif
/* Recombine */
/*#define OPENCV_HAL_IMPL_AVX_COMBINE(_Tpvec, perm) \
inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
@@ -1400,7 +1387,7 @@ inline v_float32x8 v_cvt_f32(const v_float64x4& a)
inline v_float32x8 v_cvt_f32(const v_float64x4& a, const v_float64x4& b)
{
__m128 af = _mm256_cvtpd_ps(a.val), bf = _mm256_cvtpd_ps(b.val);
return v_float32x8(_mm256_insertf128_ps(_mm256_castps128_ps256(af), bf, 1));
return v_float32x8(_v256_combine(af, bf));
}
inline v_float64x4 v_cvt_f64(const v_int32x8& a)
@@ -1474,7 +1461,7 @@ inline v_int32x8 v256_lut_pairs(const int* tab, const int* idx)
}
inline v_int32x8 v256_lut_quads(const int* tab, const int* idx)
{
return v_int32x8(_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)(tab + idx[0]))), _mm_loadu_si128((const __m128i*)(tab + idx[1])), 0x1));
return v_int32x8(_v256_combine(_mm_loadu_si128((const __m128i*)(tab + idx[0])), _mm_loadu_si128((const __m128i*)(tab + idx[1]))));
}
inline v_uint32x8 v256_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut((const int *)tab, idx)); }
inline v_uint32x8 v256_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_pairs((const int *)tab, idx)); }
@@ -1490,7 +1477,7 @@ inline v_int64x4 v256_lut(const int64* tab, const int* idx)
}
inline v_int64x4 v256_lut_pairs(const int64* tab, const int* idx)
{
return v_int64x4(_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)(tab + idx[0]))), _mm_loadu_si128((const __m128i*)(tab + idx[1])), 0x1));
return v_int64x4(_v256_combine(_mm_loadu_si128((const __m128i*)(tab + idx[0])), _mm_loadu_si128((const __m128i*)(tab + idx[1]))));
}
inline v_uint64x4 v256_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut((const int64 *)tab, idx)); }
inline v_uint64x4 v256_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut_pairs((const int64 *)tab, idx)); }
@@ -1506,7 +1493,7 @@ inline v_float64x4 v256_lut(const double* tab, const int* idx)
{
return v_float64x4(_mm256_i32gather_pd(tab, _mm_loadu_si128((const __m128i*)idx), 8));
}
inline v_float64x4 v256_lut_pairs(const double* tab, const int* idx) { return v_float64x4(_mm256_insertf128_pd(_mm256_castpd128_pd256(_mm_loadu_pd(tab + idx[0])), _mm_loadu_pd(tab + idx[1]), 0x1)); }
inline v_float64x4 v256_lut_pairs(const double* tab, const int* idx) { return v_float64x4(_v256_combine(_mm_loadu_pd(tab + idx[0]), _mm_loadu_pd(tab + idx[1]))); }
inline v_int32x8 v_lut(const int* tab, const v_int32x8& idxvec)
{
......
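Side note on the rewritten AVX lines above: judging by the code it replaces, _v256_combine is the header's existing helper for joining two 128-bit halves into one 256-bit register. A hedged sketch of the float overload, inferred from the _mm256_insertf128_ps/_mm256_castps128_ps256 idiom it replaces (the real helper lives elsewhere in the header and also covers the integer and double cases):

#include <immintrin.h>

// lo goes to the low 128 bits, hi is inserted into the high 128 bits.
inline __m256 _v256_combine(const __m128& lo, const __m128& hi)
{
    return _mm256_insertf128_ps(_mm256_castps128_ps256(lo), hi, 1);
}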
@@ -278,48 +278,6 @@ struct v_float64x2
};
#endif
#if CV_FP16
// Workaround for old compilers
static inline int16x4_t vreinterpret_s16_f16(float16x4_t a) { return (int16x4_t)a; }
static inline float16x4_t vreinterpret_f16_s16(int16x4_t a) { return (float16x4_t)a; }
static inline float16x4_t cv_vld1_f16(const void* ptr)
{
#ifndef vld1_f16 // APPLE compiler defines vld1_f16 as macro
return vreinterpret_f16_s16(vld1_s16((const short*)ptr));
#else
return vld1_f16((const __fp16*)ptr);
#endif
}
static inline void cv_vst1_f16(void* ptr, float16x4_t a)
{
#ifndef vst1_f16 // APPLE compiler defines vst1_f16 as macro
vst1_s16((short*)ptr, vreinterpret_s16_f16(a));
#else
vst1_f16((__fp16*)ptr, a);
#endif
}
#ifndef vdup_n_f16
#define vdup_n_f16(v) (float16x4_t){v, v, v, v}
#endif
#endif // CV_FP16
#if CV_FP16
inline v_float32x4 v128_load_fp16_f32(const short* ptr)
{
float16x4_t a = cv_vld1_f16((const __fp16*)ptr);
return v_float32x4(vcvt_f32_f16(a));
}
inline void v_store_fp16(short* ptr, const v_float32x4& a)
{
float16x4_t fp16 = vcvt_f16_f32(a.val);
cv_vst1_f16((short*)ptr, fp16);
}
#endif
#define OPENCV_HAL_IMPL_NEON_INIT(_Tpv, _Tp, suffix) \
inline v_##_Tpv v_setzero_##suffix() { return v_##_Tpv(vdupq_n_##suffix((_Tp)0)); } \
inline v_##_Tpv v_setall_##suffix(_Tp v) { return v_##_Tpv(vdupq_n_##suffix(v)); } \
......
@@ -2684,19 +2684,6 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
return v_float64x2(_mm_cvtps_pd(_mm_movehl_ps(a.val, a.val)));
}
#if CV_FP16
inline v_float32x4 v128_load_fp16_f32(const short* ptr)
{
return v_float32x4(_mm_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr)));
}
inline void v_store_fp16(short* ptr, const v_float32x4& a)
{
__m128i fp16_value = _mm_cvtps_ph(a.val, 0);
_mm_storel_epi64((__m128i*)ptr, fp16_value);
}
#endif
////////////// Lookup table access ////////////////////
inline v_int8x16 v_lut(const schar* tab, const int* idx)
@@ -2956,6 +2943,9 @@ inline v_float32x4 v_pack_triplets(const v_float32x4& vec) { return vec; }
inline v_float32x4 v_load_expand(const float16_t* ptr)
{
#if CV_FP16
return v_float32x4(_mm_cvtph_ps(_mm_loadu_si128((const __m128i*)ptr)));
#else
const __m128i z = _mm_setzero_si128(), delta = _mm_set1_epi32(0x38000000);
const __m128i signmask = _mm_set1_epi32(0x80000000), maxexp = _mm_set1_epi32(0x7c000000);
const __m128 deltaf = _mm_castsi128_ps(_mm_set1_epi32(0x38800000));
@@ -2968,10 +2958,15 @@ inline v_float32x4 v_load_expand(const float16_t* ptr)
__m128i zmask = _mm_cmpeq_epi32(e, z);
__m128i ft = v_select_si128(zmask, zt, t);
return v_float32x4(_mm_castsi128_ps(_mm_or_si128(ft, sign)));
#endif
}
inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
{
#if CV_FP16
__m128i fp16_value = _mm_cvtps_ph(v.val, 0);
_mm_storel_epi64((__m128i*)ptr, fp16_value);
#else
const __m128i signmask = _mm_set1_epi32(0x80000000);
const __m128i rval = _mm_set1_epi32(0x3f000000);
@@ -2993,6 +2988,7 @@ inline void v_pack_store(float16_t* ptr, const v_float32x4& v)
t = _mm_or_si128(t, sign);
t = _mm_packs_epi32(t, t);
_mm_storel_epi64((__m128i*)ptr, t);
#endif
}
inline void v_cleanup() {}
......
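For reference, the #else branch kept in v_load_expand() above is a vectorized form of the standard binary16-to-binary32 bit manipulation: re-bias the exponent by +112 (the 0x38000000 constant), widen the mantissa by 13 bits, and patch up zeros/subnormals and Inf/NaN separately. A simplified scalar sketch of the same idea, for illustration only: it collapses subnormals to zero, which the vector code instead handles via the 0x38800000 magic-float trick.

#include <cstdint>
#include <cstring>

static float half_to_float_scalar(uint16_t h)
{
    uint32_t sign    = (uint32_t)(h & 0x8000) << 16;   // sign moves to bit 31
    uint32_t exp_man = (uint32_t)(h & 0x7fff) << 13;   // exponent + mantissa, widened
    uint32_t bits;
    if ((h & 0x7c00) == 0x7c00)            // Inf/NaN: force the max FP32 exponent
        bits = sign | exp_man | 0x7f800000;
    else if ((h & 0x7c00) == 0)            // zero/subnormal: simplified to zero here
        bits = sign;
    else                                   // normal: re-bias exponent by (127-15) << 23
        bits = sign | (exp_man + 0x38000000);
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}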