Commit 334c4d62 authored by Vitaly Tuzov, committed by Alexander Alekhin

Merge pull request #13781 from terfendail:warp_wintr

Resize reworked using wide universal intrinsics (#13781)

* Added a wide universal intrinsics optimized implementation for 3-channel bit-exact linear resize

* Reworked linear resize using new wide LUT intrinsics

* Fix for VSX intrinsics
parent 8cedc052
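
The heart of the change is the set of width-agnostic LUT intrinsics defined below: vx_lut gathers one table element per lane, vx_lut_pairs one pair per two lanes, and vx_lut_quads one quad per four lanes, at whatever register width the build provides. A minimal usage sketch in the spirit of the reworked resize horizontal pass (illustrative only, not code from this commit; the function name and the ofst[] offset layout are hypothetical):

#include "opencv2/core/hal/intrin.hpp"

// Gather pairs of neighbouring source samples: each entry of ofst[] holds
// the index of the first sample of a pair, so a single vx_lut_pairs() call
// fills a whole register regardless of its width.
static void gather_sample_pairs(const float* src, const int* ofst, float* dst, int width)
{
    int i = 0;
#if CV_SIMD
    for (; i <= width - v_float32::nlanes; i += v_float32::nlanes)
        vx_store(dst + i, vx_lut_pairs(src, ofst + i / 2));
#endif
    for (; i < width; i += 2) // scalar tail, one pair at a time
    {
        dst[i]     = src[ofst[i / 2]];
        dst[i + 1] = src[ofst[i / 2] + 1];
    }
}
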
@@ -234,7 +234,12 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
inline vtyp vx_##loadsfx##_low(const typ* ptr) { return prefix##_##loadsfx##_low(ptr); } \
inline vtyp vx_##loadsfx##_halves(const typ* ptr0, const typ* ptr1) { return prefix##_##loadsfx##_halves(ptr0, ptr1); } \
inline void vx_store(typ* ptr, const vtyp& v) { return v_store(ptr, v); } \
inline void vx_store_aligned(typ* ptr, const vtyp& v) { return v_store_aligned(ptr, v); } \
inline vtyp vx_lut(const typ* ptr, const int* idx) { return prefix##_lut(ptr, idx); } \
inline vtyp vx_lut_pairs(const typ* ptr, const int* idx) { return prefix##_lut_pairs(ptr, idx); }
#define CV_INTRIN_DEFINE_WIDE_LUT_QUAD(typ, vtyp, prefix) \
inline vtyp vx_lut_quads(const typ* ptr, const int* idx) { return prefix##_lut_quads(ptr, idx); }
#define CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(typ, wtyp, prefix) \
inline wtyp vx_load_expand(const typ* ptr) { return prefix##_load_expand(ptr); }
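
For clarity, this is what the new members expand to for one concrete instantiation, assuming the 128-bit prefix v and the int element type (a hand expansion, shown only to illustrate the token pasting above):

// from CV_INTRIN_DEFINE_WIDE_INTRIN(int, v_int32, s32, v, load):
inline v_int32 vx_lut(const int* ptr, const int* idx) { return v_lut(ptr, idx); }
inline v_int32 vx_lut_pairs(const int* ptr, const int* idx) { return v_lut_pairs(ptr, idx); }
// from CV_INTRIN_DEFINE_WIDE_LUT_QUAD(int, v_int32, v):
inline v_int32 vx_lut_quads(const int* ptr, const int* idx) { return v_lut_quads(ptr, idx); }
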
@@ -244,6 +249,7 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#define CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(typ, vtyp, short_typ, wtyp, qtyp, prefix, loadsfx) \
CV_INTRIN_DEFINE_WIDE_INTRIN(typ, vtyp, short_typ, prefix, loadsfx) \
CV_INTRIN_DEFINE_WIDE_LUT_QUAD(typ, vtyp, prefix) \
CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(typ, wtyp, prefix) \
CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND_Q(typ, qtyp, prefix)
@@ -251,14 +257,19 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(uchar, v_uint8, u8, v_uint16, v_uint32, prefix, load) \
CV_INTRIN_DEFINE_WIDE_INTRIN_WITH_EXPAND(schar, v_int8, s8, v_int16, v_int32, prefix, load) \
CV_INTRIN_DEFINE_WIDE_INTRIN(ushort, v_uint16, u16, prefix, load) \
CV_INTRIN_DEFINE_WIDE_LUT_QUAD(ushort, v_uint16, prefix) \
CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(ushort, v_uint32, prefix) \
CV_INTRIN_DEFINE_WIDE_INTRIN(short, v_int16, s16, prefix, load) \
CV_INTRIN_DEFINE_WIDE_LUT_QUAD(short, v_int16, prefix) \
CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(short, v_int32, prefix) \
CV_INTRIN_DEFINE_WIDE_INTRIN(int, v_int32, s32, prefix, load) \
CV_INTRIN_DEFINE_WIDE_LUT_QUAD(int, v_int32, prefix) \
CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(int, v_int64, prefix) \
CV_INTRIN_DEFINE_WIDE_INTRIN(unsigned, v_uint32, u32, prefix, load) \
CV_INTRIN_DEFINE_WIDE_LUT_QUAD(unsigned, v_uint32, prefix) \
CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(unsigned, v_uint64, prefix) \
CV_INTRIN_DEFINE_WIDE_INTRIN(float, v_float32, f32, prefix, load) \
CV_INTRIN_DEFINE_WIDE_LUT_QUAD(float, v_float32, prefix) \
CV_INTRIN_DEFINE_WIDE_INTRIN(int64, v_int64, s64, prefix, load) \
CV_INTRIN_DEFINE_WIDE_INTRIN(uint64, v_uint64, u64, prefix, load) \
CV_INTRIN_DEFINE_WIDE_LOAD_EXPAND(float16_t, v_float32, prefix)
@@ -335,11 +346,11 @@ namespace CV__SIMD_NAMESPACE {
typedef v_uint64x4 v_uint64;
typedef v_int64x4 v_int64;
typedef v_float32x8 v_float32;
#if CV_SIMD256_64F
typedef v_float64x4 v_float64;
CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v256)
CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v256, load)
#endif
inline void vx_cleanup() { v256_cleanup(); }
} // namespace
using namespace CV__SIMD_NAMESPACE;
@@ -358,11 +369,9 @@ namespace CV__SIMD_NAMESPACE {
typedef v_uint64x2 v_uint64;
typedef v_int64x2 v_int64;
typedef v_float32x4 v_float32;
CV_INTRIN_DEFINE_WIDE_INTRIN_ALL_TYPES(v)
#if CV_SIMD128_64F
typedef v_float64x2 v_float64;
CV_INTRIN_DEFINE_WIDE_INTRIN(double, v_float64, f64, v, load)
#endif
inline void vx_cleanup() { v_cleanup(); }
...
@@ -1417,6 +1417,97 @@ inline v_float64x4 v_cvt_f64_high(const v_float32x8& a)
////////////// Lookup table access ////////////////////
inline v_int8x32 v256_lut(const schar* tab, const int* idx)
{
return v_int8x32(_mm256_setr_epi8(tab[idx[ 0]], tab[idx[ 1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
tab[idx[ 8]], tab[idx[ 9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]],
tab[idx[16]], tab[idx[17]], tab[idx[18]], tab[idx[19]], tab[idx[20]], tab[idx[21]], tab[idx[22]], tab[idx[23]],
tab[idx[24]], tab[idx[25]], tab[idx[26]], tab[idx[27]], tab[idx[28]], tab[idx[29]], tab[idx[30]], tab[idx[31]]));
}
inline v_int8x32 v256_lut_pairs(const schar* tab, const int* idx)
{
return v_int8x32(_mm256_setr_epi16(*(const short*)(tab + idx[ 0]), *(const short*)(tab + idx[ 1]), *(const short*)(tab + idx[ 2]), *(const short*)(tab + idx[ 3]),
*(const short*)(tab + idx[ 4]), *(const short*)(tab + idx[ 5]), *(const short*)(tab + idx[ 6]), *(const short*)(tab + idx[ 7]),
*(const short*)(tab + idx[ 8]), *(const short*)(tab + idx[ 9]), *(const short*)(tab + idx[10]), *(const short*)(tab + idx[11]),
*(const short*)(tab + idx[12]), *(const short*)(tab + idx[13]), *(const short*)(tab + idx[14]), *(const short*)(tab + idx[15])));
}
inline v_int8x32 v256_lut_quads(const schar* tab, const int* idx)
{
return v_int8x32(_mm256_i32gather_epi32((const int*)tab, _mm256_loadu_si256((const __m256i*)idx), 1));
}
inline v_uint8x32 v256_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut((const schar *)tab, idx)); }
inline v_uint8x32 v256_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut_pairs((const schar *)tab, idx)); }
inline v_uint8x32 v256_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v256_lut_quads((const schar *)tab, idx)); }
inline v_int16x16 v256_lut(const short* tab, const int* idx)
{
return v_int16x16(_mm256_setr_epi16(tab[idx[0]], tab[idx[1]], tab[idx[ 2]], tab[idx[ 3]], tab[idx[ 4]], tab[idx[ 5]], tab[idx[ 6]], tab[idx[ 7]],
tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]));
}
inline v_int16x16 v256_lut_pairs(const short* tab, const int* idx)
{
return v_int16x16(_mm256_i32gather_epi32((const int*)tab, _mm256_loadu_si256((const __m256i*)idx), 2));
}
inline v_int16x16 v256_lut_quads(const short* tab, const int* idx)
{
#if defined(__GNUC__)
return v_int16x16(_mm256_i32gather_epi64((const long long int*)tab, _mm_loadu_si128((const __m128i*)idx), 2)); // GCC's prototype of this intrinsic declares the base pointer as const long long int*
#else
return v_int16x16(_mm256_i32gather_epi64((const int64*)tab, _mm_loadu_si128((const __m128i*)idx), 2));
#endif
}
inline v_uint16x16 v256_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut((const short *)tab, idx)); }
inline v_uint16x16 v256_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut_pairs((const short *)tab, idx)); }
inline v_uint16x16 v256_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v256_lut_quads((const short *)tab, idx)); }
inline v_int32x8 v256_lut(const int* tab, const int* idx)
{
return v_int32x8(_mm256_i32gather_epi32(tab, _mm256_loadu_si256((const __m256i*)idx), 4));
}
inline v_int32x8 v256_lut_pairs(const int* tab, const int* idx)
{
#if defined(__GNUC__)
return v_int32x8(_mm256_i32gather_epi64((const long long int*)tab, _mm_loadu_si128((const __m128i*)idx), 4));
#else
return v_int32x8(_mm256_i32gather_epi64((const int64*)tab, _mm_loadu_si128((const __m128i*)idx), 4));
#endif
}
inline v_int32x8 v256_lut_quads(const int* tab, const int* idx)
{
return v_int32x8(_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)(tab + idx[0]))), _mm_loadu_si128((const __m128i*)(tab + idx[1])), 0x1));
}
inline v_uint32x8 v256_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut((const int *)tab, idx)); }
inline v_uint32x8 v256_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_pairs((const int *)tab, idx)); }
inline v_uint32x8 v256_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v256_lut_quads((const int *)tab, idx)); }
inline v_int64x4 v256_lut(const int64* tab, const int* idx)
{
#if defined(__GNUC__)
return v_int64x4(_mm256_i32gather_epi64((const long long int*)tab, _mm_loadu_si128((const __m128i*)idx), 8));
#else
return v_int64x4(_mm256_i32gather_epi64(tab, _mm_loadu_si128((const __m128i*)idx), 8));
#endif
}
inline v_int64x4 v256_lut_pairs(const int64* tab, const int* idx)
{
return v_int64x4(_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)(tab + idx[0]))), _mm_loadu_si128((const __m128i*)(tab + idx[1])), 0x1));
}
inline v_uint64x4 v256_lut(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut((const int64 *)tab, idx)); }
inline v_uint64x4 v256_lut_pairs(const uint64* tab, const int* idx) { return v_reinterpret_as_u64(v256_lut_pairs((const int64 *)tab, idx)); }
inline v_float32x8 v256_lut(const float* tab, const int* idx)
{
return v_float32x8(_mm256_i32gather_ps(tab, _mm256_loadu_si256((const __m256i*)idx), 4));
}
inline v_float32x8 v256_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v256_lut_pairs((const int *)tab, idx)); }
inline v_float32x8 v256_lut_quads(const float* tab, const int* idx) { return v_reinterpret_as_f32(v256_lut_quads((const int *)tab, idx)); }
inline v_float64x4 v256_lut(const double* tab, const int* idx)
{
return v_float64x4(_mm256_i32gather_pd(tab, _mm_loadu_si128((const __m128i*)idx), 8));
}
inline v_float64x4 v256_lut_pairs(const double* tab, const int* idx) { return v_float64x4(_mm256_insertf128_pd(_mm256_castpd128_pd256(_mm_loadu_pd(tab + idx[0])), _mm_loadu_pd(tab + idx[1]), 0x1)); }
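
A note on the gathers above: the last argument of _mm256_i32gather_* is a byte scale applied to each 32-bit index, so the 8-bit tables use scale 1 (four consecutive schar per index), the 16-bit tables scale 2, and the 32-bit tables scale 4. A standalone sketch of the 32-bit case (illustrative; the helper name is hypothetical and AVX2 is assumed):

#include <immintrin.h>

// gathers tab[idx[0..7]] with one instruction; scale 4 == sizeof(int)
static __m256i gather_s32(const int* tab, const int* idx)
{
    __m256i vidx = _mm256_loadu_si256((const __m256i*)idx);
    return _mm256_i32gather_epi32(tab, vidx, 4);
}
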
inline v_int32x8 v_lut(const int* tab, const v_int32x8& idxvec)
{
return v_int32x8(_mm256_i32gather_epi32(tab, idxvec.val, 4));
@@ -1476,6 +1567,49 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x8& idxvec, v_flo
y = v_float64x4(_mm256_unpackhi_pd(xy02, xy13));
}
inline v_int8x32 v_interleave_pairs(const v_int8x32& vec)
{
return v_int8x32(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0d0e0c0b090a08, 0x0705060403010200, 0x0f0d0e0c0b090a08, 0x0705060403010200)));
}
inline v_uint8x32 v_interleave_pairs(const v_uint8x32& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
inline v_int8x32 v_interleave_quads(const v_int8x32& vec)
{
return v_int8x32(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0b0e0a0d090c08, 0x0703060205010400, 0x0f0b0e0a0d090c08, 0x0703060205010400)));
}
inline v_uint8x32 v_interleave_quads(const v_uint8x32& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
inline v_int16x16 v_interleave_pairs(const v_int16x16& vec)
{
return v_int16x16(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0e0b0a0d0c0908, 0x0706030205040100, 0x0f0e0b0a0d0c0908, 0x0706030205040100)));
}
inline v_uint16x16 v_interleave_pairs(const v_uint16x16& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
inline v_int16x16 v_interleave_quads(const v_int16x16& vec)
{
return v_int16x16(_mm256_shuffle_epi8(vec.val, _mm256_set_epi64x(0x0f0e07060d0c0504, 0x0b0a030209080100, 0x0f0e07060d0c0504, 0x0b0a030209080100)));
}
inline v_uint16x16 v_interleave_quads(const v_uint16x16& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
inline v_int32x8 v_interleave_pairs(const v_int32x8& vec)
{
return v_int32x8(_mm256_shuffle_epi32(vec.val, _MM_SHUFFLE(3, 1, 2, 0)));
}
inline v_uint32x8 v_interleave_pairs(const v_uint32x8& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
inline v_float32x8 v_interleave_pairs(const v_float32x8& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
inline v_int8x32 v_pack_triplets(const v_int8x32& vec)
{
return v_int8x32(_mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(vec.val, _mm256_broadcastsi128_si256(_mm_set_epi64x(0xffffff0f0e0d0c0a, 0x0908060504020100))),
_mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
}
inline v_uint8x32 v_pack_triplets(const v_uint8x32& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
inline v_int16x16 v_pack_triplets(const v_int16x16& vec)
{
return v_int16x16(_mm256_permutevar8x32_epi32(_mm256_shuffle_epi8(vec.val, _mm256_broadcastsi128_si256(_mm_set_epi64x(0xffff0f0e0d0c0b0a, 0x0908050403020100))),
_mm256_set_epi64x(0x0000000700000007, 0x0000000600000005, 0x0000000400000002, 0x0000000100000000)));
}
inline v_uint16x16 v_pack_triplets(const v_uint16x16& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
////////// Matrix operations /////////
inline v_int32x8 v_dotprod(const v_int16x16& a, const v_int16x16& b)
...
@@ -1799,6 +1799,28 @@ template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<float, n*2>& a)
return c;
}
template<typename _Tp> inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut(const _Tp* tab, const int* idx)
{
v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
for (int i = 0; i < V_TypeTraits<_Tp>::nlanes128; i++)
c.s[i] = tab[idx[i]];
return c;
}
template<typename _Tp> inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut_pairs(const _Tp* tab, const int* idx)
{
v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
for (int i = 0; i < V_TypeTraits<_Tp>::nlanes128; i++)
c.s[i] = tab[idx[i / 2] + i % 2];
return c;
}
template<typename _Tp> inline v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> v_lut_quads(const _Tp* tab, const int* idx)
{
v_reg<_Tp, V_TypeTraits<_Tp>::nlanes128> c;
for (int i = 0; i < V_TypeTraits<_Tp>::nlanes128; i++)
c.s[i] = tab[idx[i / 4] + i % 4];
return c;
}
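
These scalar reference versions pin down the indexing convention: one index per lane for v_lut, one per pair of lanes for v_lut_pairs, one per quad for v_lut_quads, with each index addressing the first element of a consecutive group. A worked example for 4-lane int vectors (illustrative only):

// tab = {10, 11, 12, 13, 14, 15, 16, 17}
// v_lut      (tab, {3, 0, 5, 6}) -> {13, 10, 15, 16}
// v_lut_pairs(tab, {4, 1})       -> {14, 15, 11, 12}
// v_lut_quads(tab, {2})          -> {12, 13, 14, 15}
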
template<int n> inline v_reg<int, n> v_lut(const int* tab, const v_reg<int, n>& idx)
{
v_reg<int, n> c;
@@ -1807,6 +1829,14 @@ template<int n> inline v_reg<int, n> v_lut(const int* tab, const v_reg<int, n>&
return c;
}
template<int n> inline v_reg<unsigned, n> v_lut(const unsigned* tab, const v_reg<int, n>& idx)
{
v_reg<unsigned, n> c;
for (int i = 0; i < n; i++)
c.s[i] = tab[idx.s[i]];
return c;
}
template<int n> inline v_reg<float, n> v_lut(const float* tab, const v_reg<int, n>& idx)
{
v_reg<float, n> c;
@@ -1845,6 +1875,49 @@ template<int n> inline void v_lut_deinterleave(const double* tab, const v_reg<in
}
}
template<typename _Tp, int n> inline v_reg<_Tp, n> v_interleave_pairs(const v_reg<_Tp, n>& vec)
{
v_reg<_Tp, n> c;
for (int i = 0; i < n/4; i++)
{
c.s[4*i ] = vec.s[4*i ];
c.s[4*i+1] = vec.s[4*i+2];
c.s[4*i+2] = vec.s[4*i+1];
c.s[4*i+3] = vec.s[4*i+3];
}
return c;
}
template<typename _Tp, int n> inline v_reg<_Tp, n> v_interleave_quads(const v_reg<_Tp, n>& vec)
{
v_reg<_Tp, n> c;
for (int i = 0; i < n/8; i++)
{
c.s[8*i ] = vec.s[8*i ];
c.s[8*i+1] = vec.s[8*i+4];
c.s[8*i+2] = vec.s[8*i+1];
c.s[8*i+3] = vec.s[8*i+5];
c.s[8*i+4] = vec.s[8*i+2];
c.s[8*i+5] = vec.s[8*i+6];
c.s[8*i+6] = vec.s[8*i+3];
c.s[8*i+7] = vec.s[8*i+7];
}
return c;
}
template<typename _Tp, int n> inline v_reg<_Tp, n> v_pack_triplets(const v_reg<_Tp, n>& vec)
{
v_reg<_Tp, n> c;
for (int i = 0; i < n/4; i++)
{
c.s[3*i ] = vec.s[4*i ];
c.s[3*i+1] = vec.s[4*i+1];
c.s[3*i+2] = vec.s[4*i+2];
}
return c;
}
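
Spelled out for one 8-lane vector, the lane mappings these fallbacks (and the SIMD versions in the other files) implement are (illustrative only):

// input vector {a0,a1,a2,a3,a4,a5,a6,a7}
// v_interleave_pairs -> {a0,a2,a1,a3,  a4,a6,a5,a7}
// v_interleave_quads -> {a0,a4,a1,a5,  a2,a6,a3,a7}
// v_pack_triplets    -> {a0,a1,a2,  a4,a5,a6,  ?,?}  (last n/4 lanes are left unspecified)
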
/** @brief Transpose 4x4 matrix
Scheme:
...
@@ -1572,6 +1572,162 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
////////////// Lookup table access ////////////////////
inline v_int8x16 v_lut(const schar* tab, const int* idx)
{
schar CV_DECL_ALIGNED(32) elems[16] =
{
tab[idx[ 0]],
tab[idx[ 1]],
tab[idx[ 2]],
tab[idx[ 3]],
tab[idx[ 4]],
tab[idx[ 5]],
tab[idx[ 6]],
tab[idx[ 7]],
tab[idx[ 8]],
tab[idx[ 9]],
tab[idx[10]],
tab[idx[11]],
tab[idx[12]],
tab[idx[13]],
tab[idx[14]],
tab[idx[15]]
};
return v_int8x16(vld1q_s8(elems));
}
inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
{
short CV_DECL_ALIGNED(32) elems[8] =
{
*(short*)(tab+idx[0]),
*(short*)(tab+idx[1]),
*(short*)(tab+idx[2]),
*(short*)(tab+idx[3]),
*(short*)(tab+idx[4]),
*(short*)(tab+idx[5]),
*(short*)(tab+idx[6]),
*(short*)(tab+idx[7])
};
return v_int8x16(vreinterpretq_s8_s16(vld1q_s16(elems)));
}
inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
{
int CV_DECL_ALIGNED(32) elems[4] =
{
*(int*)(tab + idx[0]),
*(int*)(tab + idx[1]),
*(int*)(tab + idx[2]),
*(int*)(tab + idx[3])
};
return v_int8x16(vreinterpretq_s8_s32(vld1q_s32(elems)));
}
inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((const schar*)tab, idx)); }
inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((const schar*)tab, idx)); }
inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((const schar*)tab, idx)); }
inline v_int16x8 v_lut(const short* tab, const int* idx)
{
short CV_DECL_ALIGNED(32) elems[8] =
{
tab[idx[0]],
tab[idx[1]],
tab[idx[2]],
tab[idx[3]],
tab[idx[4]],
tab[idx[5]],
tab[idx[6]],
tab[idx[7]]
};
return v_int16x8(vld1q_s16(elems));
}
inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
{
int CV_DECL_ALIGNED(32) elems[4] =
{
*(int*)(tab + idx[0]),
*(int*)(tab + idx[1]),
*(int*)(tab + idx[2]),
*(int*)(tab + idx[3])
};
return v_int16x8(vreinterpretq_s16_s32(vld1q_s32(elems)));
}
inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
{
int64 CV_DECL_ALIGNED(32) elems[2] =
{
*(int64*)(tab + idx[0]),
*(int64*)(tab + idx[1])
};
return v_int16x8(vreinterpretq_s16_s64(vld1q_s64(elems)));
}
inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((const short*)tab, idx)); }
inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((const short*)tab, idx)); }
inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((const short*)tab, idx)); }
inline v_int32x4 v_lut(const int* tab, const int* idx)
{
int CV_DECL_ALIGNED(32) elems[4] =
{
tab[idx[0]],
tab[idx[1]],
tab[idx[2]],
tab[idx[3]]
};
return v_int32x4(vld1q_s32(elems));
}
inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
{
int64 CV_DECL_ALIGNED(32) elems[2] =
{
*(int64*)(tab + idx[0]),
*(int64*)(tab + idx[1])
};
return v_int32x4(vreinterpretq_s32_s64(vld1q_s64(elems)));
}
inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
{
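// four consecutive 32-bit lanes fill the whole 128-bit register, so a single vector load from tab + idx[0] suffices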
return v_int32x4(vld1q_s32(tab + idx[0]));
}
inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int*)tab, idx)); }
inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int*)tab, idx)); }
inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((const int*)tab, idx)); }
inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
{
return v_int64x2(vcombine_s64(vcreate_s64(tab[idx[0]]), vcreate_s64(tab[idx[1]])));
}
inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
{
return v_int64x2(vld1q_s64(tab + idx[0]));
}
inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
inline v_float32x4 v_lut(const float* tab, const int* idx)
{
float CV_DECL_ALIGNED(32) elems[4] =
{
tab[idx[0]],
tab[idx[1]],
tab[idx[2]],
tab[idx[3]]
};
return v_float32x4(vld1q_f32(elems));
}
inline v_float32x4 v_lut_pairs(const float* tab, const int* idx)
{
uint64 CV_DECL_ALIGNED(32) elems[2] =
{
*(uint64*)(tab + idx[0]),
*(uint64*)(tab + idx[1])
};
return v_float32x4(vreinterpretq_f32_u64(vld1q_u64(elems)));
}
inline v_float32x4 v_lut_quads(const float* tab, const int* idx)
{
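// likewise for float: one quad spans the full register, so only idx[0] is used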
return v_float32x4(vld1q_f32(tab + idx[0]));
}
inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
{
int CV_DECL_ALIGNED(32) elems[4] =
@@ -1584,6 +1740,18 @@ inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
return v_int32x4(vld1q_s32(elems));
}
inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
{
unsigned CV_DECL_ALIGNED(32) elems[4] =
{
tab[vgetq_lane_s32(idxvec.val, 0)],
tab[vgetq_lane_s32(idxvec.val, 1)],
tab[vgetq_lane_s32(idxvec.val, 2)],
tab[vgetq_lane_s32(idxvec.val, 3)]
};
return v_uint32x4(vld1q_u32(elems));
}
inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
{
float CV_DECL_ALIGNED(32) elems[4] =
@@ -1614,7 +1782,64 @@ inline void v_lut_deinterleave(const float* tab, const v_int32x4& idxvec, v_floa
y = v_float32x4(tab[idx[0]+1], tab[idx[1]+1], tab[idx[2]+1], tab[idx[3]+1]);
}
inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
{
return v_int8x16(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0705060403010200)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0705060403010200))));
}
inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
{
return v_int8x16(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0703060205010400)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0703060205010400))));
}
inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
{
return v_int16x8(vreinterpretq_s16_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0706030205040100)), vtbl1_s8(vget_high_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0706030205040100)))));
}
inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
{
return v_int16x8(vreinterpretq_s16_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0b0a030209080100)), vtbl1_s8(vget_high_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0b0a030209080100)))));
}
inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
{
int32x2x2_t res = vzip_s32(vget_low_s32(vec.val), vget_high_s32(vec.val));
return v_int32x4(vcombine_s32(res.val[0], res.val[1]));
}
inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
{
return v_int8x16(vextq_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vec.val), vcreate_s8(0x0605040201000000)), vtbl1_s8(vget_high_s8(vec.val), vcreate_s8(0x0807060504020100))), vdupq_n_s8(0), 2));
}
inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
{
return v_int16x8(vreinterpretq_s16_s8(vextq_s8(vcombine_s8(vtbl1_s8(vget_low_s8(vreinterpretq_s8_s16(vec.val)), vcreate_s8(0x0504030201000000)), vget_high_s8(vreinterpretq_s8_s16(vec.val))), vdupq_n_s8(0), 2)));
}
inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
#if CV_SIMD128_64F
inline v_float64x2 v_lut(const double* tab, const int* idx)
{
double CV_DECL_ALIGNED(32) elems[2] =
{
tab[idx[0]],
tab[idx[1]]
};
return v_float64x2(vld1q_f64(elems));
}
inline v_float64x2 v_lut_pairs(const double* tab, const int* idx)
{
return v_float64x2(vld1q_f64(tab + idx[0]));
}
inline v_float64x2 v_lut(const double* tab, const v_int32x4& idxvec)
{
double CV_DECL_ALIGNED(32) elems[2] =
...
@@ -993,6 +993,80 @@ inline v_float64x2 v_cvt_f64_high(const v_float32x4& a)
////////////// Lookup table access ////////////////////
inline v_int8x16 v_lut(const schar* tab, const int* idx)
{
return v_int8x16(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]], tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]],
tab[idx[8]], tab[idx[9]], tab[idx[10]], tab[idx[11]], tab[idx[12]], tab[idx[13]], tab[idx[14]], tab[idx[15]]);
}
inline v_int8x16 v_lut_pairs(const schar* tab, const int* idx)
{
return v_reinterpret_as_s8(v_int16x8(*(const short*)(tab+idx[0]), *(const short*)(tab+idx[1]), *(const short*)(tab+idx[2]), *(const short*)(tab+idx[3]),
*(const short*)(tab+idx[4]), *(const short*)(tab+idx[5]), *(const short*)(tab+idx[6]), *(const short*)(tab+idx[7])));
}
inline v_int8x16 v_lut_quads(const schar* tab, const int* idx)
{
return v_reinterpret_as_s8(v_int32x4(*(const int*)(tab+idx[0]), *(const int*)(tab+idx[1]), *(const int*)(tab+idx[2]), *(const int*)(tab+idx[3])));
}
inline v_uint8x16 v_lut(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut((const schar*)tab, idx)); }
inline v_uint8x16 v_lut_pairs(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_pairs((const schar*)tab, idx)); }
inline v_uint8x16 v_lut_quads(const uchar* tab, const int* idx) { return v_reinterpret_as_u8(v_lut_quads((const schar*)tab, idx)); }
inline v_int16x8 v_lut(const short* tab, const int* idx)
{
return v_int16x8(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]], tab[idx[4]], tab[idx[5]], tab[idx[6]], tab[idx[7]]);
}
inline v_int16x8 v_lut_pairs(const short* tab, const int* idx)
{
return v_reinterpret_as_s16(v_int32x4(*(const int*)(tab + idx[0]), *(const int*)(tab + idx[1]), *(const int*)(tab + idx[2]), *(const int*)(tab + idx[3])));
}
inline v_int16x8 v_lut_quads(const short* tab, const int* idx)
{
return v_reinterpret_as_s16(v_int64x2(*(const int64*)(tab + idx[0]), *(const int64*)(tab + idx[1])));
}
inline v_uint16x8 v_lut(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut((const short*)tab, idx)); }
inline v_uint16x8 v_lut_pairs(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_pairs((const short*)tab, idx)); }
inline v_uint16x8 v_lut_quads(const ushort* tab, const int* idx) { return v_reinterpret_as_u16(v_lut_quads((const short*)tab, idx)); }
inline v_int32x4 v_lut(const int* tab, const int* idx)
{
return v_int32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
}
inline v_int32x4 v_lut_pairs(const int* tab, const int* idx)
{
return v_reinterpret_as_s32(v_int64x2(*(const int64*)(tab + idx[0]), *(const int64*)(tab + idx[1])));
}
inline v_int32x4 v_lut_quads(const int* tab, const int* idx)
{
return v_int32x4(vsx_ld(0, tab + idx[0]));
}
inline v_uint32x4 v_lut(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut((const int*)tab, idx)); }
inline v_uint32x4 v_lut_pairs(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_pairs((const int*)tab, idx)); }
inline v_uint32x4 v_lut_quads(const unsigned* tab, const int* idx) { return v_reinterpret_as_u32(v_lut_quads((const int*)tab, idx)); }
inline v_int64x2 v_lut(const int64_t* tab, const int* idx)
{
return v_int64x2(tab[idx[0]], tab[idx[1]]);
}
inline v_int64x2 v_lut_pairs(const int64_t* tab, const int* idx)
{
return v_int64x2(vsx_ld2(0, tab + idx[0]));
}
inline v_uint64x2 v_lut(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut((const int64_t *)tab, idx)); }
inline v_uint64x2 v_lut_pairs(const uint64_t* tab, const int* idx) { return v_reinterpret_as_u64(v_lut_pairs((const int64_t *)tab, idx)); }
inline v_float32x4 v_lut(const float* tab, const int* idx)
{
return v_float32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
}
inline v_float32x4 v_lut_pairs(const float* tab, const int* idx) { return v_reinterpret_as_f32(v_lut_pairs((const int*)tab, idx)); }
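// a quad of 32-bit lanes is a full vector here as well, so lut_quads degenerates to a single v_load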
inline v_float32x4 v_lut_quads(const float* tab, const int* idx) { return v_load(tab + *idx); }
inline v_float64x2 v_lut(const double* tab, const int* idx)
{
return v_float64x2(tab[idx[0]], tab[idx[1]]);
}
inline v_float64x2 v_lut_pairs(const double* tab, const int* idx) { return v_load(tab + *idx); }
inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
{
int CV_DECL_ALIGNED(32) idx[4];
@@ -1000,6 +1074,13 @@ inline v_int32x4 v_lut(const int* tab, const v_int32x4& idxvec)
return v_int32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
}
inline v_uint32x4 v_lut(const unsigned* tab, const v_int32x4& idxvec)
{
int CV_DECL_ALIGNED(32) idx[4];
v_store_aligned(idx, idxvec);
return v_uint32x4(tab[idx[0]], tab[idx[1]], tab[idx[2]], tab[idx[3]]);
}
inline v_float32x4 v_lut(const float* tab, const v_int32x4& idxvec)
{
int CV_DECL_ALIGNED(32) idx[4];
@@ -1030,6 +1111,55 @@ inline void v_lut_deinterleave(const double* tab, const v_int32x4& idxvec, v_flo
y = v_float64x2(tab[idx[0]+1], tab[idx[1]+1]);
}
inline v_int8x16 v_interleave_pairs(const v_int8x16& vec)
{
vec_short8 vec0 = vec_mergeh((vec_short8)vec.val, (vec_short8)vec_mergesql(vec.val, vec.val));
vec0 = vec_mergeh(vec0, vec_mergesql(vec0, vec0));
return v_int8x16(vec_mergeh((vec_char16)vec0, (vec_char16)vec_mergesql(vec0, vec0)));
}
inline v_uint8x16 v_interleave_pairs(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_pairs(v_reinterpret_as_s8(vec))); }
inline v_int8x16 v_interleave_quads(const v_int8x16& vec)
{
vec_char16 vec0 = (vec_char16)vec_mergeh((vec_int4)vec.val, (vec_int4)vec_mergesql(vec.val, vec.val));
return v_int8x16(vec_mergeh(vec0, vec_mergesql(vec0, vec0)));
}
inline v_uint8x16 v_interleave_quads(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_interleave_quads(v_reinterpret_as_s8(vec))); }
inline v_int16x8 v_interleave_pairs(const v_int16x8& vec)
{
vec_short8 vec0 = (vec_short8)vec_mergeh((vec_int4)vec.val, (vec_int4)vec_mergesql(vec.val, vec.val));
return v_int16x8(vec_mergeh(vec0, vec_mergesql(vec0, vec0)));
}
inline v_uint16x8 v_interleave_pairs(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_pairs(v_reinterpret_as_s16(vec))); }
inline v_int16x8 v_interleave_quads(const v_int16x8& vec)
{
return v_int16x8(vec_mergeh(vec.val, vec_mergesql(vec.val, vec.val)));
}
inline v_uint16x8 v_interleave_quads(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_interleave_quads(v_reinterpret_as_s16(vec))); }
inline v_int32x4 v_interleave_pairs(const v_int32x4& vec)
{
return v_int32x4(vec_mergeh(vec.val, vec_mergesql(vec.val, vec.val)));
}
inline v_uint32x4 v_interleave_pairs(const v_uint32x4& vec) { return v_reinterpret_as_u32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
inline v_float32x4 v_interleave_pairs(const v_float32x4& vec) { return v_reinterpret_as_f32(v_interleave_pairs(v_reinterpret_as_s32(vec))); }
inline v_int8x16 v_pack_triplets(const v_int8x16& vec)
{
schar CV_DECL_ALIGNED(32) val[16];
v_store_aligned(val, vec);
return v_int8x16(val[0], val[1], val[2], val[4], val[5], val[6], val[8], val[9], val[10], val[12], val[13], val[14], val[15], val[15], val[15], val[15]);
}
inline v_uint8x16 v_pack_triplets(const v_uint8x16& vec) { return v_reinterpret_as_u8(v_pack_triplets(v_reinterpret_as_s8(vec))); }
inline v_int16x8 v_pack_triplets(const v_int16x8& vec)
{
short CV_DECL_ALIGNED(32) val[8];
v_store_aligned(val, vec);
return v_int16x8(val[0], val[1], val[2], val[4], val[5], val[6], val[7], val[7]);
}
inline v_uint16x8 v_pack_triplets(const v_uint16x8& vec) { return v_reinterpret_as_u16(v_pack_triplets(v_reinterpret_as_s16(vec))); }
/////// FP16 support ////////
// [TODO] implement these 2 using VSX or universal intrinsics (copy from intrin_sse.cpp and adapt)
...