Commit 9678d48e authored by Matthew Self

2-channel interleaved load/store for universal intrinsics (float only)

* Added 2-channel ops to match existing 3-channel and 4-channel ops

* v_load_deinterleave() and v_store_interleave()

* Implements float32x4 only on SSE (but all types on NEON and CPP)

* Includes tests

* Will be used to vectorize 2D functions, such as estimateAffine2D()
parent 40b87070
...@@ -103,7 +103,7 @@ block and to save contents of the register to memory block. ...@@ -103,7 +103,7 @@ block and to save contents of the register to memory block.
These operations allow to reorder or recombine elements in one or multiple vectors. These operations allow to reorder or recombine elements in one or multiple vectors.
- Interleave, deinterleave (3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave - Interleave, deinterleave (2, 3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave
- Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand - Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand
- Pack: @ref v_pack, @ref v_pack_u, @ref v_rshr_pack, @ref v_rshr_pack_u, - Pack: @ref v_pack, @ref v_pack_u, @ref v_rshr_pack, @ref v_rshr_pack_u,
@ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store @ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store
...@@ -1075,12 +1075,31 @@ v_load_expand_q(const _Tp* ptr) ...@@ -1075,12 +1075,31 @@ v_load_expand_q(const _Tp* ptr)
return c; return c;
} }
/** @brief Load and deinterleave (4 channels) /** @brief Load and deinterleave (2 channels)
Load data from memory deinterleave and store to 4 registers. Load data from memory deinterleave and store to 2 registers.
Scheme: Scheme:
@code @code
{A1 B1 C1 D1 A2 B2 C2 D2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...} {A1 B1 A2 B2 ...} ==> {A1 A2 ...}, {B1 B2 ...}
@endcode
For all types except 64-bit. */
template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
                                                            v_reg<_Tp, n>& b)
{
    // Walk the input one (A, B) pair at a time; lane k of each output
    // register receives the k-th pair's corresponding element.
    for( int lane = 0; lane < n; ++lane )
    {
        const _Tp* pair = ptr + 2*lane;
        a.s[lane] = pair[0];
        b.s[lane] = pair[1];
    }
}
/** @brief Load and deinterleave (3 channels)
Load data from memory deinterleave and store to 3 registers.
Scheme:
@code
{A1 B1 C1 A2 B2 C2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}
@endcode @endcode
For all types except 64-bit. */ For all types except 64-bit. */
template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a, template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
...@@ -1095,12 +1114,12 @@ template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_ ...@@ -1095,12 +1114,12 @@ template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_
} }
} }
/** @brief Load and deinterleave (3 channels) /** @brief Load and deinterleave (4 channels)
Load data from memory deinterleave and store to 3 registers. Load data from memory deinterleave and store to 4 registers.
Scheme: Scheme:
@code @code
{A1 B1 C1 A2 B2 C2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...} {A1 B1 C1 D1 A2 B2 C2 D2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...}
@endcode @endcode
For all types except 64-bit. */ For all types except 64-bit. */
template<typename _Tp, int n> template<typename _Tp, int n>
...@@ -1118,12 +1137,32 @@ inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a, ...@@ -1118,12 +1137,32 @@ inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
} }
} }
/** @brief Interleave and store (2 channels)
Interleave and store data from 2 registers to memory.
Scheme:
@code
{A1 A2 ...}, {B1 B2 ...} ==> {A1 B1 A2 B2 ...}
@endcode
For all types except 64-bit. */
template<typename _Tp, int n>
inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
                               const v_reg<_Tp, n>& b)
{
    // Emit one (A, B) pair per lane: lane k of a and b lands at
    // ptr[2k] and ptr[2k+1] respectively.
    for( int lane = 0; lane < n; ++lane )
    {
        _Tp* pair = ptr + 2*lane;
        pair[0] = a.s[lane];
        pair[1] = b.s[lane];
    }
}
/** @brief Interleave and store (3 channels) /** @brief Interleave and store (3 channels)
Interleave and store data from 3 registers to memory. Interleave and store data from 3 registers to memory.
Scheme: Scheme:
@code @code
{A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...} ==> {A1 B1 C1 D1 A2 B2 C2 D2 ...} {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...} ==> {A1 B1 C1 A2 B2 C2 ...}
@endcode @endcode
For all types except 64-bit. */ For all types except 64-bit. */
template<typename _Tp, int n> template<typename _Tp, int n>
......
...@@ -809,6 +809,12 @@ OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32) ...@@ -809,6 +809,12 @@ OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32) OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32)
#define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \ #define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
{ \
_Tpvec##x2_t v = vld2q_##suffix(ptr); \
a.val = v.val[0]; \
b.val = v.val[1]; \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
{ \ { \
_Tpvec##x3_t v = vld3q_##suffix(ptr); \ _Tpvec##x3_t v = vld3q_##suffix(ptr); \
...@@ -825,6 +831,13 @@ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \ ...@@ -825,6 +831,13 @@ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
c.val = v.val[2]; \ c.val = v.val[2]; \
d.val = v.val[3]; \ d.val = v.val[3]; \
} \ } \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
_Tpvec##x2_t v; \
v.val[0] = a.val; \
v.val[1] = b.val; \
vst2q_##suffix(ptr, v); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, const v_##_Tpvec& c) \ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, const v_##_Tpvec& c) \
{ \ { \
_Tpvec##x3_t v; \ _Tpvec##x3_t v; \
......
...@@ -1374,6 +1374,18 @@ inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& ...@@ -1374,6 +1374,18 @@ inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4&
v_transpose4x4(u0, u1, u2, u3, a, b, c, d); v_transpose4x4(u0, u1, u2, u3, a, b, c, d);
} }
// 2-channel, float only
inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
{
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
__m128 u0 = _mm_loadu_ps(ptr); // a0 b0 a1 b1
__m128 u1 = _mm_loadu_ps((ptr + 4)); // a2 b2 a3 b3
a.val = _mm_shuffle_ps(u0, u1, mask_lo); // a0 a1 a2 a3
b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 ab b3
}
inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b, inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
const v_uint8x16& c ) const v_uint8x16& c )
{ {
...@@ -1529,6 +1541,18 @@ inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint3 ...@@ -1529,6 +1541,18 @@ inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint3
v_store(ptr + 12, t3); v_store(ptr + 12, t3);
} }
// 2-channel, float only
inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b)
{
// a0 a1 a2 a3 ...
// b0 b1 b2 b3 ...
__m128 u0 = _mm_unpacklo_ps(a.val, b.val); // a0 b0 a1 b1
__m128 u1 = _mm_unpackhi_ps(a.val, b.val); // a2 b2 a3 b3
_mm_storeu_ps(ptr, u0);
_mm_storeu_ps((ptr + 4), u1);
}
#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec, _Tp, suffix, _Tpuvec, _Tpu, usuffix) \ #define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec, _Tp, suffix, _Tpuvec, _Tpu, usuffix) \
inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \ inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
_Tpvec& b0, _Tpvec& c0 ) \ _Tpvec& b0, _Tpvec& c0 ) \
......
...@@ -132,6 +132,32 @@ template<typename R> struct TheTest ...@@ -132,6 +132,32 @@ template<typename R> struct TheTest
return *this; return *this;
} }
// float32x4 only
TheTest & test_interleave_2channel()
{
    // Round-trip check for the 2-channel interleave ops: store two registers
    // interleaved, load them back deinterleaved, and expect the originals.
    Data<R> data1, data2;
    data2 += 20; // offset the second channel so a swapped a/b is detectable

    R a = data1, b = data2;

    LaneType buf2[R::nlanes * 2];
    v_store_interleave(buf2, a, b);

    // Clear the registers so stale contents cannot satisfy the checks below.
    Data<R> z(0);
    a = b = z;

    v_load_deinterleave(buf2, a, b);

    // Each comparison covers a whole register, so one check per channel is
    // enough; the previous per-lane loop never used its index and only
    // repeated these same two checks R::nlanes times.
    // NOTE(review): presumably Data<R>::operator== compares every lane - confirm.
    EXPECT_EQ(data1, Data<R>(a));
    EXPECT_EQ(data2, Data<R>(b));

    return *this;
}
// v_expand and v_load_expand // v_expand and v_load_expand
TheTest & test_expand() TheTest & test_expand()
{ {
...@@ -846,6 +872,7 @@ TEST(hal_intrin, float32x4) { ...@@ -846,6 +872,7 @@ TEST(hal_intrin, float32x4) {
TheTest<v_float32x4>() TheTest<v_float32x4>()
.test_loadstore() .test_loadstore()
.test_interleave() .test_interleave()
.test_interleave_2channel()
.test_addsub() .test_addsub()
.test_mul() .test_mul()
.test_div() .test_div()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment