Commit 595fd275 authored by Maksim Shabunin

Merge pull request #7182 from mself:two_channel_universal_intrinsics

parents d4ae7f32 9678d48e
...@@ -103,7 +103,7 @@ block and to save contents of the register to memory block. ...@@ -103,7 +103,7 @@ block and to save contents of the register to memory block.
These operations allow to reorder or recombine elements in one or multiple vectors. These operations allow to reorder or recombine elements in one or multiple vectors.
- Interleave, deinterleave (3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave - Interleave, deinterleave (2, 3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave
- Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand - Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand
- Pack: @ref v_pack, @ref v_pack_u, @ref v_rshr_pack, @ref v_rshr_pack_u, - Pack: @ref v_pack, @ref v_pack_u, @ref v_rshr_pack, @ref v_rshr_pack_u,
@ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store @ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store
...@@ -1075,12 +1075,31 @@ v_load_expand_q(const _Tp* ptr) ...@@ -1075,12 +1075,31 @@ v_load_expand_q(const _Tp* ptr)
return c; return c;
} }
/** @brief Load and deinterleave (4 channels) /** @brief Load and deinterleave (2 channels)
Load data from memory deinterleave and store to 4 registers. Load data from memory deinterleave and store to 2 registers.
Scheme: Scheme:
@code @code
{A1 B1 C1 D1 A2 B2 C2 D2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...} {A1 B1 A2 B2 ...} ==> {A1 A2 ...}, {B1 B2 ...}
@endcode
For all types except 64-bit. */
template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
                                                            v_reg<_Tp, n>& b)
{
    // Memory holds n consecutive (a, b) pairs: lane k of `a` receives element 2k,
    // lane k of `b` receives element 2k+1. In total 2*n elements are read from ptr.
    for( int lane = 0; lane < n; ++lane )
    {
        const _Tp* pair = ptr + 2*lane;
        a.s[lane] = pair[0];
        b.s[lane] = pair[1];
    }
}
/** @brief Load and deinterleave (3 channels)
Load data from memory deinterleave and store to 3 registers.
Scheme:
@code
{A1 B1 C1 A2 B2 C2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}
@endcode @endcode
For all types except 64-bit. */ For all types except 64-bit. */
template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a, template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
...@@ -1095,12 +1114,12 @@ template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_ ...@@ -1095,12 +1114,12 @@ template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_
} }
} }
/** @brief Load and deinterleave (3 channels) /** @brief Load and deinterleave (4 channels)
Load data from memory deinterleave and store to 3 registers. Load data from memory deinterleave and store to 4 registers.
Scheme: Scheme:
@code @code
{A1 B1 C1 A2 B2 C2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...} {A1 B1 C1 D1 A2 B2 C2 D2 ...} ==> {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...}
@endcode @endcode
For all types except 64-bit. */ For all types except 64-bit. */
template<typename _Tp, int n> template<typename _Tp, int n>
...@@ -1118,12 +1137,32 @@ inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a, ...@@ -1118,12 +1137,32 @@ inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
} }
} }
/** @brief Interleave and store (2 channels)

Writes the lanes of 2 registers to memory in alternating order, so that 2*n
elements are stored starting at ptr.
Scheme:
@code
{A1 A2 ...}, {B1 B2 ...} ==> {A1 B1 A2 B2 ...}
@endcode
For all types except 64-bit. */
template<typename _Tp, int n>
inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
                               const v_reg<_Tp, n>& b)
{
    // `out` walks the destination one (a, b) pair at a time.
    _Tp* out = ptr;
    for( int lane = 0; lane < n; ++lane, out += 2 )
    {
        out[0] = a.s[lane];
        out[1] = b.s[lane];
    }
}
/** @brief Interleave and store (3 channels) /** @brief Interleave and store (3 channels)
Interleave and store data from 3 registers to memory. Interleave and store data from 3 registers to memory.
Scheme: Scheme:
@code @code
{A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...}, {D1 D2 ...} ==> {A1 B1 C1 D1 A2 B2 C2 D2 ...} {A1 A2 ...}, {B1 B2 ...}, {C1 C2 ...} ==> {A1 B1 C1 A2 B2 C2 ...}
@endcode @endcode
For all types except 64-bit. */ For all types except 64-bit. */
template<typename _Tp, int n> template<typename _Tp, int n>
......
...@@ -809,6 +809,12 @@ OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32) ...@@ -809,6 +809,12 @@ OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(int32x4, s32)
OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32) OPENCV_HAL_IMPL_NEON_TRANSPOSE4x4(float32x4, f32)
#define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \ #define OPENCV_HAL_IMPL_NEON_INTERLEAVED(_Tpvec, _Tp, suffix) \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b) \
{ \
_Tpvec##x2_t v = vld2q_##suffix(ptr); \
a.val = v.val[0]; \
b.val = v.val[1]; \
} \
inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, v_##_Tpvec& c) \
{ \ { \
_Tpvec##x3_t v = vld3q_##suffix(ptr); \ _Tpvec##x3_t v = vld3q_##suffix(ptr); \
...@@ -825,6 +831,13 @@ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \ ...@@ -825,6 +831,13 @@ inline void v_load_deinterleave(const _Tp* ptr, v_##_Tpvec& a, v_##_Tpvec& b, \
c.val = v.val[2]; \ c.val = v.val[2]; \
d.val = v.val[3]; \ d.val = v.val[3]; \
} \ } \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b) \
{ \
_Tpvec##x2_t v; \
v.val[0] = a.val; \
v.val[1] = b.val; \
vst2q_##suffix(ptr, v); \
} \
inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, const v_##_Tpvec& c) \ inline void v_store_interleave( _Tp* ptr, const v_##_Tpvec& a, const v_##_Tpvec& b, const v_##_Tpvec& c) \
{ \ { \
_Tpvec##x3_t v; \ _Tpvec##x3_t v; \
......
...@@ -1374,6 +1374,18 @@ inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& ...@@ -1374,6 +1374,18 @@ inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4&
v_transpose4x4(u0, u1, u2, u3, a, b, c, d); v_transpose4x4(u0, u1, u2, u3, a, b, c, d);
} }
// 2-channel, float only
inline void v_load_deinterleave(const float* ptr, v_float32x4& a, v_float32x4& b)
{
const int mask_lo = _MM_SHUFFLE(2, 0, 2, 0), mask_hi = _MM_SHUFFLE(3, 1, 3, 1);
__m128 u0 = _mm_loadu_ps(ptr); // a0 b0 a1 b1
__m128 u1 = _mm_loadu_ps((ptr + 4)); // a2 b2 a3 b3
a.val = _mm_shuffle_ps(u0, u1, mask_lo); // a0 a1 a2 a3
b.val = _mm_shuffle_ps(u0, u1, mask_hi); // b0 b1 ab b3
}
inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b, inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
const v_uint8x16& c ) const v_uint8x16& c )
{ {
...@@ -1529,6 +1541,18 @@ inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint3 ...@@ -1529,6 +1541,18 @@ inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint3
v_store(ptr + 12, t3); v_store(ptr + 12, t3);
} }
// 2-channel, float only
inline void v_store_interleave(float* ptr, const v_float32x4& a, const v_float32x4& b)
{
// a0 a1 a2 a3 ...
// b0 b1 b2 b3 ...
__m128 u0 = _mm_unpacklo_ps(a.val, b.val); // a0 b0 a1 b1
__m128 u1 = _mm_unpackhi_ps(a.val, b.val); // a2 b2 a3 b3
_mm_storeu_ps(ptr, u0);
_mm_storeu_ps((ptr + 4), u1);
}
#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec, _Tp, suffix, _Tpuvec, _Tpu, usuffix) \ #define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec, _Tp, suffix, _Tpuvec, _Tpu, usuffix) \
inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \ inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
_Tpvec& b0, _Tpvec& c0 ) \ _Tpvec& b0, _Tpvec& c0 ) \
......
...@@ -132,6 +132,32 @@ template<typename R> struct TheTest ...@@ -132,6 +132,32 @@ template<typename R> struct TheTest
return *this; return *this;
} }
// float32x4 only
TheTest & test_interleave_2channel()
{
    // Round-trip check: store two registers interleaved, load them back
    // deinterleaved, and verify both channels are recovered unchanged.
    // data2 is offset by 20 so the two channels are distinguishable
    // (presumably Data<R> default-constructs to a fixed pattern -- confirm against Data).
    Data<R> data1, data2;
    data2 += 20;

    R a = data1, b = data2;

    // Scratch buffer sized for the lanes of both registers.
    LaneType buf2[R::nlanes * 2];

    v_store_interleave(buf2, a, b);

    // Overwrite the registers so that a load which does nothing cannot pass.
    Data<R> z(0);
    a = b = z;

    v_load_deinterleave(buf2, a, b);

    // Each comparison is between whole Data<R> objects, so a single check
    // per channel is enough; looping over lanes would only repeat the same
    // assertion (and duplicate any failure report) R::nlanes times.
    EXPECT_EQ(data1, Data<R>(a));
    EXPECT_EQ(data2, Data<R>(b));

    return *this;
}
// v_expand and v_load_expand // v_expand and v_load_expand
TheTest & test_expand() TheTest & test_expand()
{ {
...@@ -846,6 +872,7 @@ TEST(hal_intrin, float32x4) { ...@@ -846,6 +872,7 @@ TEST(hal_intrin, float32x4) {
TheTest<v_float32x4>() TheTest<v_float32x4>()
.test_loadstore() .test_loadstore()
.test_interleave() .test_interleave()
.test_interleave_2channel()
.test_addsub() .test_addsub()
.test_mul() .test_mul()
.test_div() .test_div()
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment