Commit 4001e310 authored by Vadim Pisarevsky

improved performance of v_load_deinterleave(8uC3) & v_store_interleave(8uC3) intrinsics when using SSSE3 instructions.
parent 9615f8c9
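
For reference, below is a minimal scalar sketch of what v_load_deinterleave(8uC3) and v_store_interleave(8uC3) compute per 16-pixel (48-byte) block. The helper names are illustrative only, not part of OpenCV; the SSSE3 path in the diff produces the same result using byte shuffles and palignr merges instead of a scalar loop.

#include <cstdint>
#include <cstddef>

// Reference model: split 48 interleaved bytes (c0 c1 c2 c0 c1 c2 ...) into
// three 16-byte channel planes.
static void deinterleave_8uC3_ref(const uint8_t* src, uint8_t a[16], uint8_t b[16], uint8_t c[16])
{
    for (size_t i = 0; i < 16; i++)
    {
        a[i] = src[i*3 + 0];
        b[i] = src[i*3 + 1];
        c[i] = src[i*3 + 2];
    }
}

// Reference model: merge three 16-byte channel planes back into 48 interleaved bytes.
static void interleave_8uC3_ref(uint8_t* dst, const uint8_t a[16], const uint8_t b[16], const uint8_t c[16])
{
    for (size_t i = 0; i < 16; i++)
    {
        dst[i*3 + 0] = a[i];
        dst[i*3 + 1] = b[i];
        dst[i*3 + 2] = c[i];
    }
}
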
@@ -1607,6 +1607,28 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
{
#if CV_SSSE3
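    // 48 interleaved bytes (r0 g0 b0 r1 g1 b1 ...) are loaded as three 16-byte
    // chunks; each chunk is shuffled so that bytes of the same channel become
    // contiguous, and the per-channel runs are then stitched together across
    // the three registers with _mm_alignr_epi8.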
    static const __m128i m0 = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 1, 4, 7, 10, 13, 2, 5, 8, 11, 14);
    static const __m128i m1 = _mm_alignr_epi8(m0, m0, 11);
    static const __m128i m2 = _mm_alignr_epi8(m0, m0, 6);
    __m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
    __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 32));
    __m128i s0 = _mm_shuffle_epi8(t0, m0);
    __m128i s1 = _mm_shuffle_epi8(t1, m1);
    __m128i s2 = _mm_shuffle_epi8(t2, m2);
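    // s0 = (r0..r5 g0..g4 b0..b4), s1 = (r6..r10 g5..g10 b5..b9),
    // s2 = (r11..r15 g11..g15 b10..b15); now merge the runs channel by channel.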
    t0 = _mm_alignr_epi8(s1, _mm_slli_si128(s0, 10), 5);
    a.val = _mm_alignr_epi8(s2, t0, 5);
    t1 = _mm_alignr_epi8(_mm_srli_si128(s1, 5), _mm_slli_si128(s0, 5), 6);
    b.val = _mm_alignr_epi8(_mm_srli_si128(s2, 5), t1, 5);
    t2 = _mm_alignr_epi8(_mm_srli_si128(s2, 10), s1, 11);
    c.val = _mm_alignr_epi8(t2, s0, 11);
#else
    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));
    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 32));
@@ -1626,6 +1648,7 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b,
    a.val = _mm_unpacklo_epi8(t30, _mm_unpackhi_epi64(t31, t31));
    b.val = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t30, t30), t32);
    c.val = _mm_unpacklo_epi8(t31, _mm_unpackhi_epi64(t32, t32));
#endif
}
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d)
@@ -1840,6 +1863,27 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x1
inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                                const v_uint8x16& c )
{
#if CV_SSSE3
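    // inverse of the deinterleave above: _mm_alignr_epi8 first collects the
    // a/b/c bytes that belong to each 16-byte output chunk, then each chunk is
    // shuffled back into interleaved (a b c a b c ...) order and stored.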
    static const __m128i m0 = _mm_setr_epi8(0, 6, 11, 1, 7, 12, 2, 8, 13, 3, 9, 14, 4, 10, 15, 5);
    static const __m128i m1 = _mm_setr_epi8(5, 11, 0, 6, 12, 1, 7, 13, 2, 8, 14, 3, 9, 15, 4, 10);
    static const __m128i m2 = _mm_setr_epi8(10, 0, 5, 11, 1, 6, 12, 2, 7, 13, 3, 8, 14, 4, 9, 15);
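    // m0, m1, m2 are the inverse permutations of the gather masks used in
    // v_load_deinterleave above.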
    __m128i t0 = _mm_alignr_epi8(b.val, _mm_slli_si128(a.val, 10), 5);
    t0 = _mm_alignr_epi8(c.val, t0, 5);
    __m128i s0 = _mm_shuffle_epi8(t0, m0);
    __m128i t1 = _mm_alignr_epi8(_mm_srli_si128(b.val, 5), _mm_slli_si128(a.val, 5), 6);
    t1 = _mm_alignr_epi8(_mm_srli_si128(c.val, 5), t1, 5);
    __m128i s1 = _mm_shuffle_epi8(t1, m1);
    __m128i t2 = _mm_alignr_epi8(_mm_srli_si128(c.val, 10), b.val, 11);
    t2 = _mm_alignr_epi8(t2, a.val, 11);
    __m128i s2 = _mm_shuffle_epi8(t2, m2);
    _mm_storeu_si128((__m128i*)ptr, s0);
    _mm_storeu_si128((__m128i*)(ptr + 16), s1);
    _mm_storeu_si128((__m128i*)(ptr + 32), s2);
#else
    __m128i z = _mm_setzero_si128();
    __m128i ab0 = _mm_unpacklo_epi8(a.val, b.val);
    __m128i ab1 = _mm_unpackhi_epi8(a.val, b.val);
@@ -1881,6 +1925,7 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x1
    _mm_storeu_si128((__m128i*)(ptr), v0);
    _mm_storeu_si128((__m128i*)(ptr + 16), v1);
    _mm_storeu_si128((__m128i*)(ptr + 32), v2);
#endif
}
inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,