Commit 4001e310 authored by Vadim Pisarevsky

improved performance of v_load_deinterleave(8uC3) & v_store_interleave(8uC3) intrinsics when using SSSE3 instructions.
parent 9615f8c9
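
For reference, below is a minimal scalar sketch of what v_load_deinterleave(8uC3) and v_store_interleave(8uC3) compute per 16-pixel (48-byte) block. The helper names are illustrative only, not part of OpenCV; the SSSE3 path in the diff produces the same result using byte shuffles and palignr merges instead of a scalar loop.

#include <cstdint>
#include <cstddef>

// Reference model: split 48 interleaved bytes (c0 c1 c2 c0 c1 c2 ...) into
// three 16-byte channel planes.
static void deinterleave_8uC3_ref(const uint8_t* src, uint8_t a[16], uint8_t b[16], uint8_t c[16])
{
    for (size_t i = 0; i < 16; i++)
    {
        a[i] = src[i*3 + 0];
        b[i] = src[i*3 + 1];
        c[i] = src[i*3 + 2];
    }
}

// Reference model: merge three 16-byte channel planes back into 48 interleaved bytes.
static void interleave_8uC3_ref(uint8_t* dst, const uint8_t a[16], const uint8_t b[16], const uint8_t c[16])
{
    for (size_t i = 0; i < 16; i++)
    {
        dst[i*3 + 0] = a[i];
        dst[i*3 + 1] = b[i];
        dst[i*3 + 2] = c[i];
    }
}
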
@@ -1607,6 +1607,28 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b)
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
{
#if CV_SSSE3
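    // 48 interleaved bytes (r0 g0 b0 r1 g1 b1 ...) are loaded as three 16-byte
    // chunks; each chunk is shuffled so that bytes of the same channel become
    // contiguous, and the per-channel runs are then stitched together across
    // the three registers with _mm_alignr_epi8.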
    static const __m128i m0 = _mm_setr_epi8(0, 3, 6, 9, 12, 15, 1, 4, 7, 10, 13, 2, 5, 8, 11, 14);
    static const __m128i m1 = _mm_alignr_epi8(m0, m0, 11);
    static const __m128i m2 = _mm_alignr_epi8(m0, m0, 6);
    __m128i t0 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t1 = _mm_loadu_si128((const __m128i*)(ptr + 16));
    __m128i t2 = _mm_loadu_si128((const __m128i*)(ptr + 32));
    __m128i s0 = _mm_shuffle_epi8(t0, m0);
    __m128i s1 = _mm_shuffle_epi8(t1, m1);
    __m128i s2 = _mm_shuffle_epi8(t2, m2);
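    // s0 = (r0..r5 g0..g4 b0..b4), s1 = (r6..r10 g5..g10 b5..b9),
    // s2 = (r11..r15 g11..g15 b10..b15); now merge the runs channel by channel.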
    t0 = _mm_alignr_epi8(s1, _mm_slli_si128(s0, 10), 5);
    a.val = _mm_alignr_epi8(s2, t0, 5);
    t1 = _mm_alignr_epi8(_mm_srli_si128(s1, 5), _mm_slli_si128(s0, 5), 6);
    b.val = _mm_alignr_epi8(_mm_srli_si128(s2, 5), t1, 5);
    t2 = _mm_alignr_epi8(_mm_srli_si128(s2, 10), s1, 11);
    c.val = _mm_alignr_epi8(t2, s0, 11);
#else
    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));
    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 32));
@@ -1626,6 +1648,7 @@ inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b,
    a.val = _mm_unpacklo_epi8(t30, _mm_unpackhi_epi64(t31, t31));
    b.val = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t30, t30), t32);
    c.val = _mm_unpacklo_epi8(t31, _mm_unpackhi_epi64(t32, t32));
#endif
}
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d)
@@ -1840,6 +1863,27 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x1
inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                                const v_uint8x16& c )
{
#if CV_SSSE3
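    // inverse of the deinterleave above: _mm_alignr_epi8 first collects the
    // a/b/c bytes that belong to each 16-byte output chunk, then each chunk is
    // shuffled back into interleaved (a b c a b c ...) order and stored.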
    static const __m128i m0 = _mm_setr_epi8(0, 6, 11, 1, 7, 12, 2, 8, 13, 3, 9, 14, 4, 10, 15, 5);
    static const __m128i m1 = _mm_setr_epi8(5, 11, 0, 6, 12, 1, 7, 13, 2, 8, 14, 3, 9, 15, 4, 10);
    static const __m128i m2 = _mm_setr_epi8(10, 0, 5, 11, 1, 6, 12, 2, 7, 13, 3, 8, 14, 4, 9, 15);
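    // m0, m1, m2 are the inverse permutations of the gather masks used in
    // v_load_deinterleave above.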
    __m128i t0 = _mm_alignr_epi8(b.val, _mm_slli_si128(a.val, 10), 5);
    t0 = _mm_alignr_epi8(c.val, t0, 5);
    __m128i s0 = _mm_shuffle_epi8(t0, m0);
    __m128i t1 = _mm_alignr_epi8(_mm_srli_si128(b.val, 5), _mm_slli_si128(a.val, 5), 6);
    t1 = _mm_alignr_epi8(_mm_srli_si128(c.val, 5), t1, 5);
    __m128i s1 = _mm_shuffle_epi8(t1, m1);
    __m128i t2 = _mm_alignr_epi8(_mm_srli_si128(c.val, 10), b.val, 11);
    t2 = _mm_alignr_epi8(t2, a.val, 11);
    __m128i s2 = _mm_shuffle_epi8(t2, m2);
    _mm_storeu_si128((__m128i*)ptr, s0);
    _mm_storeu_si128((__m128i*)(ptr + 16), s1);
    _mm_storeu_si128((__m128i*)(ptr + 32), s2);
#else
    __m128i z = _mm_setzero_si128();
    __m128i ab0 = _mm_unpacklo_epi8(a.val, b.val);
    __m128i ab1 = _mm_unpackhi_epi8(a.val, b.val);
@@ -1881,6 +1925,7 @@ inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x1
    _mm_storeu_si128((__m128i*)(ptr), v0);
    _mm_storeu_si128((__m128i*)(ptr + 16), v1);
    _mm_storeu_si128((__m128i*)(ptr + 32), v2);
#endif
}
inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,