Merge pull request #12876 from savuor:color_rgb2rgb_wide

* RGB2RGB initially rewritten
* NEON impl removed
* templated version added for ushort, float
* data copying allowed for RGB2RGB
* inplace processing fixed
* fields to local vars
* no zeroupper until it's fixed
* vx_cleanup() added back

Merge pull request #12876 from savuor:color_rgb2rgb_wide
* RGB2RGB initially rewritten
* NEON impl removed
* templated version added for ushort, float
* data copying allowed for RGB2RGB
* inplace processing fixed
* fields to local vars
* no zeroupper until it's fixed
* vx_cleanup() added back
fa91d621 · Rostislav Vasilikhin · Alexander Alekhin · 2268ed1b · fa91d621
Commit fa91d621 authored Oct 30, 2018 by Rostislav Vasilikhin Committed by Alexander Alekhin Oct 30, 2018
Hide whitespace changes
Inline Side-by-side

Showing with 77 additions and 142 deletions

color_rgb.cpp modules/imgproc/src/color_rgb.cpp +77 -142

No files found.
--- a/modules/imgproc/src/color_rgb.cpp
+++ b/modules/imgproc/src/color_rgb.cpp
@@ -12,183 +12,118 @@ namespace cv
 ////////////////// Various 3/4-channel to 3/4-channel RGB transformations /////////////////
-template<typename _Tp> struct RGB2RGB
+template<typename _Tp> struct v_type;
-{
-    typedef _Tp channel_type;
-    RGB2RGB(int _srccn, int _dstcn, int _blueIdx) : srccn(_srccn), dstcn(_dstcn), blueIdx(_blueIdx) {}
+template<>
-    void operator()(const _Tp* src, _Tp* dst, int n) const
+struct v_type<uchar>{
+    typedef v_uint8 t;
+};
+template<>
+struct v_type<ushort>{
+    typedef v_uint16 t;
+};
+template<>
+struct v_type<float>{
+    typedef v_float32 t;
+};
+template<typename _Tp> struct v_set;
+template<>
+struct v_set<uchar>
+{
+    static inline v_type<uchar>::t set(uchar x)
    {
-        int scn = srccn, dcn = dstcn, bidx = blueIdx;
+        return vx_setall_u8(x);
-        if( dcn == 3 )
-        {
-            n *= 3;
-            for( int i = 0; i < n; i += 3, src += scn )
-            {
-                _Tp t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
-                dst[i] = t0; dst[i+1] = t1; dst[i+2] = t2;
-            }
-        }
-        else if( scn == 3 )
-        {
-            n *= 3;
-            _Tp alpha = ColorChannel<_Tp>::max();
-            for( int i = 0; i < n; i += 3, dst += 4 )
-            {
-                _Tp t0 = src[i], t1 = src[i+1], t2 = src[i+2];
-                dst[bidx] = t0; dst[1] = t1; dst[bidx^2] = t2; dst[3] = alpha;
-            }
-        }
-        else
-        {
-            n *= 4;
-            for( int i = 0; i < n; i += 4 )
-            {
-                _Tp t0 = src[i], t1 = src[i+1], t2 = src[i+2], t3 = src[i+3];
-                dst[i+bidx] = t0; dst[i+1] = t1; dst[i+(bidx^2)] = t2; dst[i+3] = t3;
-            }
-        }
    }
+};
-    int srccn, dstcn, blueIdx;
+template<>
+struct v_set<ushort>
+{
+    static inline v_type<ushort>::t set(ushort x)
+    {
+        return vx_setall_u16(x);
+    }
 };
-#if CV_NEON
+template<>
+struct v_set<float>
+{
+    static inline v_type<float>::t set(float x)
+    {
+        return vx_setall_f32(x);
+    }
+};
-template<> struct RGB2RGB<uchar>
+template<typename _Tp>
+struct RGB2RGB
 {
-    typedef uchar channel_type;
+    typedef _Tp channel_type;
+    typedef typename v_type<_Tp>::t vt;
    RGB2RGB(int _srccn, int _dstcn, int _blueIdx) :
        srccn(_srccn), dstcn(_dstcn), blueIdx(_blueIdx)
    {
-        v_alpha = vdupq_n_u8(ColorChannel<uchar>::max());
+        CV_Assert(srccn == 3 || srccn == 4);
-        v_alpha2 = vget_low_u8(v_alpha);
+        CV_Assert(dstcn == 3 || dstcn == 4);
    }
-    void operator()(const uchar * src, uchar * dst, int n) const
+    void operator()(const _Tp* src, _Tp* dst, int n) const
    {
-        int scn = srccn, dcn = dstcn, bidx = blueIdx, i = 0;
+        int scn = srccn, dcn = dstcn, bi = blueIdx;
-        if (dcn == 3)
+        int i = 0;
+        _Tp alphav = ColorChannel<_Tp>::max();
+#if CV_SIMD
+        const int vsize = vt::nlanes;
+        for(; i < n-vsize+1;
+            i += vsize, src += vsize*scn, dst += vsize*dcn)
        {
-            n *= 3;
+            vt a, b, c, d;
-            if (scn == 3)
+            if(scn == 4)
            {
-                for ( ; i <= n - 48; i += 48, src += 48 )
+                v_load_deinterleave(src, a, b, c, d);
-                {
-                    uint8x16x3_t v_src = vld3q_u8(src), v_dst;
-                    v_dst.val[0] = v_src.val[bidx];
-                    v_dst.val[1] = v_src.val[1];
-                    v_dst.val[2] = v_src.val[bidx ^ 2];
-                    vst3q_u8(dst + i, v_dst);
-                }
-                for ( ; i <= n - 24; i += 24, src += 24 )
-                {
-                    uint8x8x3_t v_src = vld3_u8(src), v_dst;
-                    v_dst.val[0] = v_src.val[bidx];
-                    v_dst.val[1] = v_src.val[1];
-                    v_dst.val[2] = v_src.val[bidx ^ 2];
-                    vst3_u8(dst + i, v_dst);
-                }
-                for ( ; i < n; i += 3, src += 3 )
-                {
-                    uchar t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
-                    dst[i] = t0; dst[i+1] = t1; dst[i+2] = t2;
-                }
            }
            else
            {
-                for ( ; i <= n - 48; i += 48, src += 64 )
+                v_load_deinterleave(src, a, b, c);
-                {
+                d = v_set<_Tp>::set(alphav);
-                    uint8x16x4_t v_src = vld4q_u8(src);
-                    uint8x16x3_t v_dst;
-                    v_dst.val[0] = v_src.val[bidx];
-                    v_dst.val[1] = v_src.val[1];
-                    v_dst.val[2] = v_src.val[bidx ^ 2];
-                    vst3q_u8(dst + i, v_dst);
-                }
-                for ( ; i <= n - 24; i += 24, src += 32 )
-                {
-                    uint8x8x4_t v_src = vld4_u8(src);
-                    uint8x8x3_t v_dst;
-                    v_dst.val[0] = v_src.val[bidx];
-                    v_dst.val[1] = v_src.val[1];
-                    v_dst.val[2] = v_src.val[bidx ^ 2];
-                    vst3_u8(dst + i, v_dst);
-                }
-                for ( ; i < n; i += 3, src += 4 )
-                {
-                    uchar t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
-                    dst[i] = t0; dst[i+1] = t1; dst[i+2] = t2;
-                }
-            }
-        }
-        else if (scn == 3)
-        {
-            n *= 3;
-            for ( ; i <= n - 48; i += 48, dst += 64 )
-            {
-                uint8x16x3_t v_src = vld3q_u8(src + i);
-                uint8x16x4_t v_dst;
-                v_dst.val[bidx] = v_src.val[0];
-                v_dst.val[1] = v_src.val[1];
-                v_dst.val[bidx ^ 2] = v_src.val[2];
-                v_dst.val[3] = v_alpha;
-                vst4q_u8(dst, v_dst);
            }
-            for ( ; i <= n - 24; i += 24, dst += 32 )
+            if(bi == 2)
+                swap(a, c);
+            if(dcn == 4)
            {
-                uint8x8x3_t v_src = vld3_u8(src + i);
+                v_store_interleave(dst, a, b, c, d);
-                uint8x8x4_t v_dst;
-                v_dst.val[bidx] = v_src.val[0];
-                v_dst.val[1] = v_src.val[1];
-                v_dst.val[bidx ^ 2] = v_src.val[2];
-                v_dst.val[3] = v_alpha2;
-                vst4_u8(dst, v_dst);
            }
-            uchar alpha = ColorChannel<uchar>::max();
+            else
-            for (; i < n; i += 3, dst += 4 )
            {
-                uchar t0 = src[i], t1 = src[i+1], t2 = src[i+2];
+                v_store_interleave(dst, a, b, c);
-                dst[bidx] = t0; dst[1] = t1; dst[bidx^2] = t2; dst[3] = alpha;
            }
        }
-        else
+        vx_cleanup();
+#endif
+        for ( ; i < n; i++, src += scn, dst += dcn )
        {
-            n *= 4;
+            _Tp t0 = src[0], t1 = src[1], t2 = src[2];
-            for ( ; i <= n - 64; i += 64 )
+            dst[bi  ] = t0;
-            {
+            dst[1]         = t1;
-                uint8x16x4_t v_src = vld4q_u8(src + i), v_dst;
+            dst[bi^2] = t2;
-                v_dst.val[0] = v_src.val[bidx];
+            if(dcn == 4)
-                v_dst.val[1] = v_src.val[1];
-                v_dst.val[2] = v_src.val[bidx^2];
-                v_dst.val[3] = v_src.val[3];
-                vst4q_u8(dst + i, v_dst);
-            }
-            for ( ; i <= n - 32; i += 32 )
            {
-                uint8x8x4_t v_src = vld4_u8(src + i), v_dst;
+                _Tp d = scn == 4 ? src[3] : alphav;
-                v_dst.val[0] = v_src.val[bidx];
+                dst[3] = d;
-                v_dst.val[1] = v_src.val[1];
-                v_dst.val[2] = v_src.val[bidx^2];
-                v_dst.val[3] = v_src.val[3];
-                vst4_u8(dst + i, v_dst);
-            }
-            for ( ; i < n; i += 4)
-            {
-                uchar t0 = src[i], t1 = src[i+1], t2 = src[i+2], t3 = src[i+3];
-                dst[i+bidx] = t0; dst[i+1] = t1; dst[i+(bidx^2)] = t2; dst[i+3] = t3;
            }
        }
    }
    int srccn, dstcn, blueIdx;
-    uint8x16_t v_alpha;
-    uint8x8_t v_alpha2;
 };
-#endif
 /////////// Transforming 16-bit (565 or 555) RGB to/from 24/32-bit (888[8]) RGB //////////