fixed cvtColorGPU on linux

fea66d93 · Andrey Morozov · 2a0909ac · fea66d93
Commit fea66d93 authored Aug 30, 2010 by Andrey Morozov
Hide whitespace changes
Inline Side-by-side

Showing with 371 additions and 372 deletions

color.cu modules/gpu/src/cuda/color.cu +371 -372

No files found.
--- a/modules/gpu/src/cuda/color.cu
+++ b/modules/gpu/src/cuda/color.cu
@@ -51,7 +51,7 @@ using namespace cv::gpu::impl;
 #endif

 namespace imgproc
-{    
+{
    template<typename _Tp> struct ColorChannel
    {
    };
@@ -65,7 +65,7 @@ namespace imgproc
        static __device__ unsigned char half() { return (unsigned char)(max()/2 + 1); }
    };

-    template<> struct ColorChannel<ushort>
+    template<> struct ColorChannel<unsigned short>
    {
        typedef float worktype_f;
        typedef ushort3 vec3_t;
@@ -89,24 +89,24 @@ namespace imgproc
 namespace imgproc
 {
    template <typename T>
-	__global__ void RGB2RGB_3_3(const T* src_, size_t src_step, T* dst_, size_t dst_step, int rows, int cols, int bidx)
-	{
-		const int x = blockDim.x * blockIdx.x + threadIdx.x;
-		const int y = blockDim.y * blockIdx.y + threadIdx.y;
+        __global__ void RGB2RGB_3_3(const T* src_, size_t src_step, T* dst_, size_t dst_step, int rows, int cols, int bidx)
+        {
+                const int x = blockDim.x * blockIdx.x + threadIdx.x;
+                const int y = blockDim.y * blockIdx.y + threadIdx.y;

        if (y < rows && x < cols)
        {
            const T* src = src_ + y * src_step + x * 3;
            T* dst = dst_ + y * dst_step + x * 3;
-						
+
            T t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
            dst[0] = t0; dst[1] = t1; dst[2] = t2;
        }
-	}
+        }

    template <typename T>
-	__global__ void RGB2RGB_4_3(const T* src_, size_t src_step, T* dst_, size_t dst_step, int rows, int cols, int bidx)
-	{
+        __global__ void RGB2RGB_4_3(const T* src_, size_t src_step, T* dst_, size_t dst_step, int rows, int cols, int bidx)
+        {
        typedef typename ColorChannel<T>::vec4_t vec4_t;

 		const int x = blockDim.x * blockIdx.x + threadIdx.x;
@@ -116,15 +116,15 @@ namespace imgproc
        {
            vec4_t src = *(vec4_t*)(src_ + y * src_step + (x << 2));
            T* dst = dst_ + y * dst_step + x * 3;
-						
+
            T t0 = ((T*)(&src))[bidx], t1 = src.y, t2 = ((T*)(&src))[bidx ^ 2];
            dst[0] = t0; dst[1] = t1; dst[2] = t2;
        }
-	}
+        }

    template <typename T>
-	__global__ void RGB2RGB_3_4(const T* src_, size_t src_step, T* dst_, size_t dst_step, int rows, int cols, int bidx)
-	{
+        __global__ void RGB2RGB_3_4(const T* src_, size_t src_step, T* dst_, size_t dst_step, int rows, int cols, int bidx)
+        {
        typedef typename ColorChannel<T>::vec4_t vec4_t;

 		const int x = blockDim.x * blockIdx.x + threadIdx.x;
@@ -135,18 +135,18 @@ namespace imgproc
            const T* src = src_ + y * src_step + x * 3;

            vec4_t dst;
-						
+
            dst.x = src[bidx];
            dst.y = src[1];
            dst.z = src[bidx ^ 2];
            dst.w = ColorChannel<T>::max();
            *(vec4_t*)(dst_ + y * dst_step + (x << 2)) = dst;
        }
-	}
+        }

    template <typename T>
-	__global__ void RGB2RGB_4_4(const T* src_, size_t src_step, T* dst_, size_t dst_step, int rows, int cols, int bidx)
-	{
+        __global__ void RGB2RGB_4_4(const T* src_, size_t src_step, T* dst_, size_t dst_step, int rows, int cols, int bidx)
+        {
        typedef typename ColorChannel<T>::vec4_t vec4_t;

 		const int x = blockDim.x * blockIdx.x + threadIdx.x;
@@ -164,14 +164,14 @@ namespace imgproc

            *(vec4_t*)(dst_ + y * dst_step + (x << 2)) = dst;
        }
-	}
+        }
 }

 namespace cv { namespace gpu { namespace impl
 {
    template <typename T>
    void RGB2RGB_caller(const DevMem2D_<T>& src, int srccn, const DevMem2D_<T>& dst, int dstcn, int bidx, cudaStream_t stream)
-    {        
+    {
        dim3 threads(32, 8, 1);
        dim3 grid(1, 1, 1);

@@ -180,15 +180,15 @@ namespace cv { namespace gpu { namespace impl

        switch (dstcn)
        {
-        case 3: 
+        case 3:
            switch (srccn)
            {
            case 3:
-                imgproc::RGB2RGB_3_3<<<grid, threads, 0, stream>>>(src.ptr, src.step / sizeof(T), dst.ptr, dst.step / sizeof(T), 
+                imgproc::RGB2RGB_3_3<<<grid, threads, 0, stream>>>(src.ptr, src.step / sizeof(T), dst.ptr, dst.step / sizeof(T),
                                                                          src.rows, src.cols, bidx);
                break;
            case 4:
-                imgproc::RGB2RGB_4_3<<<grid, threads, 0, stream>>>(src.ptr, src.step / sizeof(T), dst.ptr, dst.step / sizeof(T), 
+                imgproc::RGB2RGB_4_3<<<grid, threads, 0, stream>>>(src.ptr, src.step / sizeof(T), dst.ptr, dst.step / sizeof(T),
                                                                          src.rows, src.cols, bidx);
                break;
            default:
@@ -196,15 +196,15 @@ namespace cv { namespace gpu { namespace impl
                break;
            }
            break;
-        case 4: 
+        case 4:
            switch (srccn)
            {
            case 3:
-                imgproc::RGB2RGB_3_4<<<grid, threads, 0, stream>>>(src.ptr, src.step / sizeof(T), dst.ptr, dst.step / sizeof(T), 
+                imgproc::RGB2RGB_3_4<<<grid, threads, 0, stream>>>(src.ptr, src.step / sizeof(T), dst.ptr, dst.step / sizeof(T),
                                                                          src.rows, src.cols, bidx);
                break;
            case 4:
-                imgproc::RGB2RGB_4_4<<<grid, threads, 0, stream>>>(src.ptr, src.step / sizeof(T), dst.ptr, dst.step / sizeof(T), 
+                imgproc::RGB2RGB_4_4<<<grid, threads, 0, stream>>>(src.ptr, src.step / sizeof(T), dst.ptr, dst.step / sizeof(T),
                                                                          src.rows, src.cols, bidx);
                break;
            default:
@@ -226,7 +226,7 @@ namespace cv { namespace gpu { namespace impl
        RGB2RGB_caller(src, srccn, dst, dstcn, bidx, stream);
    }

-    void RGB2RGB_gpu(const DevMem2D_<ushort>& src, int srccn, const DevMem2D_<ushort>& dst, int dstcn, int bidx, cudaStream_t stream)
+    void RGB2RGB_gpu(const DevMem2D_<unsigned short>& src, int srccn, const DevMem2D_<unsigned short>& dst, int dstcn, int bidx, cudaStream_t stream)
    {
        RGB2RGB_caller(src, srccn, dst, dstcn, bidx, stream);
    }
@@ -236,7 +236,7 @@ namespace cv { namespace gpu { namespace impl
        RGB2RGB_caller(src, srccn, dst, dstcn, bidx, stream);
    }
 }}}
-    
+
 /////////// Transforming 16-bit (565 or 555) RGB to/from 24/32-bit (888[8]) RGB //////////

 //namespace imgproc
@@ -244,17 +244,17 @@ namespace cv { namespace gpu { namespace impl
 //    struct RGB5x52RGB
 //    {
 //        typedef uchar channel_type;
-//        
+//
 //        RGB5x52RGB(int _dstcn, int _blueIdx, int _greenBits)
 //		    : dstcn(_dstcn), blueIdx(_blueIdx), greenBits(_greenBits) {}
-//    		
+//
 //        void operator()(const uchar* src, uchar* dst, int n) const
 //        {
 //            int dcn = dstcn, bidx = blueIdx;
 //            if( greenBits == 6 )
 //                for( int i = 0; i < n; i++, dst += dcn )
 //                {
-//                    unsigned t = ((const ushort*)src)[i];
+//                    unsigned t = ((const unsigned short*)src)[i];
 //                    dst[bidx] = (uchar)(t << 3);
 //                    dst[1] = (uchar)((t >> 3) & ~3);
 //                    dst[bidx ^ 2] = (uchar)((t >> 8) & ~7);
@@ -264,7 +264,7 @@ namespace cv { namespace gpu { namespace impl
 //            else
 //                for( int i = 0; i < n; i++, dst += dcn )
 //                {
-//                    unsigned t = ((const ushort*)src)[i];
+//                    unsigned t = ((const unsigned short*)src)[i];
 //                    dst[bidx] = (uchar)(t << 3);
 //                    dst[1] = (uchar)((t >> 2) & ~7);
 //                    dst[bidx ^ 2] = (uchar)((t >> 7) & ~7);
@@ -272,39 +272,39 @@ namespace cv { namespace gpu { namespace impl
 //                        dst[3] = t & 0x8000 ? 255 : 0;
 //                }
 //        }
-//        
+//
 //        int dstcn, blueIdx, greenBits;
 //    };
 //
-//        
+//
 //    struct RGB2RGB5x5
 //    {
 //        typedef uchar channel_type;
-//        
+//
 //        RGB2RGB5x5(int _srccn, int _blueIdx, int _greenBits)
 //		    : srccn(_srccn), blueIdx(_blueIdx), greenBits(_greenBits) {}
-//    		
+//
 //        void operator()(const uchar* src, uchar* dst, int n) const
 //        {
 //            int scn = srccn, bidx = blueIdx;
 //            if( greenBits == 6 )
 //                for( int i = 0; i < n; i++, src += scn )
 //                {
-//                    ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~3) << 3)|((src[bidx^2]&~7) << 8));
+//                    ((unsigned short*)dst)[i] = (unsigned short)((src[bidx] >> 3)|((src[1]&~3) << 3)|((src[bidx^2]&~7) << 8));
 //                }
 //            else if( scn == 3 )
 //                for( int i = 0; i < n; i++, src += 3 )
 //                {
-//                    ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~7) << 2)|((src[bidx^2]&~7) << 7));
+//                    ((unsigned short*)dst)[i] = (unsigned short)((src[bidx] >> 3)|((src[1]&~7) << 2)|((src[bidx^2]&~7) << 7));
 //                }
 //            else
 //                for( int i = 0; i < n; i++, src += 4 )
 //                {
-//                    ((ushort*)dst)[i] = (ushort)((src[bidx] >> 3)|((src[1]&~7) << 2)|
+//                    ((unsigned short*)dst)[i] = (unsigned short)((src[bidx] >> 3)|((src[1]&~7) << 2)|
 //                        ((src[bidx^2]&~7) << 7)|(src[3] ? 0x8000 : 0));
 //                }
 //        }
-//        
+//
 //        int srccn, blueIdx, greenBits;
 //    };
 //}
@@ -320,8 +320,8 @@ namespace imgproc
    template <typename T>
    __global__ void Gray2RGB_3(const T* src_, size_t src_step, T* dst_, size_t dst_step, int rows, int cols)
    {
-		const int x = blockDim.x * blockIdx.x + threadIdx.x;
-		const int y = blockDim.y * blockIdx.y + threadIdx.y;
+                const int x = blockDim.x * blockIdx.x + threadIdx.x;
+                const int y = blockDim.y * blockIdx.y + threadIdx.y;

        if (y < rows && x < cols)
        {
@@ -356,7 +356,7 @@ namespace imgproc
    //struct Gray2RGB5x5
    //{
    //    typedef uchar channel_type;
-    //    
+    //
    //    Gray2RGB5x5(int _greenBits) : greenBits(_greenBits) {}
    //    void operator()(const uchar* src, uchar* dst, int n) const
    //    {
@@ -364,13 +364,13 @@ namespace imgproc
    //            for( int i = 0; i < n; i++ )
    //            {
    //                int t = src[i];
-    //                ((ushort*)dst)[i] = (ushort)((t >> 3)|((t & ~3) << 3)|((t & ~7) << 8));
+    //                ((unsigned short*)dst)[i] = (unsigned short)((t >> 3)|((t & ~3) << 3)|((t & ~7) << 8));
    //            }
    //        else
    //            for( int i = 0; i < n; i++ )
    //            {
    //                int t = src[i] >> 3;
-    //                ((ushort*)dst)[i] = (ushort)(t|(t << 5)|(t << 10));
+    //                ((unsigned short*)dst)[i] = (unsigned short)(t|(t << 5)|(t << 10));
    //            }
    //    }
    //    int greenBits;
@@ -410,7 +410,7 @@ namespace cv { namespace gpu { namespace impl
        Gray2RGB_caller(src, dst, dstcn, stream);
    }

-    void Gray2RGB_gpu(const DevMem2D_<ushort>& src, const DevMem2D_<ushort>& dst, int dstcn, cudaStream_t stream)
+    void Gray2RGB_gpu(const DevMem2D_<unsigned short>& src, const DevMem2D_<unsigned short>& dst, int dstcn, cudaStream_t stream)
    {
        Gray2RGB_caller(src, dst, dstcn, stream);
    }
@@ -420,7 +420,7 @@ namespace cv { namespace gpu { namespace impl
        Gray2RGB_caller(src, dst, dstcn, stream);
    }
 }}}
-    
+
 ///////////////////////////////// Color to Grayscale ////////////////////////////////

 namespace imgproc
@@ -428,7 +428,7 @@ namespace imgproc
    //#undef R2Y
    //#undef G2Y
    //#undef B2Y
-    //    
+    //
    //enum
    //{
    //    yuv_shift = 14,
@@ -442,14 +442,14 @@ namespace imgproc
    //struct RGB5x52Gray
    //{
    //    typedef uchar channel_type;
-    //    
+    //
    //    RGB5x52Gray(int _greenBits) : greenBits(_greenBits) {}
    //    void operator()(const uchar* src, uchar* dst, int n) const
    //    {
    //        if( greenBits == 6 )
    //            for( int i = 0; i < n; i++ )
    //            {
-    //                int t = ((ushort*)src)[i];
+    //                int t = ((unsigned short*)src)[i];
    //                dst[i] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y +
    //                                           ((t >> 3) & 0xfc)*G2Y +
    //                                           ((t >> 8) & 0xf8)*R2Y, yuv_shift);
@@ -457,7 +457,7 @@ namespace imgproc
    //        else
    //            for( int i = 0; i < n; i++ )
    //            {
-    //                int t = ((ushort*)src)[i];
+    //                int t = ((unsigned short*)src)[i];
    //                dst[i] = (uchar)CV_DESCALE(((t << 3) & 0xf8)*B2Y +
    //                                           ((t >> 2) & 0xf8)*G2Y +
    //                                           ((t >> 7) & 0xf8)*R2Y, yuv_shift);
@@ -472,28 +472,28 @@ namespace imgproc
        const int cg = 9617;
        const int cb = 1868;
        const int yuv_shift = 14;
-        
+
 		const int x = (blockDim.x * blockIdx.x + threadIdx.x) << 2;
 		const int y = blockDim.y * blockIdx.y + threadIdx.y;

        if (y < rows && x < cols)
        {
            const uchar* src = src_ + y * src_step + x * 3;
-						
+
            uchar t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];

            uchar4 dst;
            dst.x = (uchar)CV_DESCALE((unsigned)(t0 * cb + t1 * cg + t2 * cr), yuv_shift);

-            src += 3;    						
+            src += 3;
            t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
            dst.y = (uchar)CV_DESCALE((unsigned)(t0 * cb + t1 * cg + t2 * cr), yuv_shift);

-            src += 3;    						
+            src += 3;
            t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
            dst.z = (uchar)CV_DESCALE((unsigned)(t0 * cb + t1 * cg + t2 * cr), yuv_shift);

-            src += 3;    						
+            src += 3;
            t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
            dst.w = (uchar)CV_DESCALE((unsigned)(t0 * cb + t1 * cg + t2 * cr), yuv_shift);

@@ -501,28 +501,28 @@ namespace imgproc
        }
    }

-    __global__ void RGB2Gray_3(const ushort* src_, size_t src_step, ushort* dst_, size_t dst_step, int rows, int cols, int bidx)
+    __global__ void RGB2Gray_3(const unsigned short* src_, size_t src_step, unsigned short* dst_, size_t dst_step, int rows, int cols, int bidx)
    {
        const int cr = 4899;
        const int cg = 9617;
        const int cb = 1868;
        const int yuv_shift = 14;
-        
+
 		const int x = (blockDim.x * blockIdx.x + threadIdx.x) << 1;
 		const int y = blockDim.y * blockIdx.y + threadIdx.y;

        if (y < rows && x < cols)
        {
-            const ushort* src = src_ + y * src_step + x * 3;
-						
-            ushort t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
+            const unsigned short* src = src_ + y * src_step + x * 3;
+
+            unsigned short t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];

            ushort2 dst;
-            dst.x = (ushort)CV_DESCALE((unsigned)(t0 * cb + t1 * cg + t2 * cr), yuv_shift);
+            dst.x = (unsigned short)CV_DESCALE((unsigned)(t0 * cb + t1 * cg + t2 * cr), yuv_shift);

-            src += 3;    						
+            src += 3;
            t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
-            dst.y = (ushort)CV_DESCALE((unsigned)(t0 * cb + t1 * cg + t2 * cr), yuv_shift);
+            dst.y = (unsigned short)CV_DESCALE((unsigned)(t0 * cb + t1 * cg + t2 * cr), yuv_shift);

            *(ushort2*)(dst_ + y * dst_step + x) = dst;
        }
@@ -533,14 +533,14 @@ namespace imgproc
        const float cr = 0.299f;
        const float cg = 0.587f;
        const float cb = 0.114f;
-        
+
 		const int x = blockDim.x * blockIdx.x + threadIdx.x;
 		const int y = blockDim.y * blockIdx.y + threadIdx.y;

        if (y < rows && x < cols)
        {
            const float* src = src_ + y * src_step + x * 3;
-						
+
            float t0 = src[bidx], t1 = src[1], t2 = src[bidx ^ 2];
            *(dst_ + y * dst_step + x) = t0 * cb + t1 * cg + t2 * cr;
        }
@@ -552,14 +552,14 @@ namespace imgproc
        const int cg = 9617;
        const int cb = 1868;
        const int yuv_shift = 14;
-        
+
 		const int x = (blockDim.x * blockIdx.x + threadIdx.x) << 2;
 		const int y = blockDim.y * blockIdx.y + threadIdx.y;

        if (y < rows && x < cols)
        {
            uchar4 src = *(uchar4*)(src_ + y * src_step + (x << 2));
-						
+
            uchar t0 = ((uchar*)(&src))[bidx], t1 = src.y, t2 = ((uchar*)(&src))[bidx ^ 2];

            uchar4 dst;
@@ -581,28 +581,28 @@ namespace imgproc
        }
    }

-    __global__ void RGB2Gray_4(const ushort* src_, size_t src_step, ushort* dst_, size_t dst_step, int rows, int cols, int bidx)
+    __global__ void RGB2Gray_4(const unsigned short* src_, size_t src_step, unsigned short* dst_, size_t dst_step, int rows, int cols, int bidx)
    {
        const int cr = 4899;
        const int cg = 9617;
        const int cb = 1868;
        const int yuv_shift = 14;
-        
+
 		const int x = (blockDim.x * blockIdx.x + threadIdx.x) << 1;
 		const int y = blockDim.y * blockIdx.y + threadIdx.y;

        if (y < rows && x < cols)
        {
            ushort4 src = *(ushort4*)(src_ + y * src_step + (x << 2));
-						
-            ushort t0 = ((ushort*)(&src))[bidx], t1 = src.y, t2 = ((ushort*)(&src))[bidx ^ 2];
+
+            unsigned short t0 = ((unsigned short*)(&src))[bidx], t1 = src.y, t2 = ((unsigned short*)(&src))[bidx ^ 2];

            ushort2 dst;
-            dst.x = (ushort)CV_DESCALE((unsigned)(t0 * cb + t1 * cg + t2 * cr), yuv_shift);
+            dst.x = (unsigned short)CV_DESCALE((unsigned)(t0 * cb + t1 * cg + t2 * cr), yuv_shift);

            src = *(ushort4*)(src_ + y * src_step + (x << 2) + 4);
-            t0 = ((ushort*)(&src))[bidx], t1 = src.y, t2 = ((ushort*)(&src))[bidx ^ 2];
-            dst.y = (ushort)CV_DESCALE((unsigned)(t0 * cb + t1 * cg + t2 * cr), yuv_shift);
+            t0 = ((unsigned short*)(&src))[bidx], t1 = src.y, t2 = ((unsigned short*)(&src))[bidx ^ 2];
+            dst.y = (unsigned short)CV_DESCALE((unsigned)(t0 * cb + t1 * cg + t2 * cr), yuv_shift);

            *(ushort2*)(dst_ + y * dst_step + x) = dst;
        }
@@ -613,22 +613,22 @@ namespace imgproc
        const float cr = 0.299f;
        const float cg = 0.587f;
        const float cb = 0.114f;
-        
+
 		const int x = blockDim.x * blockIdx.x + threadIdx.x;
 		const int y = blockDim.y * blockIdx.y + threadIdx.y;

        if (y < rows && x < cols)
        {
            float4 src = *(float4*)(src_ + y * src_step + (x << 2));
-						
+
            float t0 = ((float*)(&src))[bidx], t1 = src.y, t2 = ((float*)(&src))[bidx ^ 2];
            *(dst_ + y * dst_step + x) = t0 * cb + t1 * cg + t2 * cr;
        }
    }
 }

-namespace cv { namespace gpu { namespace impl 
-{    
+namespace cv { namespace gpu { namespace impl
+{
    void RGB2Gray_gpu(const DevMem2D& src, int srccn, const DevMem2D& dst, int bidx, cudaStream_t stream)
    {
        dim3 threads(32, 8, 1);
@@ -654,7 +654,7 @@ namespace cv { namespace gpu { namespace impl
            cudaSafeCall( cudaThreadSynchronize() );
    }

-    void RGB2Gray_gpu(const DevMem2D_<ushort>& src, int srccn, const DevMem2D_<ushort>& dst, int bidx, cudaStream_t stream)
+    void RGB2Gray_gpu(const DevMem2D_<unsigned short>& src, int srccn, const DevMem2D_<unsigned short>& dst, int bidx, cudaStream_t stream)
    {
        dim3 threads(32, 8, 1);
        dim3 grid(1, 1, 1);
@@ -665,10 +665,10 @@ namespace cv { namespace gpu { namespace impl
        switch (srccn)
        {
        case 3:
-            imgproc::RGB2Gray_3<<<grid, threads, 0, stream>>>(src.ptr, src.step / sizeof(ushort), dst.ptr, dst.step / sizeof(ushort), src.rows, src.cols, bidx);
+            imgproc::RGB2Gray_3<<<grid, threads, 0, stream>>>(src.ptr, src.step / sizeof(unsigned short), dst.ptr, dst.step / sizeof(unsigned short), src.rows, src.cols, bidx);
            break;
        case 4:
-            imgproc::RGB2Gray_4<<<grid, threads, 0, stream>>>(src.ptr, src.step / sizeof(ushort), dst.ptr, dst.step / sizeof(ushort), src.rows, src.cols, bidx);
+            imgproc::RGB2Gray_4<<<grid, threads, 0, stream>>>(src.ptr, src.step / sizeof(unsigned short), dst.ptr, dst.step / sizeof(unsigned short), src.rows, src.cols, bidx);
            break;
        default:
            cv::gpu::error("Unsupported channels count", __FILE__, __LINE__);
@@ -704,7 +704,7 @@ namespace cv { namespace gpu { namespace impl
            cudaSafeCall( cudaThreadSynchronize() );
    }
 }}}
-    
+
 ///////////////////////////////////// RGB <-> YCrCb //////////////////////////////////////

 //namespace imgproc
@@ -712,14 +712,14 @@ namespace cv { namespace gpu { namespace impl
 //    template<typename _Tp> struct RGB2YCrCb_f
 //    {
 //        typedef _Tp channel_type;
-//        
+//
 //        RGB2YCrCb_f(int _srccn, int _blueIdx, const float* _coeffs) : srccn(_srccn), blueIdx(_blueIdx)
 //	    {
 //		    static const float coeffs0[] = {0.299f, 0.587f, 0.114f, 0.713f, 0.564f};
 //		    memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 5*sizeof(coeffs[0]));
 //		    if(blueIdx==0) std::swap(coeffs[0], coeffs[2]);
 //	    }
-//    	
+//
 //        void operator()(const _Tp* src, _Tp* dst, int n) const
 //        {
 //            int scn = srccn, bidx = blueIdx;
@@ -741,7 +741,7 @@ namespace cv { namespace gpu { namespace impl
 //    template<typename _Tp> struct RGB2YCrCb_i
 //    {
 //        typedef _Tp channel_type;
-//        
+//
 //        RGB2YCrCb_i(int _srccn, int _blueIdx, const int* _coeffs)
 //		    : srccn(_srccn), blueIdx(_blueIdx)
 //	    {
@@ -772,11 +772,11 @@ namespace cv { namespace gpu { namespace impl
 //    template<typename _Tp> struct YCrCb2RGB_f
 //    {
 //        typedef _Tp channel_type;
-//        
+//
 //        YCrCb2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs)
 //		    : dstcn(_dstcn), blueIdx(_blueIdx)
 //	    {
-//		    static const float coeffs0[] = {1.403f, -0.714f, -0.344f, 1.773f}; 
+//		    static const float coeffs0[] = {1.403f, -0.714f, -0.344f, 1.773f};
 //		    memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
 //	    }
 //        void operator()(const _Tp* src, _Tp* dst, int n) const
@@ -790,11 +790,11 @@ namespace cv { namespace gpu { namespace impl
 //                _Tp Y = src[i];
 //                _Tp Cr = src[i+1];
 //                _Tp Cb = src[i+2];
-//                
+//
 //                _Tp b = saturate_cast<_Tp>(Y + (Cb - delta)*C3);
 //                _Tp g = saturate_cast<_Tp>(Y + (Cb - delta)*C2 + (Cr - delta)*C1);
 //                _Tp r = saturate_cast<_Tp>(Y + (Cr - delta)*C0);
-//                
+//
 //                dst[bidx] = b; dst[1] = g; dst[bidx^2] = r;
 //                if( dcn == 4 )
 //                    dst[3] = alpha;
@@ -807,14 +807,14 @@ namespace cv { namespace gpu { namespace impl
 //    template<typename _Tp> struct YCrCb2RGB_i
 //    {
 //        typedef _Tp channel_type;
-//        
+//
 //        YCrCb2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
 //            : dstcn(_dstcn), blueIdx(_blueIdx)
 //        {
-//            static const int coeffs0[] = {22987, -11698, -5636, 29049}; 
+//            static const int coeffs0[] = {22987, -11698, -5636, 29049};
 //		    memcpy(coeffs, _coeffs ? _coeffs : coeffs0, 4*sizeof(coeffs[0]));
 //        }
-//        
+//
 //        void operator()(const _Tp* src, _Tp* dst, int n) const
 //        {
 //            int dcn = dstcn, bidx = blueIdx;
@@ -826,11 +826,11 @@ namespace cv { namespace gpu { namespace impl
 //                _Tp Y = src[i];
 //                _Tp Cr = src[i+1];
 //                _Tp Cb = src[i+2];
-//                
+//
 //                int b = Y + CV_DESCALE((Cb - delta)*C3, yuv_shift);
 //                int g = Y + CV_DESCALE((Cb - delta)*C2 + (Cr - delta)*C1, yuv_shift);
 //                int r = Y + CV_DESCALE((Cr - delta)*C0, yuv_shift);
-//                
+//
 //                dst[bidx] = saturate_cast<_Tp>(b);
 //                dst[1] = saturate_cast<_Tp>(g);
 //                dst[bidx^2] = saturate_cast<_Tp>(r);
@@ -843,10 +843,10 @@ namespace cv { namespace gpu { namespace impl
 //    };
 //}
 //
-//namespace cv { namespace gpu { namespace impl 
+//namespace cv { namespace gpu { namespace impl
 //{
 //}}}
-    
+
 ////////////////////////////////////// RGB <-> XYZ ///////////////////////////////////////

 //namespace imgproc
@@ -857,18 +857,18 @@ namespace cv { namespace gpu { namespace impl
 //        0.212671f, 0.715160f, 0.072169f,
 //        0.019334f, 0.119193f, 0.950227f
 //    };
-//        
+//
 //    static const float XYZ2sRGB_D65[] =
 //    {
 //        3.240479f, -1.53715f, -0.498535f,
 //        -0.969256f, 1.875991f, 0.041556f,
 //        0.055648f, -0.204043f, 1.057311f
 //    };
-//        
+//
 //    template<typename _Tp> struct RGB2XYZ_f
 //    {
 //        typedef _Tp channel_type;
-//        
+//
 //        RGB2XYZ_f(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
 //        {
 //            memcpy(coeffs, _coeffs ? _coeffs : sRGB2XYZ_D65, 9*sizeof(coeffs[0]));
@@ -885,7 +885,7 @@ namespace cv { namespace gpu { namespace impl
 //            float C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
 //                  C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
 //                  C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
-//            
+//
 //            n *= 3;
 //            for(int i = 0; i < n; i += 3, src += scn)
 //            {
@@ -902,13 +902,13 @@ namespace cv { namespace gpu { namespace impl
 //    template<typename _Tp> struct RGB2XYZ_i
 //    {
 //        typedef _Tp channel_type;
-//        
+//
 //        RGB2XYZ_i(int _srccn, int blueIdx, const float* _coeffs) : srccn(_srccn)
 //        {
 //            static const int coeffs0[] =
 //            {
-//                1689,    1465,    739,   
-//                871,     2929,    296,   
+//                1689,    1465,    739,
+//                871,     2929,    296,
 //                79,      488,     3892
 //            };
 //            for( int i = 0; i < 9; i++ )
@@ -939,11 +939,11 @@ namespace cv { namespace gpu { namespace impl
 //        int srccn;
 //        int coeffs[9];
 //    };
-//                
+//
 //    template<typename _Tp> struct XYZ2RGB_f
 //    {
 //        typedef _Tp channel_type;
-//        
+//
 //        XYZ2RGB_f(int _dstcn, int _blueIdx, const float* _coeffs)
 //        : dstcn(_dstcn), blueIdx(_blueIdx)
 //        {
@@ -955,7 +955,7 @@ namespace cv { namespace gpu { namespace impl
 //                std::swap(coeffs[2], coeffs[8]);
 //            }
 //        }
-//        
+//
 //        void operator()(const _Tp* src, _Tp* dst, int n) const
 //        {
 //            int dcn = dstcn;
@@ -981,19 +981,19 @@ namespace cv { namespace gpu { namespace impl
 //    template<typename _Tp> struct XYZ2RGB_i
 //    {
 //        typedef _Tp channel_type;
-//        
+//
 //        XYZ2RGB_i(int _dstcn, int _blueIdx, const int* _coeffs)
 //        : dstcn(_dstcn), blueIdx(_blueIdx)
 //        {
 //            static const int coeffs0[] =
 //            {
-//                13273,  -6296,  -2042,  
-//                -3970,   7684,    170,   
+//                13273,  -6296,  -2042,
+//                -3970,   7684,    170,
 //                  228,   -836,   4331
 //            };
 //            for(int i = 0; i < 9; i++)
 //                coeffs[i] = _coeffs ? cvRound(_coeffs[i]*(1 << xyz_shift)) : coeffs0[i];
-//            
+//
 //            if(blueIdx == 0)
 //            {
 //                std::swap(coeffs[0], coeffs[6]);
@@ -1034,15 +1034,15 @@ namespace cv { namespace gpu { namespace impl
 //struct RGB2HSV_b
 //{
 //    typedef uchar channel_type;
-//    
+//
 //    RGB2HSV_b(int _srccn, int _blueIdx, int _hrange)
 //    : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange) {}
-//    
+//
 //    void operator()(const uchar* src, uchar* dst, int n) const
 //    {
 //        int i, bidx = blueIdx, scn = srccn;
 //        const int hsv_shift = 12;
-//        
+//
 //        static const int div_table[] = {
 //            0, 1044480, 522240, 348160, 261120, 208896, 174080, 149211,
 //            130560, 116053, 104448, 94953, 87040, 80345, 74606, 69632,
@@ -1079,65 +1079,65 @@ namespace cv { namespace gpu { namespace impl
 //        };
 //        int hr = hrange, hscale = hr == 180 ? 15 : 21;
 //        n *= 3;
-//        
+//
 //        for( i = 0; i < n; i += 3, src += scn )
 //        {
 //            int b = src[bidx], g = src[1], r = src[bidx^2];
 //            int h, s, v = b;
 //            int vmin = b, diff;
 //            int vr, vg;
-//            
+//
 //            CV_CALC_MAX_8U( v, g );
 //            CV_CALC_MAX_8U( v, r );
 //            CV_CALC_MIN_8U( vmin, g );
 //            CV_CALC_MIN_8U( vmin, r );
-//            
+//
 //            diff = v - vmin;
 //            vr = v == r ? -1 : 0;
 //            vg = v == g ? -1 : 0;
-//            
+//
 //            s = diff * div_table[v] >> hsv_shift;
 //            h = (vr & (g - b)) +
 //                (~vr & ((vg & (b - r + 2 * diff)) + ((~vg) & (r - g + 4 * diff))));
 //            h = (h * div_table[diff] * hscale + (1 << (hsv_shift + 6))) >> (7 + hsv_shift);
 //            h += h < 0 ? hr : 0;
-//            
+//
 //            dst[i] = (uchar)h;
 //            dst[i+1] = (uchar)s;
 //            dst[i+2] = (uchar)v;
 //        }
 //    }
-//                 
+//
 //    int srccn, blueIdx, hrange;
-//};    
+//};
+//
 //
-//                 
 //struct RGB2HSV_f
 //{
 //    typedef float channel_type;
-//    
+//
 //    RGB2HSV_f(int _srccn, int _blueIdx, float _hrange)
 //    : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange) {}
-//    
+//
 //    void operator()(const float* src, float* dst, int n) const
 //    {
 //        int i, bidx = blueIdx, scn = srccn;
 //        float hscale = hrange*(1.f/360.f);
 //        n *= 3;
-//    
+//
 //        for( i = 0; i < n; i += 3, src += scn )
 //        {
 //            float b = src[bidx], g = src[1], r = src[bidx^2];
 //            float h, s, v;
-//            
+//
 //            float vmin, diff;
-//            
+//
 //            v = vmin = r;
 //            if( v < g ) v = g;
 //            if( v < b ) v = b;
 //            if( vmin > g ) vmin = g;
 //            if( vmin > b ) vmin = b;
-//            
+//
 //            diff = v - vmin;
 //            s = diff/(float)(fabs(v) + FLT_EPSILON);
 //            diff = (float)(60./(diff + FLT_EPSILON));
@@ -1147,15 +1147,15 @@ namespace cv { namespace gpu { namespace impl
 //                h = (b - r)*diff + 120.f;
 //            else
 //                h = (r - g)*diff + 240.f;
-//            
+//
 //            if( h < 0 ) h += 360.f;
-//            
+//
 //            dst[i] = h*hscale;
 //            dst[i+1] = s;
 //            dst[i+2] = v;
 //        }
 //    }
-//    
+//
 //    int srccn, blueIdx;
 //    float hrange;
 //};
@@ -1164,17 +1164,17 @@ namespace cv { namespace gpu { namespace impl
 //struct HSV2RGB_f
 //{
 //    typedef float channel_type;
-//    
+//
 //    HSV2RGB_f(int _dstcn, int _blueIdx, float _hrange)
 //    : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange) {}
-//    
+//
 //    void operator()(const float* src, float* dst, int n) const
 //    {
 //        int i, bidx = blueIdx, dcn = dstcn;
 //        float _hscale = hscale;
 //        float alpha = ColorChannel<float>::max();
 //        n *= 3;
-//        
+//
 //        for( i = 0; i < n; i += 3, dst += dcn )
 //        {
 //            float h = src[i], s = src[i+1], v = src[i+2];
@@ -1200,7 +1200,7 @@ namespace cv { namespace gpu { namespace impl
 //                tab[1] = v*(1.f - s);
 //                tab[2] = v*(1.f - s*h);
 //                tab[3] = v*(1.f - s*(1.f - h));
-//                
+//
 //                b = tab[sector_data[sector][0]];
 //                g = tab[sector_data[sector][1]];
 //                r = tab[sector_data[sector][2]];
@@ -1217,26 +1217,26 @@ namespace cv { namespace gpu { namespace impl
 //    int dstcn, blueIdx;
 //    float hscale;
 //};
-//    
+//
 //
 //struct HSV2RGB_b
 //{
 //    typedef uchar channel_type;
-//    
+//
 //    HSV2RGB_b(int _dstcn, int _blueIdx, int _hrange)
 //    : dstcn(_dstcn), cvt(3, _blueIdx, _hrange)
 //    {}
-//    
+//
 //    void operator()(const uchar* src, uchar* dst, int n) const
 //    {
 //        int i, j, dcn = dstcn;
 //        uchar alpha = ColorChannel<uchar>::max();
 //        float buf[3*BLOCK_SIZE];
-//        
+//
 //        for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
 //        {
 //            int dn = std::min(n - i, (int)BLOCK_SIZE);
-//            
+//
 //            for( j = 0; j < dn*3; j += 3 )
 //            {
 //                buf[j] = src[j];
@@ -1244,7 +1244,7 @@ namespace cv { namespace gpu { namespace impl
 //                buf[j+2] = src[j+2]*(1.f/255.f);
 //            }
 //            cvt(buf, buf, dn);
-//            
+//
 //            for( j = 0; j < dn*3; j += 3, dst += dcn )
 //            {
 //                dst[0] = saturate_cast<uchar>(buf[j]*255.f);
@@ -1255,84 +1255,84 @@ namespace cv { namespace gpu { namespace impl
 //            }
 //        }
 //    }
-//    
+//
 //    int dstcn;
 //    HSV2RGB_f cvt;
 //};
 //
-//    
+//
 /////////////////////////////////////// RGB <-> HLS ////////////////////////////////////////
 //
 //struct RGB2HLS_f
 //{
 //    typedef float channel_type;
-//    
+//
 //    RGB2HLS_f(int _srccn, int _blueIdx, float _hrange)
 //    : srccn(_srccn), blueIdx(_blueIdx), hrange(_hrange) {}
-//    
+//
 //    void operator()(const float* src, float* dst, int n) const
 //    {
 //        int i, bidx = blueIdx, scn = srccn;
 //        float hscale = hrange*(1.f/360.f);
 //        n *= 3;
-//        
+//
 //        for( i = 0; i < n; i += 3, src += scn )
 //        {
 //            float b = src[bidx], g = src[1], r = src[bidx^2];
 //            float h = 0.f, s = 0.f, l;
 //            float vmin, vmax, diff;
-//            
+//
 //            vmax = vmin = r;
 //            if( vmax < g ) vmax = g;
 //            if( vmax < b ) vmax = b;
 //            if( vmin > g ) vmin = g;
 //            if( vmin > b ) vmin = b;
-//            
+//
 //            diff = vmax - vmin;
 //            l = (vmax + vmin)*0.5f;
-//            
+//
 //            if( diff > FLT_EPSILON )
 //            {
 //                s = l < 0.5f ? diff/(vmax + vmin) : diff/(2 - vmax - vmin);
 //                diff = 60.f/diff;
-//                
+//
 //                if( vmax == r )
 //                    h = (g - b)*diff;
 //                else if( vmax == g )
 //                    h = (b - r)*diff + 120.f;
 //                else
 //                    h = (r - g)*diff + 240.f;
-//                
+//
 //                if( h < 0.f ) h += 360.f;
 //            }
-//            
+//
 //            dst[i] = h*hscale;
 //            dst[i+1] = l;
 //            dst[i+2] = s;
 //        }
 //    }
-//    
+//
 //    int srccn, blueIdx;
 //    float hrange;
 //};
-//    
-//    
+//
+//
 //struct RGB2HLS_b
 //{
 //    typedef uchar channel_type;
-//    
+//
 //    RGB2HLS_b(int _srccn, int _blueIdx, int _hrange)
 //    : srccn(_srccn), cvt(3, _blueIdx, (float)_hrange) {}
-//    
+//
 //    void operator()(const uchar* src, uchar* dst, int n) const
 //    {
 //        int i, j, scn = srccn;
 //        float buf[3*BLOCK_SIZE];
-//        
+//
 //        for( i = 0; i < n; i += BLOCK_SIZE, dst += BLOCK_SIZE*3 )
 //        {
 //            int dn = std::min(n - i, (int)BLOCK_SIZE);
-//            
+//
 //            for( j = 0; j < dn*3; j += 3, src += scn )
 //            {
 //                buf[j] = src[0]*(1.f/255.f);
@@ -1340,7 +1340,7 @@ namespace cv { namespace gpu { namespace impl
 //                buf[j+2] = src[2]*(1.f/255.f);
 //            }
 //            cvt(buf, buf, dn);
-//            
+//
 //            for( j = 0; j < dn*3; j += 3 )
 //            {
 //                dst[j] = saturate_cast<uchar>(buf[j]);
@@ -1349,31 +1349,31 @@ namespace cv { namespace gpu { namespace impl
 //            }
 //        }
 //    }
-//    
+//
 //    int srccn;
 //    RGB2HLS_f cvt;
 //};
-//    
+//
 //
 //struct HLS2RGB_f
 //{
 //    typedef float channel_type;
-//    
+//
 //    HLS2RGB_f(int _dstcn, int _blueIdx, float _hrange)
 //    : dstcn(_dstcn), blueIdx(_blueIdx), hscale(6.f/_hrange) {}
-//    
+//
 //    void operator()(const float* src, float* dst, int n) const
 //    {
 //        int i, bidx = blueIdx, dcn = dstcn;
 //        float _hscale = hscale;
 //        float alpha = ColorChannel<float>::max();
 //        n *= 3;
-//        
+//
 //        for( i = 0; i < n; i += 3, dst += dcn )
 //        {
 //            float h = src[i], l = src[i+1], s = src[i+2];
 //            float b, g, r;
-//            
+//
 //            if( s == 0 )
 //                b = g = r = l;
 //            else
@@ -1382,30 +1382,30 @@ namespace cv { namespace gpu { namespace impl
 //                {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};
 //                float tab[4];
 //                int sector;
-//                
+//
 //                float p2 = l <= 0.5f ? l*(1 + s) : l + s - l*s;
 //                float p1 = 2*l - p2;
-//                
+//
 //                h *= _hscale;
 //                if( h < 0 )
 //                    do h += 6; while( h < 0 );
 //                else if( h >= 6 )
 //                    do h -= 6; while( h >= 6 );
-//                
+//
 //                assert( 0 <= h && h < 6 );
 //                sector = cvFloor(h);
 //                h -= sector;
-//                
+//
 //                tab[0] = p2;
 //                tab[1] = p1;
 //                tab[2] = p1 + (p2 - p1)*(1-h);
 //                tab[3] = p1 + (p2 - p1)*h;
-//                
+//
 //                b = tab[sector_data[sector][0]];
 //                g = tab[sector_data[sector][1]];
 //                r = tab[sector_data[sector][2]];
 //            }
-//            
+//
 //            dst[bidx] = b;
 //            dst[1] = g;
 //            dst[bidx^2] = r;
@@ -1413,30 +1413,30 @@ namespace cv { namespace gpu { namespace impl
 //                dst[3] = alpha;
 //        }
 //    }
-//        
+//
 //    int dstcn, blueIdx;
 //    float hscale;
 //};
-//    
+//
 //
 //struct HLS2RGB_b
 //{
 //    typedef uchar channel_type;
-//    
+//
 //    HLS2RGB_b(int _dstcn, int _blueIdx, int _hrange)
 //    : dstcn(_dstcn), cvt(3, _blueIdx, _hrange)
 //    {}
-//    
+//
 //    void operator()(const uchar* src, uchar* dst, int n) const
 //    {
 //        int i, j, dcn = dstcn;
 //        uchar alpha = ColorChannel<uchar>::max();
 //        float buf[3*BLOCK_SIZE];
-//        
+//
 //        for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
 //        {
 //            int dn = std::min(n - i, (int)BLOCK_SIZE);
-//            
+//
 //            for( j = 0; j < dn*3; j += 3 )
 //            {
 //                buf[j] = src[j];
@@ -1444,7 +1444,7 @@ namespace cv { namespace gpu { namespace impl
 //                buf[j+2] = src[j+2]*(1.f/255.f);
 //            }
 //            cvt(buf, buf, dn);
-//            
+//
 //            for( j = 0; j < dn*3; j += 3, dst += dcn )
 //            {
 //                dst[0] = saturate_cast<uchar>(buf[j]*255.f);
@@ -1455,12 +1455,12 @@ namespace cv { namespace gpu { namespace impl
 //            }
 //        }
 //    }
-//    
+//
 //    int dstcn;
 //    HLS2RGB_f cvt;
 //};
 //
-//    
+//
 /////////////////////////////////////// RGB <-> L*a*b* /////////////////////////////////////
 //
 //static const float D65[] = { 0.950456f, 1.f, 1.088754f };
@@ -1471,15 +1471,15 @@ namespace cv { namespace gpu { namespace impl
 //
 //static float sRGBGammaTab[GAMMA_TAB_SIZE*4], sRGBInvGammaTab[GAMMA_TAB_SIZE*4];
 //static const float GammaTabScale = (float)GAMMA_TAB_SIZE;
-//    
-//static ushort sRGBGammaTab_b[256], linearGammaTab_b[256];    
+//
+//static unsigned short sRGBGammaTab_b[256], linearGammaTab_b[256];
 //#undef lab_shift
 //#define lab_shift xyz_shift
 //#define gamma_shift 3
 //#define lab_shift2 (lab_shift + gamma_shift)
 //#define LAB_CBRT_TAB_SIZE_B (256*3/2*(1<<gamma_shift))
-//static ushort LabCbrtTab_b[LAB_CBRT_TAB_SIZE_B];
-//    
+//static unsigned short LabCbrtTab_b[LAB_CBRT_TAB_SIZE_B];
+//
 //static void initLabTabs()
 //{
 //    static bool initialized = false;
@@ -1493,7 +1493,7 @@ namespace cv { namespace gpu { namespace impl
 //            f[i] = x < 0.008856f ? x*7.787f + 0.13793103448275862f : cvCbrt(x);
 //        }
 //        splineBuild(f, LAB_CBRT_TAB_SIZE, LabCbrtTab);
-//        
+//
 //        scale = 1.f/GammaTabScale;
 //        for(i = 0; i <= GAMMA_TAB_SIZE; i++)
 //        {
@@ -1503,18 +1503,18 @@ namespace cv { namespace gpu { namespace impl
 //        }
 //        splineBuild(g, GAMMA_TAB_SIZE, sRGBGammaTab);
 //        splineBuild(ig, GAMMA_TAB_SIZE, sRGBInvGammaTab);
-//        
+//
 //        for(i = 0; i < 256; i++)
 //        {
 //            float x = i*(1.f/255.f);
-//            sRGBGammaTab_b[i] = saturate_cast<ushort>(255.f*(1 << gamma_shift)*(x <= 0.04045f ? x*(1.f/12.92f) : (float)pow((double)(x + 0.055)*(1./1.055), 2.4)));
-//            linearGammaTab_b[i] = (ushort)(i*(1 << gamma_shift));
+//            sRGBGammaTab_b[i] = saturate_cast<unsigned short>(255.f*(1 << gamma_shift)*(x <= 0.04045f ? x*(1.f/12.92f) : (float)pow((double)(x + 0.055)*(1./1.055), 2.4)));
+//            linearGammaTab_b[i] = (unsigned short)(i*(1 << gamma_shift));
 //        }
-//        
+//
 //        for(i = 0; i < LAB_CBRT_TAB_SIZE_B; i++)
 //        {
 //            float x = i*(1.f/(255.f*(1 << gamma_shift)));
-//            LabCbrtTab_b[i] = saturate_cast<ushort>((1 << lab_shift2)*(x < 0.008856f ? x*7.787f + 0.13793103448275862f : cvCbrt(x)));
+//            LabCbrtTab_b[i] = saturate_cast<unsigned short>((1 << lab_shift2)*(x < 0.008856f ? x*7.787f + 0.13793103448275862f : cvCbrt(x)));
 //        }
 //        initialized = true;
 //    }
@@ -1524,13 +1524,13 @@ namespace cv { namespace gpu { namespace impl
 //struct RGB2Lab_b
 //{
 //    typedef uchar channel_type;
-//    
+//
 //    RGB2Lab_b(int _srccn, int blueIdx, const float* _coeffs,
 //              const float* _whitept, bool _srgb)
 //    : srccn(_srccn), srgb(_srgb)
 //    {
 //        initLabTabs();
-//        
+//
 //        if(!_coeffs) _coeffs = sRGB2XYZ_D65;
 //        if(!_whitept) _whitept = D65;
 //        float scale[] =
@@ -1539,7 +1539,7 @@ namespace cv { namespace gpu { namespace impl
 //            (float)(1 << lab_shift),
 //            (1 << lab_shift)/_whitept[2]
 //        };
-//        
+//
 //        for( int i = 0; i < 3; i++ )
 //        {
 //            coeffs[i*3+(blueIdx^2)] = cvRound(_coeffs[i*3]*scale[i]);
@@ -1549,55 +1549,55 @@ namespace cv { namespace gpu { namespace impl
 //                      coeffs[i*3] + coeffs[i*3+1] + coeffs[i*3+2] < 2*(1 << lab_shift) );
 //        }
 //    }
-//    
+//
 //    void operator()(const uchar* src, uchar* dst, int n) const
 //    {
 //        const int Lscale = (116*255+50)/100;
 //        const int Lshift = -((16*255*(1 << lab_shift2) + 50)/100);
-//        const ushort* tab = srgb ? sRGBGammaTab_b : linearGammaTab_b;
+//        const unsigned short* tab = srgb ? sRGBGammaTab_b : linearGammaTab_b;
 //        int i, scn = srccn;
 //        int C0 = coeffs[0], C1 = coeffs[1], C2 = coeffs[2],
 //            C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
 //            C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
 //        n *= 3;
-//        
+//
 //        for( i = 0; i < n; i += 3, src += scn )
 //        {
 //            int R = tab[src[0]], G = tab[src[1]], B = tab[src[2]];
 //            int fX = LabCbrtTab_b[CV_DESCALE(R*C0 + G*C1 + B*C2, lab_shift)];
 //            int fY = LabCbrtTab_b[CV_DESCALE(R*C3 + G*C4 + B*C5, lab_shift)];
 //            int fZ = LabCbrtTab_b[CV_DESCALE(R*C6 + G*C7 + B*C8, lab_shift)];
-//            
+//
 //            int L = CV_DESCALE( Lscale*fY + Lshift, lab_shift2 );
 //            int a = CV_DESCALE( 500*(fX - fY) + 128*(1 << lab_shift2), lab_shift2 );
 //            int b = CV_DESCALE( 200*(fY - fZ) + 128*(1 << lab_shift2), lab_shift2 );
-//            
+//
 //            dst[i] = saturate_cast<uchar>(L);
 //            dst[i+1] = saturate_cast<uchar>(a);
 //            dst[i+2] = saturate_cast<uchar>(b);
 //        }
 //    }
-//    
+//
 //    int srccn;
 //    int coeffs[9];
 //    bool srgb;
 //};
-//    
-//    
+//
+//
 //struct RGB2Lab_f
 //{
 //    typedef float channel_type;
-//    
+//
 //    RGB2Lab_f(int _srccn, int blueIdx, const float* _coeffs,
 //              const float* _whitept, bool _srgb)
 //    : srccn(_srccn), srgb(_srgb)
 //    {
 //        initLabTabs();
-//        
+//
 //        if(!_coeffs) _coeffs = sRGB2XYZ_D65;
 //        if(!_whitept) _whitept = D65;
 //        float scale[] = { LabCbrtTabScale/_whitept[0], LabCbrtTabScale, LabCbrtTabScale/_whitept[2] };
-//        
+//
 //        for( int i = 0; i < 3; i++ )
 //        {
 //            coeffs[i*3+(blueIdx^2)] = _coeffs[i*3]*scale[i];
@@ -1607,7 +1607,7 @@ namespace cv { namespace gpu { namespace impl
 //                       coeffs[i*3] + coeffs[i*3+1] + coeffs[i*3+2] < 1.5f*LabCbrtTabScale );
 //        }
 //    }
-//    
+//
 //    void operator()(const float* src, float* dst, int n) const
 //    {
 //        int i, scn = srccn;
@@ -1617,7 +1617,7 @@ namespace cv { namespace gpu { namespace impl
 //              C3 = coeffs[3], C4 = coeffs[4], C5 = coeffs[5],
 //              C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
 //        n *= 3;
-//        
+//
 //        for( i = 0; i < n; i += 3, src += scn )
 //        {
 //            float R = src[0], G = src[1], B = src[2];
@@ -1627,37 +1627,37 @@ namespace cv { namespace gpu { namespace impl
 //                G = splineInterpolate(G*gscale, gammaTab, GAMMA_TAB_SIZE);
 //                B = splineInterpolate(B*gscale, gammaTab, GAMMA_TAB_SIZE);
 //            }
-//            float fX = splineInterpolate(R*C0 + G*C1 + B*C2, LabCbrtTab, LAB_CBRT_TAB_SIZE); 
+//            float fX = splineInterpolate(R*C0 + G*C1 + B*C2, LabCbrtTab, LAB_CBRT_TAB_SIZE);
 //            float fY = splineInterpolate(R*C3 + G*C4 + B*C5, LabCbrtTab, LAB_CBRT_TAB_SIZE);
 //            float fZ = splineInterpolate(R*C6 + G*C7 + B*C8, LabCbrtTab, LAB_CBRT_TAB_SIZE);
-//            
+//
 //            float L = 116.f*fY - 16.f;
 //            float a = 500.f*(fX - fY);
 //            float b = 200.f*(fY - fZ);
-//            
+//
 //            dst[i] = L; dst[i+1] = a; dst[i+2] = b;
 //        }
 //    }
-//    
+//
 //    int srccn;
 //    float coeffs[9];
 //    bool srgb;
 //};
 //
-//    
+//
 //struct Lab2RGB_f
 //{
 //    typedef float channel_type;
-//    
+//
 //    Lab2RGB_f( int _dstcn, int blueIdx, const float* _coeffs,
 //               const float* _whitept, bool _srgb )
 //    : dstcn(_dstcn), srgb(_srgb)
 //    {
 //        initLabTabs();
-//        
+//
 //        if(!_coeffs) _coeffs = XYZ2sRGB_D65;
 //        if(!_whitept) _whitept = D65;
-//        
+//
 //        for( int i = 0; i < 3; i++ )
 //        {
 //            coeffs[i+(blueIdx^2)*3] = _coeffs[i]*_whitept[i];
@@ -1665,7 +1665,7 @@ namespace cv { namespace gpu { namespace impl
 //            coeffs[i+blueIdx*3] = _coeffs[i+6]*_whitept[i];
 //        }
 //    }
-//    
+//
 //    void operator()(const float* src, float* dst, int n) const
 //    {
 //        int i, dcn = dstcn;
@@ -1676,7 +1676,7 @@ namespace cv { namespace gpu { namespace impl
 //              C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
 //        float alpha = ColorChannel<float>::max();
 //        n *= 3;
-//        
+//
 //        for( i = 0; i < n; i += 3, dst += dcn )
 //        {
 //            float L = src[i], a = src[i+1], b = src[i+2];
@@ -1686,48 +1686,48 @@ namespace cv { namespace gpu { namespace impl
 //            Y = Y*Y*Y;
 //            X = X*X*X;
 //            Z = Z*Z*Z;
-//            
+//
 //            float R = X*C0 + Y*C1 + Z*C2;
 //            float G = X*C3 + Y*C4 + Z*C5;
 //            float B = X*C6 + Y*C7 + Z*C8;
-//            
+//
 //            if( gammaTab )
 //            {
 //                R = splineInterpolate(R*gscale, gammaTab, GAMMA_TAB_SIZE);
 //                G = splineInterpolate(G*gscale, gammaTab, GAMMA_TAB_SIZE);
 //                B = splineInterpolate(B*gscale, gammaTab, GAMMA_TAB_SIZE);
 //            }
-//            
+//
 //            dst[0] = R; dst[1] = G; dst[2] = B;
 //            if( dcn == 4 )
 //                dst[3] = alpha;
 //        }
 //    }
-//    
+//
 //    int dstcn;
 //    float coeffs[9];
 //    bool srgb;
 //};
 //
-//    
+//
 //struct Lab2RGB_b
 //{
 //    typedef uchar channel_type;
-//    
+//
 //    Lab2RGB_b( int _dstcn, int blueIdx, const float* _coeffs,
 //               const float* _whitept, bool _srgb )
 //    : dstcn(_dstcn), cvt(3, blueIdx, _coeffs, _whitept, _srgb ) {}
-//    
+//
 //    void operator()(const uchar* src, uchar* dst, int n) const
 //    {
 //        int i, j, dcn = dstcn;
 //        uchar alpha = ColorChannel<uchar>::max();
 //        float buf[3*BLOCK_SIZE];
-//        
+//
 //        for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
 //        {
 //            int dn = std::min(n - i, (int)BLOCK_SIZE);
-//            
+//
 //            for( j = 0; j < dn*3; j += 3 )
 //            {
 //                buf[j] = src[j]*(100.f/255.f);
@@ -1735,7 +1735,7 @@ namespace cv { namespace gpu { namespace impl
 //                buf[j+2] = (float)(src[j+2] - 128);
 //            }
 //            cvt(buf, buf, dn);
-//            
+//
 //            for( j = 0; j < dn*3; j += 3, dst += dcn )
 //            {
 //                dst[0] = saturate_cast<uchar>(buf[j]*255.f);
@@ -1746,27 +1746,27 @@ namespace cv { namespace gpu { namespace impl
 //            }
 //        }
 //    }
-//    
+//
 //    int dstcn;
 //    Lab2RGB_f cvt;
 //};
-//    
-//    
+//
+//
 /////////////////////////////////////// RGB <-> L*u*v* /////////////////////////////////////
 //
 //struct RGB2Luv_f
 //{
 //    typedef float channel_type;
-//    
+//
 //    RGB2Luv_f( int _srccn, int blueIdx, const float* _coeffs,
 //               const float* whitept, bool _srgb )
 //    : srccn(_srccn), srgb(_srgb)
 //    {
 //        initLabTabs();
-//        
+//
 //        if(!_coeffs) _coeffs = sRGB2XYZ_D65;
 //        if(!whitept) whitept = D65;
-//        
+//
 //        for( int i = 0; i < 3; i++ )
 //        {
 //            coeffs[i*3+(blueIdx^2)] = _coeffs[i*3];
@@ -1775,14 +1775,14 @@ namespace cv { namespace gpu { namespace impl
 //            CV_Assert( coeffs[i*3] >= 0 && coeffs[i*3+1] >= 0 && coeffs[i*3+2] >= 0 &&
 //                      coeffs[i*3] + coeffs[i*3+1] + coeffs[i*3+2] < 1.5f );
 //        }
-//        
+//
 //        float d = 1.f/(whitept[0] + whitept[1]*15 + whitept[2]*3);
 //        un = 4*whitept[0]*d;
 //        vn = 9*whitept[1]*d;
-//        
+//
 //        CV_Assert(whitept[1] == 1.f);
 //    }
-//    
+//
 //    void operator()(const float* src, float* dst, int n) const
 //    {
 //        int i, scn = srccn;
@@ -1793,7 +1793,7 @@ namespace cv { namespace gpu { namespace impl
 //              C6 = coeffs[6], C7 = coeffs[7], C8 = coeffs[8];
 //        float _un = 13*un, _vn = 13*vn;
 //        n *= 3;
-//        
+//
 //        for( i = 0; i < n; i += 3, src += scn )
 //        {
 //            float R = src[0], G = src[1], B = src[2];
@@ -1803,55 +1803,55 @@ namespace cv { namespace gpu { namespace impl
 //                G = splineInterpolate(G*gscale, gammaTab, GAMMA_TAB_SIZE);
 //                B = splineInterpolate(B*gscale, gammaTab, GAMMA_TAB_SIZE);
 //            }
-//            
+//
 //            float X = R*C0 + G*C1 + B*C2;
 //            float Y = R*C3 + G*C4 + B*C5;
 //            float Z = R*C6 + G*C7 + B*C8;
-//            
+//
 //            float L = splineInterpolate(Y*LabCbrtTabScale, LabCbrtTab, LAB_CBRT_TAB_SIZE);
 //            L = 116.f*L - 16.f;
-//            
-//            float d = (4*13) / std::max(X + 15 * Y + 3 * Z, FLT_EPSILON);            
+//
+//            float d = (4*13) / std::max(X + 15 * Y + 3 * Z, FLT_EPSILON);
 //            float u = L*(X*d - _un);
 //            float v = L*((9*0.25)*Y*d - _vn);
-//            
+//
 //            dst[i] = L; dst[i+1] = u; dst[i+2] = v;
 //        }
 //    }
-//    
+//
 //    int srccn;
 //    float coeffs[9], un, vn;
 //    bool srgb;
 //};
 //
-//    
+//
 //struct Luv2RGB_f
 //{
 //    typedef float channel_type;
-//    
+//
 //    Luv2RGB_f( int _dstcn, int blueIdx, const float* _coeffs,
 //              const float* whitept, bool _srgb )
 //    : dstcn(_dstcn), srgb(_srgb)
 //    {
 //        initLabTabs();
-//        
+//
 //        if(!_coeffs) _coeffs = XYZ2sRGB_D65;
 //        if(!whitept) whitept = D65;
-//        
+//
 //        for( int i = 0; i < 3; i++ )
 //        {
 //            coeffs[i+(blueIdx^2)*3] = _coeffs[i];
 //            coeffs[i+3] = _coeffs[i+3];
 //            coeffs[i+blueIdx*3] = _coeffs[i+6];
 //        }
-//        
+//
 //        float d = 1.f/(whitept[0] + whitept[1]*15 + whitept[2]*3);
 //        un = 4*whitept[0]*d;
 //        vn = 9*whitept[1]*d;
-//        
+//
 //        CV_Assert(whitept[1] == 1.f);
 //    }
-//    
+//
 //    void operator()(const float* src, float* dst, int n) const
 //    {
 //        int i, dcn = dstcn;
@@ -1863,7 +1863,7 @@ namespace cv { namespace gpu { namespace impl
 //        float alpha = ColorChannel<float>::max();
 //        float _un = un, _vn = vn;
 //        n *= 3;
-//        
+//
 //        for( i = 0; i < n; i += 3, dst += dcn )
 //        {
 //            float L = src[i], u = src[i+1], v = src[i+2], d, X, Y, Z;
@@ -1874,48 +1874,48 @@ namespace cv { namespace gpu { namespace impl
 //            v = v*d + _vn;
 //            float iv = 1.f/v;
 //            X = 2.25f * u * Y * iv ;
-//            Z = (12 - 3 * u - 20 * v) * Y * 0.25 * iv;                
-//                        
+//            Z = (12 - 3 * u - 20 * v) * Y * 0.25 * iv;
+//
 //            float R = X*C0 + Y*C1 + Z*C2;
 //            float G = X*C3 + Y*C4 + Z*C5;
 //            float B = X*C6 + Y*C7 + Z*C8;
-//            
+//
 //            if( gammaTab )
 //            {
 //                R = splineInterpolate(R*gscale, gammaTab, GAMMA_TAB_SIZE);
 //                G = splineInterpolate(G*gscale, gammaTab, GAMMA_TAB_SIZE);
 //                B = splineInterpolate(B*gscale, gammaTab, GAMMA_TAB_SIZE);
 //            }
-//            
+//
 //            dst[0] = R; dst[1] = G; dst[2] = B;
 //            if( dcn == 4 )
 //                dst[3] = alpha;
 //        }
 //    }
-//    
+//
 //    int dstcn;
 //    float coeffs[9], un, vn;
 //    bool srgb;
 //};
 //
-//    
+//
 //struct RGB2Luv_b
 //{
 //    typedef uchar channel_type;
-//    
+//
 //    RGB2Luv_b( int _srccn, int blueIdx, const float* _coeffs,
 //               const float* _whitept, bool _srgb )
 //    : srccn(_srccn), cvt(3, blueIdx, _coeffs, _whitept, _srgb) {}
-//    
+//
 //    void operator()(const uchar* src, uchar* dst, int n) const
 //    {
 //        int i, j, scn = srccn;
 //        float buf[3*BLOCK_SIZE];
-//        
+//
 //        for( i = 0; i < n; i += BLOCK_SIZE, dst += BLOCK_SIZE*3 )
 //        {
 //            int dn = std::min(n - i, (int)BLOCK_SIZE);
-//            
+//
 //            for( j = 0; j < dn*3; j += 3, src += scn )
 //            {
 //                buf[j] = src[0]*(1.f/255.f);
@@ -1923,7 +1923,7 @@ namespace cv { namespace gpu { namespace impl
 //                buf[j+2] = (float)(src[2]*(1.f/255.f));
 //            }
 //            cvt(buf, buf, dn);
-//            
+//
 //            for( j = 0; j < dn*3; j += 3 )
 //            {
 //                dst[j] = saturate_cast<uchar>(buf[j]*2.55f);
@@ -1932,30 +1932,30 @@ namespace cv { namespace gpu { namespace impl
 //            }
 //        }
 //    }
-//    
+//
 //    int srccn;
 //    RGB2Luv_f cvt;
 //};
-//    
+//
 //
 //struct Luv2RGB_b
 //{
 //    typedef uchar channel_type;
-//    
+//
 //    Luv2RGB_b( int _dstcn, int blueIdx, const float* _coeffs,
 //               const float* _whitept, bool _srgb )
 //    : dstcn(_dstcn), cvt(3, blueIdx, _coeffs, _whitept, _srgb ) {}
-//    
+//
 //    void operator()(const uchar* src, uchar* dst, int n) const
 //    {
 //        int i, j, dcn = dstcn;
 //        uchar alpha = ColorChannel<uchar>::max();
 //        float buf[3*BLOCK_SIZE];
-//        
+//
 //        for( i = 0; i < n; i += BLOCK_SIZE, src += BLOCK_SIZE*3 )
 //        {
 //            int dn = std::min(n - i, (int)BLOCK_SIZE);
-//            
+//
 //            for( j = 0; j < dn*3; j += 3 )
 //            {
 //                buf[j] = src[j]*(100.f/255.f);
@@ -1963,7 +1963,7 @@ namespace cv { namespace gpu { namespace impl
 //                buf[j+2] = (float)(src[j+2]*1.003921568627451f - 140.f);
 //            }
 //            cvt(buf, buf, dn);
-//            
+//
 //            for( j = 0; j < dn*3; j += 3, dst += dcn )
 //            {
 //                dst[0] = saturate_cast<uchar>(buf[j]*255.f);
@@ -1974,12 +1974,12 @@ namespace cv { namespace gpu { namespace impl
 //            }
 //        }
 //    }
-//    
+//
 //    int dstcn;
 //    Luv2RGB_f cvt;
 //};
 //
-//        
+//
 ////////////////////////////// Bayer Pattern -> RGB conversion /////////////////////////////
 //
 //static void Bayer2RGB_8u( const Mat& srcmat, Mat& dstmat, int code )
@@ -2079,9 +2079,9 @@ namespace cv { namespace gpu { namespace impl
 //    }
 //}
 //
-//    
+//
 ///////////////////// Demosaicing using Variable Number of Gradients ///////////////////////
-//    
+//
 //static void Bayer2RGB_VNG_8u( const Mat& srcmat, Mat& dstmat, int code )
 //{
 //    const uchar* bayer = srcmat.data;
@@ -2089,45 +2089,45 @@ namespace cv { namespace gpu { namespace impl
 //    uchar* dst = dstmat.data;
 //    int dststep = (int)dstmat.step;
 //    Size size = srcmat.size();
-//    
+//
 //    int blueIdx = code == CV_BayerBG2BGR_VNG || code == CV_BayerGB2BGR_VNG ? 0 : 2;
 //    bool greenCell0 = code != CV_BayerBG2BGR_VNG && code != CV_BayerRG2BGR_VNG;
-//    
+//
 //    // for too small images use the simple interpolation algorithm
 //    if( MIN(size.width, size.height) < 8 )
 //    {
 //        Bayer2RGB_8u( srcmat, dstmat, code );
 //        return;
 //    }
-//    
+//
 //    const int brows = 3, bcn = 7;
-//    int N = size.width, N2 = N*2, N3 = N*3, N4 = N*4, N5 = N*5, N6 = N*6, N7 = N*7;  
+//    int N = size.width, N2 = N*2, N3 = N*3, N4 = N*4, N5 = N*5, N6 = N*6, N7 = N*7;
 //    int i, bufstep = N7*bcn;
-//    cv::AutoBuffer<ushort> _buf(bufstep*brows);
-//    ushort* buf = (ushort*)_buf;
-//    
+//    cv::AutoBuffer<unsigned short> _buf(bufstep*brows);
+//    unsigned short* buf = (unsigned short*)_buf;
+//
 //    bayer += bstep*2;
-//    
+//
 //#if CV_SSE2
 //    bool haveSSE = cv::checkHardwareSupport(CV_CPU_SSE2);
 //    #define _mm_absdiff_epu16(a,b) _mm_adds_epu16(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a))
 //#endif
-//    
+//
 //    for( int y = 2; y < size.height - 4; y++ )
 //    {
 //        uchar* dstrow = dst + dststep*y + 6;
 //        const uchar* srow;
-//        
+//
 //        for( int dy = (y == 2 ? -1 : 1); dy <= 1; dy++ )
 //        {
-//            ushort* brow = buf + ((y + dy - 1)%brows)*bufstep + 1;
+//            unsigned short* brow = buf + ((y + dy - 1)%brows)*bufstep + 1;
 //            srow = bayer + (y+dy)*bstep + 1;
-//            
+//
 //            for( i = 0; i < bcn; i++ )
 //                brow[N*i-1] = brow[(N-2) + N*i] = 0;
-//            
+//
 //            i = 1;
-//            
+//
 //#if CV_SSE2
 //            if( haveSSE )
 //            {
@@ -2135,20 +2135,20 @@ namespace cv { namespace gpu { namespace impl
 //                for( ; i <= N-9; i += 8, srow += 8, brow += 8 )
 //                {
 //                    __m128i s1, s2, s3, s4, s6, s7, s8, s9;
-//                    
+//
 //                    s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-1-bstep)),z);
 //                    s2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep)),z);
 //                    s3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+1-bstep)),z);
-//                    
+//
 //                    s4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-1)),z);
 //                    s6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+1)),z);
-//                    
+//
 //                    s7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-1+bstep)),z);
 //                    s8 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep)),z);
 //                    s9 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+1+bstep)),z);
-//                    
+//
 //                    __m128i b0, b1, b2, b3, b4, b5, b6;
-//                    
+//
 //                    b0 = _mm_adds_epu16(_mm_slli_epi16(_mm_absdiff_epu16(s2,s8),1),
 //                                        _mm_adds_epu16(_mm_absdiff_epu16(s1, s7),
 //                                                       _mm_absdiff_epu16(s3, s9)));
@@ -2157,58 +2157,58 @@ namespace cv { namespace gpu { namespace impl
 //                                                       _mm_absdiff_epu16(s7, s9)));
 //                    b2 = _mm_slli_epi16(_mm_absdiff_epu16(s3,s7),1);
 //                    b3 = _mm_slli_epi16(_mm_absdiff_epu16(s1,s9),1);
-//                    
+//
 //                    _mm_storeu_si128((__m128i*)brow, b0);
 //                    _mm_storeu_si128((__m128i*)(brow + N), b1);
 //                    _mm_storeu_si128((__m128i*)(brow + N2), b2);
 //                    _mm_storeu_si128((__m128i*)(brow + N3), b3);
-//                    
+//
 //                    b4 = _mm_adds_epu16(b2,_mm_adds_epu16(_mm_absdiff_epu16(s2, s4),
 //                                                          _mm_absdiff_epu16(s6, s8)));
 //                    b5 = _mm_adds_epu16(b3,_mm_adds_epu16(_mm_absdiff_epu16(s2, s6),
 //                                                          _mm_absdiff_epu16(s4, s8)));
 //                    b6 = _mm_adds_epu16(_mm_adds_epu16(s2, s4), _mm_adds_epu16(s6, s8));
 //                    b6 = _mm_srli_epi16(b6, 1);
-//                    
+//
 //                    _mm_storeu_si128((__m128i*)(brow + N4), b4);
 //                    _mm_storeu_si128((__m128i*)(brow + N5), b5);
 //                    _mm_storeu_si128((__m128i*)(brow + N6), b6);
 //                }
 //            }
 //#endif
-//            
+//
 //            for( ; i < N-1; i++, srow++, brow++ )
 //            {
-//                brow[0] = (ushort)(std::abs(srow[-1-bstep] - srow[-1+bstep]) +
+//                brow[0] = (unsigned short)(std::abs(srow[-1-bstep] - srow[-1+bstep]) +
 //                                   std::abs(srow[-bstep] - srow[+bstep])*2 +
 //                                   std::abs(srow[1-bstep] - srow[1+bstep]));
-//                brow[N] = (ushort)(std::abs(srow[-1-bstep] - srow[1-bstep]) +
+//                brow[N] = (unsigned short)(std::abs(srow[-1-bstep] - srow[1-bstep]) +
 //                                   std::abs(srow[-1] - srow[1])*2 +
 //                                   std::abs(srow[-1+bstep] - srow[1+bstep]));
-//                brow[N2] = (ushort)(std::abs(srow[+1-bstep] - srow[-1+bstep])*2);
-//                brow[N3] = (ushort)(std::abs(srow[-1-bstep] - srow[1+bstep])*2);
-//                brow[N4] = (ushort)(brow[N2] + std::abs(srow[-bstep] - srow[-1]) +
+//                brow[N2] = (unsigned short)(std::abs(srow[+1-bstep] - srow[-1+bstep])*2);
+//                brow[N3] = (unsigned short)(std::abs(srow[-1-bstep] - srow[1+bstep])*2);
+//                brow[N4] = (unsigned short)(brow[N2] + std::abs(srow[-bstep] - srow[-1]) +
 //                                    std::abs(srow[+bstep] - srow[1]));
-//                brow[N5] = (ushort)(brow[N3] + std::abs(srow[-bstep] - srow[1]) +
+//                brow[N5] = (unsigned short)(brow[N3] + std::abs(srow[-bstep] - srow[1]) +
 //                                    std::abs(srow[+bstep] - srow[-1]));
-//                brow[N6] = (ushort)((srow[-bstep] + srow[-1] + srow[1] + srow[+bstep])>>1);
+//                brow[N6] = (unsigned short)((srow[-bstep] + srow[-1] + srow[1] + srow[+bstep])>>1);
 //            }
 //        }
-//        
-//        const ushort* brow0 = buf + ((y - 2) % brows)*bufstep + 2;
-//        const ushort* brow1 = buf + ((y - 1) % brows)*bufstep + 2;
-//        const ushort* brow2 = buf + (y % brows)*bufstep + 2;
+//
+//        const unsigned short* brow0 = buf + ((y - 2) % brows)*bufstep + 2;
+//        const unsigned short* brow1 = buf + ((y - 1) % brows)*bufstep + 2;
+//        const unsigned short* brow2 = buf + (y % brows)*bufstep + 2;
 //        static const float scale[] = { 0.f, 0.5f, 0.25f, 0.1666666666667f, 0.125f, 0.1f, 0.08333333333f, 0.0714286f, 0.0625f };
 //        srow = bayer + y*bstep + 2;
 //        bool greenCell = greenCell0;
-//        
+//
 //        i = 2;
-//#if CV_SSE2        
+//#if CV_SSE2
 //        int limit = !haveSSE ? N-2 : greenCell ? std::min(3, N-2) : 2;
 //#else
 //        int limit = N - 2;
 //#endif
-//        
+//
 //        do
 //        {
 //            for( ; i < limit; i++, srow++, brow0++, brow1++, brow2++, dstrow += 3 )
@@ -2220,18 +2220,18 @@ namespace cv { namespace gpu { namespace impl
 //                int minGrad = std::min(std::min(std::min(gradN, gradS), gradW), gradE);
 //                int maxGrad = std::max(std::max(std::max(gradN, gradS), gradW), gradE);
 //                int R, G, B;
-//                
+//
 //                if( !greenCell )
 //                {
 //                    int gradNE = brow0[N4+1] + brow1[N4];
 //                    int gradSW = brow1[N4] + brow2[N4-1];
 //                    int gradNW = brow0[N5-1] + brow1[N5];
 //                    int gradSE = brow1[N5] + brow2[N5+1];
-//                    
+//
 //                    minGrad = std::min(std::min(std::min(std::min(minGrad, gradNE), gradSW), gradNW), gradSE);
 //                    maxGrad = std::max(std::max(std::max(std::max(maxGrad, gradNE), gradSW), gradNW), gradSE);
 //                    int T = minGrad + maxGrad/2;
-//                    
+//
 //                    int Rs = 0, Gs = 0, Bs = 0, ng = 0;
 //                    if( gradN < T )
 //                    {
@@ -2291,7 +2291,7 @@ namespace cv { namespace gpu { namespace impl
 //                    }
 //                    R = srow[0];
 //                    G = R + cvRound((Gs - Rs)*scale[ng]);
-//                    B = R + cvRound((Bs - Rs)*scale[ng]); 
+//                    B = R + cvRound((Bs - Rs)*scale[ng]);
 //                }
 //                else
 //                {
@@ -2299,11 +2299,11 @@ namespace cv { namespace gpu { namespace impl
 //                    int gradSW = brow1[N2] + brow1[N2-1] + brow2[N2] + brow2[N2-1];
 //                    int gradNW = brow0[N3] + brow0[N3-1] + brow1[N3] + brow1[N3-1];
 //                    int gradSE = brow1[N3] + brow1[N3+1] + brow2[N3] + brow2[N3+1];
-//                    
+//
 //                    minGrad = std::min(std::min(std::min(std::min(minGrad, gradNE), gradSW), gradNW), gradSE);
 //                    maxGrad = std::max(std::max(std::max(std::max(maxGrad, gradNE), gradSW), gradNW), gradSE);
 //                    int T = minGrad + maxGrad/2;
-//                    
+//
 //                    int Rs = 0, Gs = 0, Bs = 0, ng = 0;
 //                    if( gradN < T )
 //                    {
@@ -2370,21 +2370,20 @@ namespace cv { namespace gpu { namespace impl
 //                dstrow[blueIdx^2] = CV_CAST_8U(R);
 //                greenCell = !greenCell;
 //            }
-//            
+//
 //#if CV_SSE2
 //            if( !haveSSE )
 //                break;
-//            
+//
 //            __m128i emask = _mm_set1_epi32(0x0000ffff),
 //            omask = _mm_set1_epi32(0xffff0000),
 //            z = _mm_setzero_si128();
 //            __m128 _0_5 = _mm_set1_ps(0.5f);
-//            
-//            #define _mm_merge_epi16(a, b) \
-//                _mm_or_si128(_mm_and_si128(a, emask), _mm_and_si128(b, omask))
-//            #define _mm_cvtloepi16_ps(a) _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(a,a), 16))
-//            #define _mm_cvthiepi16_ps(a) _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(a,a), 16))
-//            
+//
+//            #define _mm_merge_epi16(a, b) _mm_or_si128(_mm_and_si128(a, emask), _mm_and_si128(b, omask))
+//            #define _mm_cvtloepi16_ps(a)  _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(a,a), 16))
+//            #define _mm_cvthiepi16_ps(a)  _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(a,a), 16))
+//
 //            // process 8 pixels at once
 //            for( ; i <= N - 10; i += 8, srow += 8, brow0 += 8, brow1 += 8, brow2 += 8 )
 //            {
@@ -2397,13 +2396,13 @@ namespace cv { namespace gpu { namespace impl
 //                                       _mm_loadu_si128((__m128i*)(brow1+N)));
 //                gradE = _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow1+N+1)),
 //                                       _mm_loadu_si128((__m128i*)(brow1+N)));
-//                
+//
 //                __m128i minGrad, maxGrad, T;
 //                minGrad = _mm_min_epi16(_mm_min_epi16(_mm_min_epi16(gradN, gradS), gradW), gradE);
 //                maxGrad = _mm_max_epi16(_mm_max_epi16(_mm_max_epi16(gradN, gradS), gradW), gradE);
-//                
+//
 //                __m128i grad0, grad1;
-//                
+//
 //                grad0 = _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow0+N4+1)),
 //                                       _mm_loadu_si128((__m128i*)(brow1+N4)));
 //                grad1 = _mm_adds_epu16(_mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow0+N2)),
@@ -2411,7 +2410,7 @@ namespace cv { namespace gpu { namespace impl
 //                                       _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow1+N2)),
 //                                                      _mm_loadu_si128((__m128i*)(brow1+N2+1))));
 //                gradNE = _mm_srli_epi16(_mm_merge_epi16(grad0, grad1), 1);
-//                
+//
 //                grad0 = _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow2+N4-1)),
 //                                       _mm_loadu_si128((__m128i*)(brow1+N4)));
 //                grad1 = _mm_adds_epu16(_mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow2+N2)),
@@ -2419,10 +2418,10 @@ namespace cv { namespace gpu { namespace impl
 //                                       _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow1+N2)),
 //                                                      _mm_loadu_si128((__m128i*)(brow1+N2-1))));
 //                gradSW = _mm_srli_epi16(_mm_merge_epi16(grad0, grad1), 1);
-//                
+//
 //                minGrad = _mm_min_epi16(_mm_min_epi16(minGrad, gradNE), gradSW);
 //                maxGrad = _mm_max_epi16(_mm_max_epi16(maxGrad, gradNE), gradSW);
-//                
+//
 //                grad0 = _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow0+N5-1)),
 //                                       _mm_loadu_si128((__m128i*)(brow1+N5)));
 //                grad1 = _mm_adds_epu16(_mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow0+N3)),
@@ -2430,7 +2429,7 @@ namespace cv { namespace gpu { namespace impl
 //                                       _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow1+N3)),
 //                                                      _mm_loadu_si128((__m128i*)(brow1+N3-1))));
 //                gradNW = _mm_srli_epi16(_mm_merge_epi16(grad0, grad1), 1);
-//                
+//
 //                grad0 = _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow2+N5+1)),
 //                                       _mm_loadu_si128((__m128i*)(brow1+N5)));
 //                grad1 = _mm_adds_epu16(_mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow2+N3)),
@@ -2438,18 +2437,18 @@ namespace cv { namespace gpu { namespace impl
 //                                       _mm_adds_epu16(_mm_loadu_si128((__m128i*)(brow1+N3)),
 //                                                      _mm_loadu_si128((__m128i*)(brow1+N3+1))));
 //                gradSE = _mm_srli_epi16(_mm_merge_epi16(grad0, grad1), 1);
-//                
+//
 //                minGrad = _mm_min_epi16(_mm_min_epi16(minGrad, gradNW), gradSE);
 //                maxGrad = _mm_max_epi16(_mm_max_epi16(maxGrad, gradNW), gradSE);
-//                
+//
 //                T = _mm_add_epi16(_mm_srli_epi16(maxGrad, 1), minGrad);
 //                __m128i RGs = z, GRs = z, Bs = z, ng = z, mask;
-//                
+//
 //                __m128i t0, t1, x0, x1, x2, x3, x4, x5, x6, x7, x8,
 //                x9, x10, x11, x12, x13, x14, x15, x16;
-//                
+//
 //                x0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)srow), z);
-//                
+//
 //                x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep-1)), z);
 //                x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep*2-1)), z);
 //                x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep)), z);
@@ -2466,147 +2465,147 @@ namespace cv { namespace gpu { namespace impl
 //                x14 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep-2)), z);
 //                x15 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-1)), z);
 //                x16 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep-2)), z);
-//                
+//
 //                // gradN
 //                mask = _mm_cmpgt_epi16(T, gradN);
 //                ng = _mm_sub_epi16(ng, mask);
-//                
+//
 //                t0 = _mm_slli_epi16(x3, 1);
 //                t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep*2)), z), x0);
-//                
+//
 //                RGs = _mm_adds_epu16(RGs, _mm_and_si128(t1, mask));
 //                GRs = _mm_adds_epu16(GRs, _mm_and_si128(_mm_merge_epi16(t0, _mm_adds_epu16(x2,x4)), mask));
 //                Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epu16(x1,x5), t0), mask));
-//                
+//
 //                // gradNE
 //                mask = _mm_cmpgt_epi16(T, gradNE);
 //                ng = _mm_sub_epi16(ng, mask);
-//                
+//
 //                t0 = _mm_slli_epi16(x5, 1);
 //                t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep*2+2)), z), x0);
-//                
+//
 //                RGs = _mm_adds_epu16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));
 //                GRs = _mm_adds_epu16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow0+N6+1)),
 //                                                                        _mm_adds_epu16(x4,x7)), mask));
 //                Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(t0,_mm_adds_epu16(x3,x6)), mask));
-//                
+//
 //                // gradE
 //                mask = _mm_cmpgt_epi16(T, gradE);
 //                ng = _mm_sub_epi16(ng, mask);
-//                
+//
 //                t0 = _mm_slli_epi16(x7, 1);
 //                t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+2)), z), x0);
-//                
+//
 //                RGs = _mm_adds_epu16(RGs, _mm_and_si128(t1, mask));
 //                GRs = _mm_adds_epu16(GRs, _mm_and_si128(t0, mask));
 //                Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epu16(x5,x9),
 //                                                                      _mm_adds_epu16(x6,x8)), mask));
-//                
+//
 //                // gradSE
 //                mask = _mm_cmpgt_epi16(T, gradSE);
 //                ng = _mm_sub_epi16(ng, mask);
-//                
+//
 //                t0 = _mm_slli_epi16(x9, 1);
 //                t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep*2+2)), z), x0);
-//                
+//
 //                RGs = _mm_adds_epu16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));
 //                GRs = _mm_adds_epu16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow2+N6+1)),
 //                                                                        _mm_adds_epu16(x7,x10)), mask));
 //                Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(t0, _mm_adds_epu16(x8,x11)), mask));
-//                
+//
 //                // gradS
 //                mask = _mm_cmpgt_epi16(T, gradS);
 //                ng = _mm_sub_epi16(ng, mask);
-//                
+//
 //                t0 = _mm_slli_epi16(x11, 1);
 //                t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep*2)), z), x0);
-//                
+//
 //                RGs = _mm_adds_epu16(RGs, _mm_and_si128(t1, mask));
 //                GRs = _mm_adds_epu16(GRs, _mm_and_si128(_mm_merge_epi16(t0, _mm_adds_epu16(x10,x12)), mask));
 //                Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epu16(x9,x13), t0), mask));
-//                
+//
 //                // gradSW
 //                mask = _mm_cmpgt_epi16(T, gradSW);
 //                ng = _mm_sub_epi16(ng, mask);
-//                
+//
 //                t0 = _mm_slli_epi16(x13, 1);
 //                t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow+bstep*2-2)), z), x0);
-//                
+//
 //                RGs = _mm_adds_epu16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));
 //                GRs = _mm_adds_epu16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow2+N6-1)),
 //                                                                        _mm_adds_epu16(x12,x15)), mask));
 //                Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(t0,_mm_adds_epu16(x11,x14)), mask));
-//                
+//
 //                // gradW
 //                mask = _mm_cmpgt_epi16(T, gradW);
 //                ng = _mm_sub_epi16(ng, mask);
-//                
+//
 //                t0 = _mm_slli_epi16(x15, 1);
 //                t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-2)), z), x0);
-//                
+//
 //                RGs = _mm_adds_epu16(RGs, _mm_and_si128(t1, mask));
 //                GRs = _mm_adds_epu16(GRs, _mm_and_si128(t0, mask));
 //                Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(_mm_adds_epu16(x1,x13),
 //                                                                      _mm_adds_epu16(x14,x16)), mask));
-//                
+//
 //                // gradNW
 //                mask = _mm_cmpgt_epi16(T, gradNW);
 //                ng = _mm_sub_epi16(ng, mask);
-//                
+//
 //                __m128 ngf0, ngf1;
 //                ngf0 = _mm_div_ps(_0_5, _mm_cvtloepi16_ps(ng));
 //                ngf1 = _mm_div_ps(_0_5, _mm_cvthiepi16_ps(ng));
-//                
+//
 //                t0 = _mm_slli_epi16(x1, 1);
 //                t1 = _mm_adds_epu16(_mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow-bstep*2-2)), z), x0);
-//                
+//
 //                RGs = _mm_adds_epu16(RGs, _mm_and_si128(_mm_merge_epi16(t1, t0), mask));
 //                GRs = _mm_adds_epu16(GRs, _mm_and_si128(_mm_merge_epi16(_mm_loadu_si128((__m128i*)(brow0+N6-1)),
 //                                                                        _mm_adds_epu16(x2,x15)), mask));
 //                Bs = _mm_adds_epu16(Bs, _mm_and_si128(_mm_merge_epi16(t0,_mm_adds_epu16(x3,x16)), mask));
-//                
+//
 //                // now interpolate r, g & b
 //                t0 = _mm_sub_epi16(GRs, RGs);
 //                t1 = _mm_sub_epi16(Bs, RGs);
-//                
+//
 //                t0 = _mm_add_epi16(x0, _mm_packs_epi32(
 //                                                       _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtloepi16_ps(t0), ngf0)),
 //                                                       _mm_cvtps_epi32(_mm_mul_ps(_mm_cvthiepi16_ps(t0), ngf1))));
-//                
+//
 //                t1 = _mm_add_epi16(x0, _mm_packs_epi32(
 //                                                       _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtloepi16_ps(t1), ngf0)),
 //                                                       _mm_cvtps_epi32(_mm_mul_ps(_mm_cvthiepi16_ps(t1), ngf1))));
-//                
+//
 //                x1 = _mm_merge_epi16(x0, t0);
 //                x2 = _mm_merge_epi16(t0, x0);
-//                
+//
 //                uchar R[8], G[8], B[8];
-//                
+//
 //                _mm_storel_epi64(blueIdx ? (__m128i*)B : (__m128i*)R, _mm_packus_epi16(x1, z));
 //                _mm_storel_epi64((__m128i*)G, _mm_packus_epi16(x2, z));
 //                _mm_storel_epi64(blueIdx ? (__m128i*)R : (__m128i*)B, _mm_packus_epi16(t1, z));
-//                
+//
 //                for( int j = 0; j < 8; j++, dstrow += 3 )
 //                {
 //                    dstrow[0] = B[j]; dstrow[1] = G[j]; dstrow[2] = R[j];
 //                }
 //            }
 //#endif
-//            
+//
 //            limit = N - 2;
 //        }
 //        while( i < N - 2 );
-//        
+//
 //        for( i = 0; i < 6; i++ )
 //        {
 //            dst[dststep*y + 5 - i] = dst[dststep*y + 8 - i];
 //            dst[dststep*y + (N - 2)*3 + i] = dst[dststep*y + (N - 3)*3 + i];
 //        }
-//        
+//
 //        greenCell0 = !greenCell0;
 //        blueIdx ^= 2;
 //    }
-//    
+//
 //    for( i = 0; i < size.width*3; i++ )
 //    {
 //        dst[i] = dst[i + dststep] = dst[i + dststep*2];