Commit 54fa600b authored by Vladislav Vinogradov

update docs

minor fixes and refactoring of GPU module
parent 7d42dbdd
\section{Feature Detection and Description}
\cvclass{gpu::SURFParams\_GPU}\label{class.gpu.SURFParams}
Various SURF algorithm parameters.
\begin{lstlisting}
struct SURFParams_GPU
{
    SURFParams_GPU() : threshold(0.1f), nOctaves(4), nIntervals(4),
        initialScale(2.f), l1(3.f/1.5f), l2(5.f/1.5f), l3(3.f/1.5f),
        l4(1.f/1.5f), edgeScale(0.81f), initialStep(1), extended(true),
        featuresRatio(0.01f) {}

    //! The interest operator threshold
    float threshold;
    //! The number of octaves to process
    int nOctaves;
    //! The number of intervals in each octave
    int nIntervals;
    //! The scale associated with the first interval of the first octave
    float initialScale;

    //! mask parameter l_1
    float l1;
    //! mask parameter l_2
    float l2;
    //! mask parameter l_3
    float l3;
    //! mask parameter l_4
    float l4;

    //! The amount to scale the edge rejection mask
    float edgeScale;
    //! The initial sampling step in pixels
    int initialStep;
    //! True to compute extended (128-element) descriptors,
    //! false for 64-element descriptors
    bool extended;
    //! max features = featuresRatio * img.size().area()
    float featuresRatio;
};
\end{lstlisting}
In contrast to \hyperref[cv.class.SURF]{cv::SURF}, \texttt{SURF\_GPU} works with float sources (with values in the range [0..1]). It performs the conversion after the integral has been computed, by dividing the result by 255. Please take this into consideration when changing parameters such as the Hessian threshold.
The current \texttt{SURF\_GPU} implementation supports the number of intervals in each octave in the range [3..21].
See also: \hyperref[class.gpu.SURF]{cv::gpu::SURF\_GPU}.
\cvclass{gpu::SURF\_GPU}\label{class.gpu.SURF}
Class for extracting Speeded Up Robust Features from an image.
\begin{lstlisting}
@@ -62,7 +110,7 @@ The class \texttt{SURF\_GPU} can store results to GPU and CPU memory and provide
The class \texttt{SURF\_GPU} uses some buffers and provides access to them. All buffers can be safely released between function calls.
See also: \hyperref[cv.class.SURF]{cv::SURF}.
See also: \hyperref[cv.class.SURF]{cv::SURF}, \hyperref[class.gpu.SURFParams]{cv::gpu::SURFParams\_GPU}.
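For illustration, a minimal usage sketch follows (error handling omitted; it assumes \texttt{SURF\_GPU} exposes the \texttt{SURFParams\_GPU} fields directly, and the input file name is illustrative):
\begin{lstlisting}
cv::Mat img = cv::imread("scene.png", 0);   // 8-bit grayscale input
cv::gpu::GpuMat d_img(img);                 // upload to the device

cv::gpu::SURF_GPU surf;
surf.threshold = 0.1f;                      // interest operator threshold

cv::gpu::GpuMat d_keypoints, d_descriptors;
surf(d_img, cv::gpu::GpuMat(), d_keypoints, d_descriptors);

std::vector<cv::KeyPoint> keypoints;
surf.downloadKeypoints(d_keypoints, keypoints);
\end{lstlisting}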
\cvclass{gpu::BruteForceMatcher\_GPU}
@@ -269,7 +317,7 @@ void radiusMatch(const GpuMat\& queryDescs, \par const GpuMat\& trainDescs, \par
void radiusMatch(const GpuMat\& queryDescs, \par std::vector< std::vector<DMatch> >\& matches, \par float maxDistance, \par const std::vector<GpuMat>\& masks = std::vector<GpuMat>(), \par bool compactResult = false);
}
This function works only on devices with Compute Capability $>=$ 1.1.
\textbf{Please note:} This function works only on devices with Compute Capability $>=$ 1.1.
See also: \cvCppCross{DescriptorMatcher::radiusMatch}.
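A sketch of guarding the call at run time, assuming the \texttt{DeviceInfo}/\texttt{FeatureSet} API of the OpenCV 2.x gpu module (\texttt{GLOBAL\_ATOMICS} corresponds to Compute Capability 1.1) and an existing \texttt{matcher} with uploaded descriptors:
\begin{lstlisting}
cv::gpu::DeviceInfo info;                    // current device
if (info.supports(cv::gpu::GLOBAL_ATOMICS))  // CC >= 1.1
{
    std::vector< std::vector<cv::DMatch> > matches;
    matcher.radiusMatch(queryDescs, trainDescs, matches, maxDistance);
}
\end{lstlisting}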
@@ -293,7 +341,8 @@ void radiusMatch(const GpuMat\& queryDescs, \par const GpuMat\& trainDescs, \par
In contrast to \hyperref[cppfunc.gpu.BruteForceMatcher.radiusMatch]{cv::gpu::BruteForceMatcher\_GPU::radiusMatch}, the results are not sorted in order of increasing distance.
This function works only on devices with Compute Capability $>=$ 1.1.
\textbf{Please note:} This function works only on devices with Compute Capability $>=$ 1.1.
\cvfunc{cv::gpu::BruteForceMatcher\_GPU::radiusMatchDownload}\label{cppfunc.gpu.BruteForceMatcher.radiusMatchDownload}
Downloads the \texttt{trainIdx}, \texttt{nMatches} and \texttt{distance} matrices obtained via \hyperref[cppfunc.gpu.BruteForceMatcher.radiusMatchSingle]{radiusMatch} to a CPU vector of \hyperref[cv.class.DMatch]{cv::DMatch}. If \texttt{compactResult} is true, the \texttt{matches} vector will not contain matches for fully masked-out query descriptors.
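For illustration, the two-step flow might look as follows (a sketch only; \texttt{queryDescs}, \texttt{trainDescs} and \texttt{maxDistance} are assumed to exist, and the overloads follow the declarations cited in this section):
\begin{lstlisting}
cv::gpu::BruteForceMatcher_GPU< cv::L2<float> > matcher;

cv::gpu::GpuMat trainIdx, nMatches, distance;
matcher.radiusMatch(queryDescs, trainDescs, trainIdx, nMatches,
                    distance, maxDistance);

std::vector< std::vector<cv::DMatch> > matches;
matcher.radiusMatchDownload(trainIdx, nMatches, distance, matches);
\end{lstlisting}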
......
@@ -17,6 +17,8 @@ Performs mean-shift filtering for each point of the source image. It maps each p
\cvarg{criteria}{Termination criteria. See \hyperref[TermCriteria]{cv::TermCriteria}.}
\end{description}
\textbf{Please note:} This function works only on devices with Compute Capability $>=$ 1.2.
\cvCppFunc{gpu::meanShiftProc}
Performs the mean-shift procedure and stores information about the processed points (i.e. their colors and positions) in two images.
@@ -35,6 +37,8 @@ Performs mean-shift procedure and stores information about processed points (i.e
\cvarg{criteria}{Termination criteria. See \hyperref[TermCriteria]{cv::TermCriteria}.}
\end{description}
\textbf{Please note:} This function works only on devices with Compute Capability $>=$ 1.2.
See also: \cvCppCross{gpu::meanShiftFiltering}.
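A minimal sketch of a mean-shift call (these functions require an 8-bit, 4-channel input; the window radii shown are illustrative):
\begin{lstlisting}
cv::gpu::GpuMat src(cv::imread("scene.png", 1));  // upload BGR image
cv::gpu::GpuMat rgba, dst;
cv::gpu::cvtColor(src, rgba, CV_BGR2BGRA);        // make it 8UC4

cv::gpu::meanShiftFiltering(rgba, dst, 20, 20,
    cv::TermCriteria(cv::TermCriteria::MAX_ITER + cv::TermCriteria::EPS,
                     5, 1));
\end{lstlisting}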
@@ -55,6 +59,8 @@ Performs mean-shift segmentation of the source image and eliminates small segmen
\cvarg{criteria}{Termination criteria. See \hyperref[TermCriteria]{cv::TermCriteria}.}
\end{description}
\textbf{Please note:} This function works only on devices with Compute Capability $>=$ 1.2.
\cvCppFunc{gpu::integral}
Computes the integral image and the squared integral image.
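A short sketch (the output depths shown follow the gpu module documentation of this period and are stated here as an assumption):
\begin{lstlisting}
cv::gpu::GpuMat src;         // CV_8UC1 source
cv::gpu::GpuMat sum, sqsum;  // sum: CV_32SC1, sqsum: CV_32FC1
cv::gpu::integral(src, sum, sqsum);
\end{lstlisting}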
@@ -319,7 +325,7 @@ double threshold(const GpuMat\& src, GpuMat\& dst, double thresh, \par double ma
}
\begin{description}
\cvarg{src}{Source array (single-channel, \texttt{CV\_64F} depth isn't supported).}
\cvarg{src}{Source array (single-channel).}
\cvarg{dst}{Destination array; will have the same size and the same type as \texttt{src}.}
\cvarg{thresh}{Threshold value.}
\cvarg{maxVal}{Maximum value to use with \texttt{THRESH\_BINARY} and \texttt{THRESH\_BINARY\_INV} thresholding types.}
......
No preview for this file type
@@ -582,10 +582,10 @@ namespace cv { namespace gpu { namespace bfmatcher
}
///////////////////////////////////////////////////////////////////////////////
// Match kernel chooser
// Match caller
template <typename Dist, typename T, typename Train, typename Mask>
void match_chooser(const DevMem2D_<T>& queryDescs, const Train& train,
void matchDispatcher(const DevMem2D_<T>& queryDescs, const Train& train,
const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
bool cc_12)
{
@@ -616,11 +616,11 @@ namespace cv { namespace gpu { namespace bfmatcher
if (mask.data)
{
SingleMask m(mask);
match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12);
matchDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12);
}
else
{
match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
matchDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
}
}
@@ -640,11 +640,11 @@ namespace cv { namespace gpu { namespace bfmatcher
if (mask.data)
{
SingleMask m(mask);
match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12);
matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12);
}
else
{
match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
}
}
@@ -664,11 +664,11 @@ namespace cv { namespace gpu { namespace bfmatcher
if (maskCollection.data)
{
MaskCollection mask(maskCollection.data);
match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12);
matchDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12);
}
else
{
match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
matchDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
}
}
@@ -688,11 +688,11 @@ namespace cv { namespace gpu { namespace bfmatcher
if (maskCollection.data)
{
MaskCollection mask(maskCollection.data);
match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12);
matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12);
}
else
{
match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
matchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
}
}
@@ -942,22 +942,35 @@ namespace cv { namespace gpu { namespace bfmatcher
///////////////////////////////////////////////////////////////////////////////
// knn match caller
template <typename Dist, typename T, typename Mask>
void calcDistanceDispatcher(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs,
const Mask& mask, const DevMem2Df& allDist)
{
calcDistance_caller<16, 16, Dist>(queryDescs, trainDescs, mask, allDist);
}
void findKnnMatchDispatcher(int knn, const DevMem2Di& trainIdx, const DevMem2Df& distance,
const DevMem2Df& allDist)
{
findKnnMatch_caller<256>(knn, trainIdx, distance, allDist);
}
template <typename T>
void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist)
{
if (mask.data)
{
calcDistance_caller<16, 16, L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
calcDistanceDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
SingleMask(mask), allDist);
}
else
{
calcDistance_caller<16, 16, L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
calcDistanceDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
WithOutMask(), allDist);
}
findKnnMatch_caller<256>(knn, trainIdx, distance, allDist);
findKnnMatchDispatcher(knn, trainIdx, distance, allDist);
}
template void knnMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
@@ -973,16 +986,16 @@ namespace cv { namespace gpu { namespace bfmatcher
{
if (mask.data)
{
calcDistance_caller<16, 16, L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
calcDistanceDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
SingleMask(mask), allDist);
}
else
{
calcDistance_caller<16, 16, L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
calcDistanceDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
WithOutMask(), allDist);
}
findKnnMatch_caller<256>(knn, trainIdx, distance, allDist);
findKnnMatchDispatcher(knn, trainIdx, distance, allDist);
}
template void knnMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
@@ -1061,7 +1074,16 @@ namespace cv { namespace gpu { namespace bfmatcher
}
///////////////////////////////////////////////////////////////////////////////
// Radius Match kernel chooser
// Radius Match caller
template <typename Dist, typename T, typename Mask>
void radiusMatchDispatcher(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs,
float maxDistance, const Mask& mask, const DevMem2Di& trainIdx, unsigned int* nMatches,
const DevMem2Df& distance)
{
radiusMatch_caller<16, 16, Dist>(queryDescs, trainDescs, maxDistance, mask,
trainIdx, nMatches, distance);
}
template <typename T>
void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
@@ -1069,12 +1091,12 @@ namespace cv { namespace gpu { namespace bfmatcher
{
if (mask.data)
{
radiusMatch_caller<16, 16, L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
radiusMatchDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
maxDistance, SingleMask(mask), trainIdx, nMatches, distance);
}
else
{
radiusMatch_caller<16, 16, L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
radiusMatchDispatcher<L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
maxDistance, WithOutMask(), trainIdx, nMatches, distance);
}
}
@@ -1092,12 +1114,12 @@ namespace cv { namespace gpu { namespace bfmatcher
{
if (mask.data)
{
radiusMatch_caller<16, 16, L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
radiusMatchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
maxDistance, SingleMask(mask), trainIdx, nMatches, distance);
}
else
{
radiusMatch_caller<16, 16, L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
radiusMatchDispatcher<L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
maxDistance, WithOutMask(), trainIdx, nMatches, distance);
}
}
......
@@ -44,6 +44,7 @@
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vecmath.hpp"
#include "opencv2/gpu/device/limits_gpu.hpp"
#include "opencv2/gpu/device/transform.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
@@ -94,20 +95,27 @@ namespace cv { namespace gpu { namespace color
return vec.w;
}
template <typename Cvt>
void callConvert(const DevMem2D& src, const DevMem2D& dst, const Cvt& cvt, cudaStream_t stream)
{
typedef typename Cvt::src_t src_t;
typedef typename Cvt::dst_t dst_t;
transform((DevMem2D_<src_t>)src, (DevMem2D_<dst_t>)dst, cvt, stream);
}
////////////////// Various 3/4-channel to 3/4-channel RGB transformations /////////////////
template <int SRCCN, int DSTCN, typename T>
__global__ void RGB2RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx)
template <typename T, int SRCCN, int DSTCN>
struct RGB2RGB
{
typedef typename TypeVec<T, SRCCN>::vec_t src_t;
typedef typename TypeVec<T, DSTCN>::vec_t dst_t;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
explicit RGB2RGB(int bidx) : bidx(bidx) {}
if (y < rows && x < cols)
__device__ dst_t operator()(const src_t& src) const
{
src_t src = *(const src_t*)(src_ + y * src_step + x * SRCCN * sizeof(T));
dst_t dst;
dst.x = (&src.x)[bidx];
@@ -115,25 +123,18 @@ namespace cv { namespace gpu { namespace color
dst.z = (&src.x)[bidx ^ 2];
setAlpha(dst, getAlpha<T>(src));
*(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst;
}
return dst;
}
private:
int bidx;
};
template <typename T, int SRCCN, int DSTCN>
void RGB2RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
RGB2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols, bidx);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
RGB2RGB<T, SRCCN, DSTCN> cvt(bidx);
callConvert(src, dst, cvt, stream);
}
void RGB2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, cudaStream_t stream)
@@ -174,110 +175,90 @@ namespace cv { namespace gpu { namespace color
/////////// Transforming 16-bit (565 or 555) RGB to/from 24/32-bit (888[8]) RGB //////////
template <int GREEN_BITS, int DSTCN> struct RGB5x52RGBConverter {};
template <int DSTCN> struct RGB5x52RGBConverter<5, DSTCN>
template <int GREEN_BITS> struct RGB5x52RGBConverter;
template <> struct RGB5x52RGBConverter<5>
{
typedef typename TypeVec<uchar, DSTCN>::vec_t dst_t;
static __device__ dst_t cvt(uint src, int bidx)
template <typename D>
static __device__ void cvt(uint src, D& dst, int bidx)
{
dst_t dst;
(&dst.x)[bidx] = (uchar)(src << 3);
dst.y = (uchar)((src >> 2) & ~7);
(&dst.x)[bidx ^ 2] = (uchar)((src >> 7) & ~7);
setAlpha(dst, (uchar)(src & 0x8000 ? 255 : 0));
return dst;
}
};
template <int DSTCN> struct RGB5x52RGBConverter<6, DSTCN>
template <> struct RGB5x52RGBConverter<6>
{
typedef typename TypeVec<uchar, DSTCN>::vec_t dst_t;
static __device__ dst_t cvt(uint src, int bidx)
template <typename D>
static __device__ void cvt(uint src, D& dst, int bidx)
{
dst_t dst;
(&dst.x)[bidx] = (uchar)(src << 3);
dst.y = (uchar)((src >> 3) & ~3);
(&dst.x)[bidx ^ 2] = (uchar)((src >> 8) & ~7);
setAlpha(dst, (uchar)(255));
return dst;
}
};
template <int GREEN_BITS, int DSTCN>
__global__ void RGB5x52RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx)
template <int GREEN_BITS, int DSTCN> struct RGB5x52RGB
{
typedef ushort src_t;
typedef typename TypeVec<uchar, DSTCN>::vec_t dst_t;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
explicit RGB5x52RGB(int bidx) : bidx(bidx) {}
if (y < rows && x < cols)
__device__ dst_t operator()(ushort src) const
{
uint src = *(const ushort*)(src_ + y * src_step + (x << 1));
*(dst_t*)(dst_ + y * dst_step + x * DSTCN) = RGB5x52RGBConverter<GREEN_BITS, DSTCN>::cvt(src, bidx);
}
dst_t dst;
RGB5x52RGBConverter<GREEN_BITS>::cvt((uint)src, dst, bidx);
return dst;
}
template <int SRCCN, int GREEN_BITS> struct RGB2RGB5x5Converter {};
template<int SRCCN> struct RGB2RGB5x5Converter<SRCCN, 6>
private:
int bidx;
};
template <int GREEN_BITS> struct RGB2RGB5x5Converter;
template<> struct RGB2RGB5x5Converter<6>
{
static __device__ ushort cvt(const uchar* src, int bidx)
template <typename T>
static __device__ ushort cvt(const T& src, int bidx)
{
return (ushort)((src[bidx] >> 3) | ((src[1] & ~3) << 3) | ((src[bidx^2] & ~7) << 8));
return (ushort)(((&src.x)[bidx] >> 3) | ((src.y & ~3) << 3) | (((&src.x)[bidx^2] & ~7) << 8));
}
};
template<> struct RGB2RGB5x5Converter<3, 5>
template<> struct RGB2RGB5x5Converter<5>
{
static __device__ ushort cvt(const uchar* src, int bidx)
static __device__ ushort cvt(const uchar3& src, int bidx)
{
return (ushort)((src[bidx] >> 3) | ((src[1] & ~7) << 2) | ((src[bidx^2] & ~7) << 7));
return (ushort)(((&src.x)[bidx] >> 3) | ((src.y & ~7) << 2) | (((&src.x)[bidx^2] & ~7) << 7));
}
};
template<> struct RGB2RGB5x5Converter<4, 5>
{
static __device__ ushort cvt(const uchar* src, int bidx)
static __device__ ushort cvt(const uchar4& src, int bidx)
{
return (ushort)((src[bidx] >> 3) | ((src[1] & ~7) << 2) | ((src[bidx^2] & ~7) << 7) | (src[3] ? 0x8000 : 0));
return (ushort)(((&src.x)[bidx] >> 3) | ((src.y & ~7) << 2) | (((&src.x)[bidx^2] & ~7) << 7) | (src.w ? 0x8000 : 0));
}
};
template<int SRCCN, int GREEN_BITS>
__global__ void RGB2RGB5x5(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx)
template<int SRCCN, int GREEN_BITS> struct RGB2RGB5x5
{
typedef typename TypeVec<uchar, SRCCN>::vec_t src_t;
typedef ushort dst_t;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
explicit RGB2RGB5x5(int bidx) : bidx(bidx) {}
if (y < rows && x < cols)
__device__ ushort operator()(const src_t& src)
{
src_t src = *(src_t*)(src_ + y * src_step + x * SRCCN);
*(ushort*)(dst_ + y * dst_step + (x << 1)) = RGB2RGB5x5Converter<SRCCN, GREEN_BITS>::cvt(&src.x, bidx);
}
return RGB2RGB5x5Converter<GREEN_BITS>::cvt(src, bidx);
}
private:
int bidx;
};
template <int GREEN_BITS, int DSTCN>
void RGB5x52RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
RGB5x52RGB<GREEN_BITS, DSTCN><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols, bidx);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
RGB5x52RGB<GREEN_BITS, DSTCN> cvt(bidx);
callConvert(src, dst, cvt, stream);
}
void RGB5x52RGB_gpu(const DevMem2D& src, int green_bits, const DevMem2D& dst, int dstcn, int bidx, cudaStream_t stream)
@@ -295,18 +276,8 @@ namespace cv { namespace gpu { namespace color
template <int SRCCN, int GREEN_BITS>
void RGB2RGB5x5_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
RGB2RGB5x5<SRCCN, GREEN_BITS><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols, bidx);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
RGB2RGB5x5<SRCCN, GREEN_BITS> cvt(bidx);
callConvert(src, dst, cvt, stream);
}
void RGB2RGB5x5_gpu(const DevMem2D& src, int srccn, const DevMem2D& dst, int green_bits, int bidx, cudaStream_t stream)
@@ -323,27 +294,23 @@ namespace cv { namespace gpu { namespace color
///////////////////////////////// Grayscale to Color ////////////////////////////////
template <int DSTCN, typename T>
__global__ void Gray2RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols)
template <int DSTCN, typename T> struct Gray2RGB
{
typedef T src_t;
typedef typename TypeVec<T, DSTCN>::vec_t dst_t;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (y < rows && x < cols)
__device__ dst_t operator()(const T& src) const
{
T src = *(const T*)(src_ + y * src_step + x * sizeof(T));
dst_t dst;
dst.x = src;
dst.y = src;
dst.z = src;
dst.z = dst.y = dst.x = src;
setAlpha(dst, ColorChannel<T>::max());
*(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst;
}
return dst;
}
};
template <int GREEN_BITS> struct Gray2RGB5x5Converter {};
template <int GREEN_BITS> struct Gray2RGB5x5Converter;
template<> struct Gray2RGB5x5Converter<6>
{
static __device__ ushort cvt(uint t)
@@ -360,35 +327,22 @@ namespace cv { namespace gpu { namespace color
}
};
template<int GREEN_BITS>
__global__ void Gray2RGB5x5(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols)
template<int GREEN_BITS> struct Gray2RGB5x5
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
typedef uchar src_t;
typedef ushort dst_t;
if (y < rows && x < cols)
__device__ ushort operator()(uchar src) const
{
uint src = src_[y * src_step + x];
*(ushort*)(dst_ + y * dst_step + (x << 1)) = Gray2RGB5x5Converter<GREEN_BITS>::cvt(src);
}
return Gray2RGB5x5Converter<GREEN_BITS>::cvt((uint)src);
}
};
template <typename T, int DSTCN>
void Gray2RGB_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
Gray2RGB<DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
Gray2RGB<DSTCN, T> cvt;
callConvert(src, dst, cvt, stream);
}
void Gray2RGB_gpu_8u(const DevMem2D& src, const DevMem2D& dst, int dstcn, cudaStream_t stream)
@@ -418,18 +372,8 @@ namespace cv { namespace gpu { namespace color
template <int GREEN_BITS>
void Gray2RGB5x5_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
Gray2RGB5x5<GREEN_BITS><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
Gray2RGB5x5<GREEN_BITS> cvt;
callConvert(src, dst, cvt, stream);
}
void Gray2RGB5x5_gpu(const DevMem2D& src, const DevMem2D& dst, int green_bits, cudaStream_t stream)
@@ -459,7 +403,7 @@ namespace cv { namespace gpu { namespace color
BLOCK_SIZE = 256
};
template <int GREEN_BITS> struct RGB5x52GrayConverter {};
template <int GREEN_BITS> struct RGB5x52GrayConverter;
template<> struct RGB5x52GrayConverter<6>
{
static __device__ uchar cvt(uint t)
@@ -475,30 +419,23 @@ namespace cv { namespace gpu { namespace color
}
};
template<int GREEN_BITS>
__global__ void RGB5x52Gray(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols)
template<int GREEN_BITS> struct RGB5x52Gray
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
typedef ushort src_t;
typedef uchar dst_t;
if (y < rows && x < cols)
__device__ uchar operator()(ushort src) const
{
uint src = *(ushort*)(src_ + y * src_step + (x << 1));
dst_[y * dst_step + x] = RGB5x52GrayConverter<GREEN_BITS>::cvt(src);
}
return RGB5x52GrayConverter<GREEN_BITS>::cvt((uint)src);
}
};
template <typename T> struct RGB2GrayConvertor
{
static __device__ T cvt(const T* src, int bidx)
template <typename T>
__device__ T RGB2GrayConvert(const T* src, int bidx)
{
return (T)CV_DESCALE((unsigned)(src[bidx] * B2Y + src[1] * G2Y + src[bidx^2] * R2Y), yuv_shift);
}
};
template <> struct RGB2GrayConvertor<float>
{
static __device__ float cvt(const float* src, int bidx)
__device__ float RGB2GrayConvert(const float* src, int bidx)
{
const float cr = 0.299f;
const float cg = 0.587f;
@@ -506,39 +443,28 @@ namespace cv { namespace gpu { namespace color
return src[bidx] * cb + src[1] * cg + src[bidx^2] * cr;
}
};
template <int SRCCN, typename T>
__global__ void RGB2Gray(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx)
template <int SRCCN, typename T> struct RGB2Gray
{
typedef typename TypeVec<T, SRCCN>::vec_t src_t;
typedef T dst_t;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
explicit RGB2Gray(int bidx) : bidx(bidx) {}
if (y < rows && x < cols)
__device__ T operator()(const src_t& src)
{
src_t src = *(const src_t*)(src_ + y * src_step + x * SRCCN * sizeof(T));
*(T*)(dst_ + y * dst_step + x * sizeof(T)) = RGB2GrayConvertor<T>::cvt(&src.x, bidx);
}
return RGB2GrayConvert(&src.x, bidx);
}
private:
int bidx;
};
template <typename T, int SRCCN>
void RGB2Gray_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
RGB2Gray<SRCCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols, bidx);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
RGB2Gray<SRCCN, T> cvt(bidx);
callConvert(src, dst, cvt, stream);
}
void RGB2Gray_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int bidx, cudaStream_t stream)
@@ -568,18 +494,8 @@ namespace cv { namespace gpu { namespace color
template <int GREEN_BITS>
void RGB5x52Gray_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
RGB5x52Gray<GREEN_BITS><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
RGB5x52Gray<GREEN_BITS> cvt;
callConvert(src, dst, cvt, stream);
}
void RGB5x52Gray_gpu(const DevMem2D& src, int green_bits, const DevMem2D& dst, cudaStream_t stream)
@@ -595,13 +511,11 @@ namespace cv { namespace gpu { namespace color
///////////////////////////////////// RGB <-> YCrCb //////////////////////////////////////
__constant__ float cYCrCbCoeffs_f[5];
__constant__ int cYCrCbCoeffs_i[5];
__constant__ float cYCrCbCoeffs_f[5];
template <typename T> struct RGB2YCrCbConverter
{
template <typename D>
static __device__ void cvt(const T* src, D& dst, int bidx)
template <typename T, typename D>
__device__ void RGB2YCrCbConvert(const T* src, D& dst, int bidx)
{
const int delta = ColorChannel<T>::half() * (1 << yuv_shift);
@@ -613,42 +527,53 @@ namespace cv { namespace gpu { namespace color
dst.y = saturate_cast<T>(Cr);
dst.z = saturate_cast<T>(Cb);
}
};
template<> struct RGB2YCrCbConverter<float>
{
template <typename D>
static __device__ void cvt(const float* src, D& dst, int bidx)
static __device__ void RGB2YCrCbConvert(const float* src, D& dst, int bidx)
{
dst.x = src[0] * cYCrCbCoeffs_f[0] + src[1] * cYCrCbCoeffs_f[1] + src[2] * cYCrCbCoeffs_f[2];
dst.y = (src[bidx^2] - dst.x) * cYCrCbCoeffs_f[3] + ColorChannel<float>::half();
dst.z = (src[bidx] - dst.x) * cYCrCbCoeffs_f[4] + ColorChannel<float>::half();
}
template<typename T> struct RGB2YCrCbBase
{
typedef int coeff_t;
explicit RGB2YCrCbBase(const coeff_t coeffs[5])
{
cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_i, coeffs, 5 * sizeof(int)) );
}
};
template<> struct RGB2YCrCbBase<float>
{
typedef float coeff_t;
template <int SRCCN, int DSTCN, typename T>
__global__ void RGB2YCrCb(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx)
explicit RGB2YCrCbBase(const coeff_t coeffs[5])
{
cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_f, coeffs, 5 * sizeof(float)) );
}
};
template <int SRCCN, int DSTCN, typename T> struct RGB2YCrCb : RGB2YCrCbBase<T>
{
typedef typename RGB2YCrCbBase<T>::coeff_t coeff_t;
typedef typename TypeVec<T, SRCCN>::vec_t src_t;
typedef typename TypeVec<T, DSTCN>::vec_t dst_t;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
RGB2YCrCb(int bidx, const coeff_t coeffs[5]) : RGB2YCrCbBase<T>(coeffs), bidx(bidx) {}
if (y < rows && x < cols)
__device__ dst_t operator()(const src_t& src) const
{
src_t src = *(const src_t*)(src_ + y * src_step + x * SRCCN * sizeof(T));
dst_t dst;
RGB2YCrCbConverter<T>::cvt(&src.x, dst, bidx);
*(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst;
}
RGB2YCrCbConvert(&src.x, dst, bidx);
return dst;
}
template <typename D> struct YCrCb2RGBConvertor
{
template <typename T>
static __device__ void cvt(const T& src, D* dst, int bidx)
private:
int bidx;
};
template <typename T, typename D>
__device__ void YCrCb2RGBConvert(const T& src, D* dst, int bidx)
{
const int b = src.x + CV_DESCALE((src.z - ColorChannel<D>::half()) * cYCrCbCoeffs_i[3], yuv_shift);
const int g = src.x + CV_DESCALE((src.z - ColorChannel<D>::half()) * cYCrCbCoeffs_i[2] + (src.y - ColorChannel<D>::half()) * cYCrCbCoeffs_i[1], yuv_shift);
@@ -658,372 +583,374 @@ namespace cv { namespace gpu { namespace color
dst[1] = saturate_cast<D>(g);
dst[bidx^2] = saturate_cast<D>(r);
}
};
template <> struct YCrCb2RGBConvertor<float>
{
template <typename T>
static __device__ void cvt(const T& src, float* dst, int bidx)
__device__ void YCrCb2RGBConvert(const T& src, float* dst, int bidx)
{
dst[bidx] = src.x + (src.z - ColorChannel<float>::half()) * cYCrCbCoeffs_f[3];
dst[1] = src.x + (src.z - ColorChannel<float>::half()) * cYCrCbCoeffs_f[2] + (src.y - ColorChannel<float>::half()) * cYCrCbCoeffs_f[1];
dst[bidx^2] = src.x + (src.y - ColorChannel<float>::half()) * cYCrCbCoeffs_f[0];
}
template<typename T> struct YCrCb2RGBBase
{
typedef int coeff_t;
explicit YCrCb2RGBBase(const coeff_t coeffs[4])
{
cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_i, coeffs, 4 * sizeof(int)) );
}
};
template<> struct YCrCb2RGBBase<float>
{
typedef float coeff_t;
template <int SRCCN, int DSTCN, typename T>
__global__ void YCrCb2RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx)
explicit YCrCb2RGBBase(const coeff_t coeffs[4])
{
cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_f, coeffs, 4 * sizeof(float)) );
}
};
template <int SRCCN, int DSTCN, typename T> struct YCrCb2RGB : YCrCb2RGBBase<T>
{
typedef typename YCrCb2RGBBase<T>::coeff_t coeff_t;
typedef typename TypeVec<T, SRCCN>::vec_t src_t;
typedef typename TypeVec<T, DSTCN>::vec_t dst_t;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
YCrCb2RGB(int bidx, const coeff_t coeffs[4]) : YCrCb2RGBBase<T>(coeffs), bidx(bidx) {}
if (y < rows && x < cols)
__device__ dst_t operator()(const src_t& src) const
{
src_t src = *(const src_t*)(src_ + y * src_step + x * SRCCN * sizeof(T));
dst_t dst;
YCrCb2RGBConvertor<T>::cvt(src, &dst.x, bidx);
YCrCb2RGBConvert(src, &dst.x, bidx);
setAlpha(dst, ColorChannel<T>::max());
*(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst;
}
return dst;
}
private:
int bidx;
};
template <typename T, int SRCCN, int DSTCN>
void RGB2YCrCb_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream)
void RGB2YCrCb_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, const void* coeffs, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
RGB2YCrCb<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols, bidx);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
typedef typename RGB2YCrCb<SRCCN, DSTCN, T>::coeff_t coeff_t;
RGB2YCrCb<SRCCN, DSTCN, T> cvt(bidx, (const coeff_t*)coeffs);
callConvert(src, dst, cvt, stream);
}
void RGB2YCrCb_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream)
{
typedef void (*RGB2YCrCb_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream);
typedef void (*RGB2YCrCb_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, const void* coeffs, cudaStream_t stream);
static const RGB2YCrCb_caller_t RGB2YCrCb_callers[2][2] =
{
{RGB2YCrCb_caller<uchar, 3, 3>, RGB2YCrCb_caller<uchar, 3, 4>},
{RGB2YCrCb_caller<uchar, 4, 3>, RGB2YCrCb_caller<uchar, 4, 4>}
};
cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_i, coeffs, 5 * sizeof(int)) );
RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, stream);
RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, coeffs, stream);
}
void RGB2YCrCb_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream)
{
typedef void (*RGB2YCrCb_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream);
typedef void (*RGB2YCrCb_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, const void* coeffs, cudaStream_t stream);
static const RGB2YCrCb_caller_t RGB2YCrCb_callers[2][2] =
{
{RGB2YCrCb_caller<ushort, 3, 3>, RGB2YCrCb_caller<ushort, 3, 4>},
{RGB2YCrCb_caller<ushort, 4, 3>, RGB2YCrCb_caller<ushort, 4, 4>}
};
cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_i, coeffs, 5 * sizeof(int)) );
RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, stream);
RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, coeffs, stream);
}
void RGB2YCrCb_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream)
{
typedef void (*RGB2YCrCb_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream);
typedef void (*RGB2YCrCb_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, const void* coeffs, cudaStream_t stream);
static const RGB2YCrCb_caller_t RGB2YCrCb_callers[2][2] =
{
{RGB2YCrCb_caller<float, 3, 3>, RGB2YCrCb_caller<float, 3, 4>},
{RGB2YCrCb_caller<float, 4, 3>, RGB2YCrCb_caller<float, 4, 4>}
};
cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_f, coeffs, 5 * sizeof(float)) );
RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, stream);
RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, coeffs, stream);
}
template <typename T, int SRCCN, int DSTCN>
void YCrCb2RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream)
void YCrCb2RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, const void* coeffs, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
YCrCb2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols, bidx);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
typedef typename YCrCb2RGB<SRCCN, DSTCN, T>::coeff_t coeff_t;
YCrCb2RGB<SRCCN, DSTCN, T> cvt(bidx, (const coeff_t*)coeffs);
callConvert(src, dst, cvt, stream);
}
void YCrCb2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream)
{
typedef void (*YCrCb2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream);
typedef void (*YCrCb2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, const void* coeffs, cudaStream_t stream);
static const YCrCb2RGB_caller_t YCrCb2RGB_callers[2][2] =
{
{YCrCb2RGB_caller<uchar, 3, 3>, YCrCb2RGB_caller<uchar, 3, 4>},
{YCrCb2RGB_caller<uchar, 4, 3>, YCrCb2RGB_caller<uchar, 4, 4>}
};
cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_i, coeffs, 4 * sizeof(int)) );
YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream);
YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, coeffs, stream);
}
void YCrCb2RGB_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream)
{
typedef void (*YCrCb2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream);
typedef void (*YCrCb2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, const void* coeffs, cudaStream_t stream);
static const YCrCb2RGB_caller_t YCrCb2RGB_callers[2][2] =
{
{YCrCb2RGB_caller<ushort, 3, 3>, YCrCb2RGB_caller<ushort, 3, 4>},
{YCrCb2RGB_caller<ushort, 4, 3>, YCrCb2RGB_caller<ushort, 4, 4>}
};
cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_i, coeffs, 4 * sizeof(int)) );
YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream);
YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, coeffs, stream);
}
void YCrCb2RGB_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, const void* coeffs, cudaStream_t stream)
{
typedef void (*YCrCb2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream);
typedef void (*YCrCb2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, int bidx, const void* coeffs, cudaStream_t stream);
static const YCrCb2RGB_caller_t YCrCb2RGB_callers[2][2] =
{
{YCrCb2RGB_caller<float, 3, 3>, YCrCb2RGB_caller<float, 3, 4>},
{YCrCb2RGB_caller<float, 4, 3>, YCrCb2RGB_caller<float, 4, 4>}
};
cudaSafeCall( cudaMemcpyToSymbol(cYCrCbCoeffs_f, coeffs, 4 * sizeof(float)) );
YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream);
YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, coeffs, stream);
}
////////////////////////////////////// RGB <-> XYZ ///////////////////////////////////////
__constant__ float cXYZ_D65f[9];
__constant__ int cXYZ_D65i[9];
__constant__ float cXYZ_D65f[9];
template <typename T> struct RGB2XYZConvertor
{
template <typename D>
static __device__ void cvt(const T* src, D& dst)
template <typename T, typename D>
__device__ void RGB2XYZConvert(const T* src, D& dst)
{
dst.x = saturate_cast<T>(CV_DESCALE(src[0] * cXYZ_D65i[0] + src[1] * cXYZ_D65i[1] + src[2] * cXYZ_D65i[2], xyz_shift));
dst.y = saturate_cast<T>(CV_DESCALE(src[0] * cXYZ_D65i[3] + src[1] * cXYZ_D65i[4] + src[2] * cXYZ_D65i[5], xyz_shift));
dst.z = saturate_cast<T>(CV_DESCALE(src[0] * cXYZ_D65i[6] + src[1] * cXYZ_D65i[7] + src[2] * cXYZ_D65i[8], xyz_shift));
}
};
template <> struct RGB2XYZConvertor<float>
{
template <typename D>
static __device__ void cvt(const float* src, D& dst)
__device__ void RGB2XYZConvert(const float* src, D& dst)
{
dst.x = src[0] * cXYZ_D65f[0] + src[1] * cXYZ_D65f[1] + src[2] * cXYZ_D65f[2];
dst.y = src[0] * cXYZ_D65f[3] + src[1] * cXYZ_D65f[4] + src[2] * cXYZ_D65f[5];
dst.z = src[0] * cXYZ_D65f[6] + src[1] * cXYZ_D65f[7] + src[2] * cXYZ_D65f[8];
}
template <typename T> struct RGB2XYZBase
{
typedef int coeff_t;
explicit RGB2XYZBase(const coeff_t coeffs[9])
{
cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65i, coeffs, 9 * sizeof(int)) );
}
};
template <> struct RGB2XYZBase<float>
{
typedef float coeff_t;
template <int SRCCN, int DSTCN, typename T>
__global__ void RGB2XYZ(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols)
explicit RGB2XYZBase(const coeff_t coeffs[9])
{
cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65f, coeffs, 9 * sizeof(float)) );
}
};
template <int SRCCN, int DSTCN, typename T> struct RGB2XYZ : RGB2XYZBase<T>
{
typedef typename RGB2XYZBase<T>::coeff_t coeff_t;
typedef typename TypeVec<T, SRCCN>::vec_t src_t;
typedef typename TypeVec<T, DSTCN>::vec_t dst_t;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
explicit RGB2XYZ(const coeff_t coeffs[9]) : RGB2XYZBase<T>(coeffs) {}
if (y < rows && x < cols)
__device__ dst_t operator()(const src_t& src) const
{
src_t src = *(const src_t*)(src_ + y * src_step + x * SRCCN * sizeof(T));
dst_t dst;
RGB2XYZConvertor<T>::cvt(&src.x, dst);
*(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst;
}
RGB2XYZConvert(&src.x, dst);
return dst;
}
};
template <typename D> struct XYZ2RGBConvertor
{
template <typename T>
static __device__ void cvt(const T& src, D* dst)
template <typename T, typename D>
__device__ void XYZ2RGBConvert(const T& src, D* dst)
{
dst[0] = saturate_cast<D>(CV_DESCALE(src.x * cXYZ_D65i[0] + src.y * cXYZ_D65i[1] + src.z * cXYZ_D65i[2], xyz_shift));
dst[1] = saturate_cast<D>(CV_DESCALE(src.x * cXYZ_D65i[3] + src.y * cXYZ_D65i[4] + src.z * cXYZ_D65i[5], xyz_shift));
dst[2] = saturate_cast<D>(CV_DESCALE(src.x * cXYZ_D65i[6] + src.y * cXYZ_D65i[7] + src.z * cXYZ_D65i[8], xyz_shift));
}
};
template <> struct XYZ2RGBConvertor<float>
{
template <typename T>
static __device__ void cvt(const T& src, float* dst)
__device__ void XYZ2RGBConvert(const T& src, float* dst)
{
dst[0] = src.x * cXYZ_D65f[0] + src.y * cXYZ_D65f[1] + src.z * cXYZ_D65f[2];
dst[1] = src.x * cXYZ_D65f[3] + src.y * cXYZ_D65f[4] + src.z * cXYZ_D65f[5];
dst[2] = src.x * cXYZ_D65f[6] + src.y * cXYZ_D65f[7] + src.z * cXYZ_D65f[8];
}
template <typename T> struct XYZ2RGBBase
{
typedef int coeff_t;
explicit XYZ2RGBBase(const coeff_t coeffs[9])
{
cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65i, coeffs, 9 * sizeof(int)) );
}
};
template <> struct XYZ2RGBBase<float>
{
typedef float coeff_t;
template <int SRCCN, int DSTCN, typename T>
__global__ void XYZ2RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols)
explicit XYZ2RGBBase(const coeff_t coeffs[9])
{
cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65f, coeffs, 9 * sizeof(float)) );
}
};
template <int SRCCN, int DSTCN, typename T> struct XYZ2RGB : XYZ2RGBBase<T>
{
typedef typename RGB2XYZBase<T>::coeff_t coeff_t;
typedef typename TypeVec<T, SRCCN>::vec_t src_t;
typedef typename TypeVec<T, DSTCN>::vec_t dst_t;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
explicit XYZ2RGB(const coeff_t coeffs[9]) : XYZ2RGBBase<T>(coeffs) {}
if (y < rows && x < cols)
__device__ dst_t operator()(const src_t& src) const
{
src_t src = *(const src_t*)(src_ + y * src_step + x * SRCCN * sizeof(T));
dst_t dst;
XYZ2RGBConvertor<T>::cvt(src, &dst.x);
XYZ2RGBConvert(src, &dst.x);
setAlpha(dst, ColorChannel<T>::max());
*(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst;
}
return dst;
}
};
template <typename T, int SRCCN, int DSTCN>
void RGB2XYZ_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream)
void RGB2XYZ_caller(const DevMem2D& src, const DevMem2D& dst, const void* coeffs, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
RGB2XYZ<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
typedef typename RGB2XYZ<SRCCN, DSTCN, T>::coeff_t coeff_t;
RGB2XYZ<SRCCN, DSTCN, T> cvt((const coeff_t*)coeffs);
callConvert(src, dst, cvt, stream);
}
void RGB2XYZ_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream)
{
typedef void (*RGB2XYZ_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
typedef void (*RGB2XYZ_caller_t)(const DevMem2D& src, const DevMem2D& dst, const void* coeffs, cudaStream_t stream);
static const RGB2XYZ_caller_t RGB2XYZ_callers[2][2] =
{
{RGB2XYZ_caller<uchar, 3, 3>, RGB2XYZ_caller<uchar, 3, 4>},
{RGB2XYZ_caller<uchar, 4, 3>, RGB2XYZ_caller<uchar, 4, 4>}
};
cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65i, coeffs, 9 * sizeof(int)) );
RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, stream);
RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, coeffs, stream);
}
void RGB2XYZ_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream)
{
typedef void (*RGB2XYZ_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
typedef void (*RGB2XYZ_caller_t)(const DevMem2D& src, const DevMem2D& dst, const void* coeffs, cudaStream_t stream);
static const RGB2XYZ_caller_t RGB2XYZ_callers[2][2] =
{
{RGB2XYZ_caller<ushort, 3, 3>, RGB2XYZ_caller<ushort, 3, 4>},
{RGB2XYZ_caller<ushort, 4, 3>, RGB2XYZ_caller<ushort, 4, 4>}
};
cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65i, coeffs, 9 * sizeof(int)) );
RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, stream);
RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, coeffs, stream);
}
void RGB2XYZ_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream)
{
typedef void (*RGB2XYZ_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
typedef void (*RGB2XYZ_caller_t)(const DevMem2D& src, const DevMem2D& dst, const void* coeffs, cudaStream_t stream);
static const RGB2XYZ_caller_t RGB2XYZ_callers[2][2] =
{
{RGB2XYZ_caller<float, 3, 3>, RGB2XYZ_caller<float, 3, 4>},
{RGB2XYZ_caller<float, 4, 3>, RGB2XYZ_caller<float, 4, 4>}
};
cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65f, coeffs, 9 * sizeof(float)) );
RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, stream);
RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, coeffs, stream);
}
template <typename T, int SRCCN, int DSTCN>
void XYZ2RGB_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream)
void XYZ2RGB_caller(const DevMem2D& src, const DevMem2D& dst, const void* coeffs, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
XYZ2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
typedef typename XYZ2RGB<SRCCN, DSTCN, T>::coeff_t coeff_t;
XYZ2RGB<SRCCN, DSTCN, T> cvt((const coeff_t*)coeffs);
callConvert(src, dst, cvt, stream);
}
void XYZ2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream)
{
typedef void (*XYZ2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
typedef void (*XYZ2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, const void* coeffs, cudaStream_t stream);
static const XYZ2RGB_caller_t XYZ2RGB_callers[2][2] =
{
{XYZ2RGB_caller<uchar, 3, 3>, XYZ2RGB_caller<uchar, 3, 4>},
{XYZ2RGB_caller<uchar, 4, 3>, XYZ2RGB_caller<uchar, 4, 4>}
};
cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65i, coeffs, 9 * sizeof(int)) );
XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, stream);
XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, coeffs, stream);
}
void XYZ2RGB_gpu_16u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream)
{
typedef void (*XYZ2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
typedef void (*XYZ2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, const void* coeffs, cudaStream_t stream);
static const XYZ2RGB_caller_t XYZ2RGB_callers[2][2] =
{
{XYZ2RGB_caller<ushort, 3, 3>, XYZ2RGB_caller<ushort, 3, 4>},
{XYZ2RGB_caller<ushort, 4, 3>, XYZ2RGB_caller<ushort, 4, 4>}
};
cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65i, coeffs, 9 * sizeof(int)) );
XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, stream);
XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, coeffs, stream);
}
void XYZ2RGB_gpu_32f(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, const void* coeffs, cudaStream_t stream)
{
typedef void (*XYZ2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
typedef void (*XYZ2RGB_caller_t)(const DevMem2D& src, const DevMem2D& dst, const void* coeffs, cudaStream_t stream);
static const XYZ2RGB_caller_t XYZ2RGB_callers[2][2] =
{
{XYZ2RGB_caller<float, 3, 3>, XYZ2RGB_caller<float, 3, 4>},
{XYZ2RGB_caller<float, 4, 3>, XYZ2RGB_caller<float, 4, 4>}
};
cudaSafeCall( cudaMemcpyToSymbol(cXYZ_D65f, coeffs, 9 * sizeof(float)) );
XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, stream);
XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, coeffs, stream);
}
////////////////////////////////////// RGB <-> HSV ///////////////////////////////////////
__constant__ int cHsvDivTable[256];
template<typename T, int HR> struct RGB2HSVConvertor;
template<int HR> struct RGB2HSVConvertor<uchar, HR>
__constant__ int cHsvDivTable[256] =
{
template <typename D>
static __device__ void cvt(const uchar* src, D& dst, int bidx)
0, 1044480, 522240, 348160, 261120, 208896, 174080, 149211,
130560, 116053, 104448, 94953, 87040, 80345, 74606, 69632,
65280, 61440, 58027, 54973, 52224, 49737, 47476, 45412,
43520, 41779, 40172, 38684, 37303, 36017, 34816, 33693,
32640, 31651, 30720, 29842, 29013, 28229, 27486, 26782,
26112, 25475, 24869, 24290, 23738, 23211, 22706, 22223,
21760, 21316, 20890, 20480, 20086, 19707, 19342, 18991,
18651, 18324, 18008, 17703, 17408, 17123, 16846, 16579,
16320, 16069, 15825, 15589, 15360, 15137, 14921, 14711,
14507, 14308, 14115, 13926, 13743, 13565, 13391, 13221,
13056, 12895, 12738, 12584, 12434, 12288, 12145, 12006,
11869, 11736, 11605, 11478, 11353, 11231, 11111, 10995,
10880, 10768, 10658, 10550, 10445, 10341, 10240, 10141,
10043, 9947, 9854, 9761, 9671, 9582, 9495, 9410,
9326, 9243, 9162, 9082, 9004, 8927, 8852, 8777,
8704, 8632, 8561, 8492, 8423, 8356, 8290, 8224,
8160, 8097, 8034, 7973, 7913, 7853, 7795, 7737,
7680, 7624, 7569, 7514, 7461, 7408, 7355, 7304,
7253, 7203, 7154, 7105, 7057, 7010, 6963, 6917,
6872, 6827, 6782, 6739, 6695, 6653, 6611, 6569,
6528, 6487, 6447, 6408, 6369, 6330, 6292, 6254,
6217, 6180, 6144, 6108, 6073, 6037, 6003, 5968,
5935, 5901, 5868, 5835, 5803, 5771, 5739, 5708,
5677, 5646, 5615, 5585, 5556, 5526, 5497, 5468,
5440, 5412, 5384, 5356, 5329, 5302, 5275, 5249,
5222, 5196, 5171, 5145, 5120, 5095, 5070, 5046,
5022, 4998, 4974, 4950, 4927, 4904, 4881, 4858,
4836, 4813, 4791, 4769, 4748, 4726, 4705, 4684,
4663, 4642, 4622, 4601, 4581, 4561, 4541, 4522,
4502, 4483, 4464, 4445, 4426, 4407, 4389, 4370,
4352, 4334, 4316, 4298, 4281, 4263, 4246, 4229,
4212, 4195, 4178, 4161, 4145, 4128, 4112, 4096
};
template <int HR, typename D>
__device__ void RGB2HSVConvert(const uchar* src, D& dst, int bidx)
{
const int hsv_shift = 12;
const int hscale = HR == 180 ? 15 : 21;
@@ -1051,11 +978,8 @@ namespace cv { namespace gpu { namespace color
dst.y = (uchar)s;
dst.z = (uchar)v;
}
};
template<int HR> struct RGB2HSVConvertor<float, HR>
{
template <typename D>
static __device__ void cvt(const float* src, D& dst, int bidx)
template<int HR, typename D>
__device__ void RGB2HSVConvert(const float* src, D& dst, int bidx)
{
const float hscale = HR * (1.f / 360.f);
@@ -1087,35 +1011,32 @@ namespace cv { namespace gpu { namespace color
dst.y = s;
dst.z = v;
}
};
template <int SRCCN, int DSTCN, int HR, typename T>
__global__ void RGB2HSV(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx)
template <int SRCCN, int DSTCN, int HR, typename T> struct RGB2HSV
{
typedef typename TypeVec<T, SRCCN>::vec_t src_t;
typedef typename TypeVec<T, DSTCN>::vec_t dst_t;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
explicit RGB2HSV(int bidx) : bidx(bidx) {}
if (y < rows && x < cols)
__device__ dst_t operator()(const src_t& src) const
{
src_t src = *(const src_t*)(src_ + y * src_step + x * SRCCN * sizeof(T));
dst_t dst;
RGB2HSVConvertor<T, HR>::cvt(&src.x, dst, bidx);
*(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst;
}
RGB2HSVConvert<HR>(&src.x, dst, bidx);
return dst;
}
__constant__ int cHsvSectorData[6][3];
private:
int bidx;
};
template<typename T, int HR> struct HSV2RGBConvertor;
template<int HR> struct HSV2RGBConvertor<float, HR>
__constant__ int cHsvSectorData[6][3] =
{
template <typename T>
static __device__ void cvt(const T& src, float* dst, int bidx)
{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}
};
template <int HR, typename T>
__device__ void HSV2RGBConvert(const T& src, float* dst, int bidx)
{
const float hscale = 6.f / HR;
@@ -1150,11 +1071,8 @@ namespace cv { namespace gpu { namespace color
dst[1] = g;
dst[bidx^2] = r;
}
};
template<int HR> struct HSV2RGBConvertor<uchar, HR>
{
template <typename T>
static __device__ void cvt(const T& src, uchar* dst, int bidx)
template <int HR, typename T>
__device__ void HSV2RGBConvert(const T& src, uchar* dst, int bidx)
{
float3 buf;
@@ -1162,55 +1080,45 @@ namespace cv { namespace gpu { namespace color
buf.y = src.y * (1.f/255.f);
buf.z = src.z * (1.f/255.f);
HSV2RGBConvertor<float, HR>::cvt(buf, &buf.x, bidx);
HSV2RGBConvert<HR>(buf, &buf.x, bidx);
dst[0] = saturate_cast<uchar>(buf.x * 255.f);
dst[1] = saturate_cast<uchar>(buf.y * 255.f);
dst[2] = saturate_cast<uchar>(buf.z * 255.f);
}
};
template <int SRCCN, int DSTCN, int HR, typename T>
__global__ void HSV2RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx)
template <int SRCCN, int DSTCN, int HR, typename T> struct HSV2RGB
{
typedef typename TypeVec<T, SRCCN>::vec_t src_t;
typedef typename TypeVec<T, DSTCN>::vec_t dst_t;
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
explicit HSV2RGB(int bidx) : bidx(bidx) {}
if (y < rows && x < cols)
__device__ dst_t operator()(const src_t& src) const
{
src_t src = *(const src_t*)(src_ + y * src_step + x * SRCCN * sizeof(T));
dst_t dst;
HSV2RGBConvertor<T, HR>::cvt(src, &dst.x, bidx);
HSV2RGBConvert<HR>(src, &dst.x, bidx);
setAlpha(dst, ColorChannel<T>::max());
*(dst_t*)(dst_ + y * dst_step + x * DSTCN * sizeof(T)) = dst;
}
return dst;
}
private:
int bidx;
};
template <typename T, int SRCCN, int DSTCN>
void RGB2HSV_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
if (hrange == 180)
RGB2HSV<SRCCN, DSTCN, 180, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols, bidx);
{
RGB2HSV<SRCCN, DSTCN, 180, T> cvt(bidx);
callConvert(src, dst, cvt, stream);
}
else
RGB2HSV<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols, bidx);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
{
RGB2HSV<SRCCN, DSTCN, 255, T> cvt(bidx);
callConvert(src, dst, cvt, stream);
}
}
void RGB2HSV_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream)
@@ -1222,43 +1130,6 @@ namespace cv { namespace gpu { namespace color
{RGB2HSV_caller<uchar, 4, 3>, RGB2HSV_caller<uchar, 4, 4>}
};
static const int div_table[] =
{
0, 1044480, 522240, 348160, 261120, 208896, 174080, 149211,
130560, 116053, 104448, 94953, 87040, 80345, 74606, 69632,
65280, 61440, 58027, 54973, 52224, 49737, 47476, 45412,
43520, 41779, 40172, 38684, 37303, 36017, 34816, 33693,
32640, 31651, 30720, 29842, 29013, 28229, 27486, 26782,
26112, 25475, 24869, 24290, 23738, 23211, 22706, 22223,
21760, 21316, 20890, 20480, 20086, 19707, 19342, 18991,
18651, 18324, 18008, 17703, 17408, 17123, 16846, 16579,
16320, 16069, 15825, 15589, 15360, 15137, 14921, 14711,
14507, 14308, 14115, 13926, 13743, 13565, 13391, 13221,
13056, 12895, 12738, 12584, 12434, 12288, 12145, 12006,
11869, 11736, 11605, 11478, 11353, 11231, 11111, 10995,
10880, 10768, 10658, 10550, 10445, 10341, 10240, 10141,
10043, 9947, 9854, 9761, 9671, 9582, 9495, 9410,
9326, 9243, 9162, 9082, 9004, 8927, 8852, 8777,
8704, 8632, 8561, 8492, 8423, 8356, 8290, 8224,
8160, 8097, 8034, 7973, 7913, 7853, 7795, 7737,
7680, 7624, 7569, 7514, 7461, 7408, 7355, 7304,
7253, 7203, 7154, 7105, 7057, 7010, 6963, 6917,
6872, 6827, 6782, 6739, 6695, 6653, 6611, 6569,
6528, 6487, 6447, 6408, 6369, 6330, 6292, 6254,
6217, 6180, 6144, 6108, 6073, 6037, 6003, 5968,
5935, 5901, 5868, 5835, 5803, 5771, 5739, 5708,
5677, 5646, 5615, 5585, 5556, 5526, 5497, 5468,
5440, 5412, 5384, 5356, 5329, 5302, 5275, 5249,
5222, 5196, 5171, 5145, 5120, 5095, 5070, 5046,
5022, 4998, 4974, 4950, 4927, 4904, 4881, 4858,
4836, 4813, 4791, 4769, 4748, 4726, 4705, 4684,
4663, 4642, 4622, 4601, 4581, 4561, 4541, 4522,
4502, 4483, 4464, 4445, 4426, 4407, 4389, 4370,
4352, 4334, 4316, 4298, 4281, 4263, 4246, 4229,
4212, 4195, 4178, 4161, 4145, 4128, 4112, 4096
};
cudaSafeCall( cudaMemcpyToSymbol(cHsvDivTable, div_table, sizeof(div_table)) );
RGB2HSV_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream);
}
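// The table above follows a closed form: entry i appears to be round((255 << 12) / i),
// with div_table[0] == 0. A hypothetical host-side generator (not part of the commit),
// useful for checking or regenerating the values:
void genHsvDivTable(int table[256])
{
    table[0] = 0;
    for (int i = 1; i < 256; ++i)
        table[i] = (int)(((255 << 12) / (double)i) + 0.5);   // round to nearest
}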
......@@ -1274,27 +1145,19 @@ namespace cv { namespace gpu { namespace color
RGB2HSV_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream);
}
template <typename T, int SRCCN, int DSTCN>
void HSV2RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream)
{
    if (hrange == 180)
    {
        HSV2RGB<SRCCN, DSTCN, 180, T> cvt(bidx);
        callConvert(src, dst, cvt, stream);
    }
    else
    {
        HSV2RGB<SRCCN, DSTCN, 255, T> cvt(bidx);
        callConvert(src, dst, cvt, stream);
    }
}
void HSV2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream)
......@@ -1306,11 +1169,6 @@ namespace cv { namespace gpu { namespace color
{HSV2RGB_caller<uchar, 4, 3>, HSV2RGB_caller<uchar, 4, 4>}
};
static const int sector_data[][3] =
{{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};
cudaSafeCall( cudaMemcpyToSymbol(cHsvSectorData, sector_data, sizeof(sector_data)) );
HSV2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream);
}
......@@ -1323,21 +1181,13 @@ namespace cv { namespace gpu { namespace color
{HSV2RGB_caller<float, 4, 3>, HSV2RGB_caller<float, 4, 4>}
};
static const int sector_data[][3] =
{{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};
cudaSafeCall( cudaMemcpyToSymbol(cHsvSectorData, sector_data, sizeof(sector_data)) );
HSV2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream);
}
/////////////////////////////////////// RGB <-> HLS ////////////////////////////////////////
template <int HR, typename D>
__device__ void RGB2HLSConvert(const float* src, D& dst, int bidx)
{
    const float hscale = HR * (1.f/360.f);
......@@ -1373,11 +1223,8 @@ namespace cv { namespace gpu { namespace color
dst.y = l;
dst.z = s;
}
template <int HR, typename D>
__device__ void RGB2HLSConvert(const uchar* src, D& dst, int bidx)
{
float3 buf;
......@@ -1385,41 +1232,38 @@ namespace cv { namespace gpu { namespace color
buf.y = src[1]*(1.f/255.f);
buf.z = src[2]*(1.f/255.f);
RGB2HLSConvert<HR>(&buf.x, buf, bidx);
dst.x = saturate_cast<uchar>(buf.x);
dst.y = saturate_cast<uchar>(buf.y*255.f);
dst.z = saturate_cast<uchar>(buf.z*255.f);
}
template <int SRCCN, int DSTCN, int HR, typename T> struct RGB2HLS
{
    typedef typename TypeVec<T, SRCCN>::vec_t src_t;
    typedef typename TypeVec<T, DSTCN>::vec_t dst_t;

    explicit RGB2HLS(int bidx) : bidx(bidx) {}

    __device__ dst_t operator()(const src_t& src) const
    {
        dst_t dst;
        RGB2HLSConvert<HR>(&src.x, dst, bidx);
        return dst;
    }

private:
    int bidx;
};
// Per hue sector: indices into the four interpolants selecting which feeds b, g and r,
// following the usual cvtColor scheme. Initialized statically here instead of being
// uploaded with cudaMemcpyToSymbol on every call.
__constant__ int cHlsSectorData[6][3] =
{
    {1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}
};

template <int HR, typename T>
__device__ void HLS2RGBConvert(const T& src, float* dst, int bidx)
{
const float hscale = 6.0f / HR;
......@@ -1460,11 +1304,8 @@ namespace cv { namespace gpu { namespace color
dst[1] = g;
dst[bidx^2] = r;
}
template <int HR, typename T>
__device__ void HLS2RGBConvert(const T& src, uchar* dst, int bidx)
{
float3 buf;
......@@ -1472,53 +1313,45 @@ namespace cv { namespace gpu { namespace color
buf.y = src.y*(1.f/255.f);
buf.z = src.z*(1.f/255.f);
HLS2RGBConvert<HR>(buf, &buf.x, bidx);
dst[0] = saturate_cast<uchar>(buf.x*255.f);
dst[1] = saturate_cast<uchar>(buf.y*255.f);
dst[2] = saturate_cast<uchar>(buf.z*255.f);
}
template <int SRCCN, int DSTCN, int HR, typename T> struct HLS2RGB
{
    typedef typename TypeVec<T, SRCCN>::vec_t src_t;
    typedef typename TypeVec<T, DSTCN>::vec_t dst_t;

    explicit HLS2RGB(int bidx) : bidx(bidx) {}

    __device__ dst_t operator()(const src_t& src) const
    {
        dst_t dst;
        HLS2RGBConvert<HR>(src, &dst.x, bidx);
        setAlpha(dst, ColorChannel<T>::max());
        return dst;
    }

private:
    int bidx;
};
template <typename T, int SRCCN, int DSTCN>
void RGB2HLS_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream)
{
    if (hrange == 180)
    {
        RGB2HLS<SRCCN, DSTCN, 180, T> cvt(bidx);
        callConvert(src, dst, cvt, stream);
    }
    else
    {
        RGB2HLS<SRCCN, DSTCN, 255, T> cvt(bidx);
        callConvert(src, dst, cvt, stream);
    }
}
void RGB2HLS_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream)
......@@ -1549,23 +1382,16 @@ namespace cv { namespace gpu { namespace color
template <typename T, int SRCCN, int DSTCN>
void HLS2RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream)
{
    if (hrange == 180)
    {
        HLS2RGB<SRCCN, DSTCN, 180, T> cvt(bidx);
        callConvert(src, dst, cvt, stream);
    }
    else
    {
        HLS2RGB<SRCCN, DSTCN, 255, T> cvt(bidx);
        callConvert(src, dst, cvt, stream);
    }
}
void HLS2RGB_gpu_8u(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, int hrange, cudaStream_t stream)
......@@ -1577,11 +1403,6 @@ namespace cv { namespace gpu { namespace color
{HLS2RGB_caller<uchar, 4, 3>, HLS2RGB_caller<uchar, 4, 4>}
};
static const int sector_data[][3]=
{{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};
cudaSafeCall( cudaMemcpyToSymbol(cHlsSectorData, sector_data, sizeof(sector_data)) );
HLS2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream);
}
......@@ -1594,11 +1415,6 @@ namespace cv { namespace gpu { namespace color
{HLS2RGB_caller<float, 4, 3>, HLS2RGB_caller<float, 4, 4>}
};
static const int sector_data[][3]=
{{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};
cudaSafeCall( cudaMemcpyToSymbol(cHlsSectorData, sector_data, sizeof(sector_data)) );
HLS2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream);
}
}}}
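// For context, these callers sit behind the public color-conversion entry point. A short
// usage sketch assuming the OpenCV 2.x GPU API of this commit (the file name is a placeholder):
//
//     cv::Mat bgr = cv::imread("image.png");            // 8UC3 input
//     cv::gpu::GpuMat d_bgr(bgr), d_hsv;
//     cv::gpu::cvtColor(d_bgr, d_hsv, CV_BGR2HSV);      // routes to RGB2HSV_gpu_8u, hrange == 180
//     cv::Mat hsv(d_hsv);                               // download the result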
......@@ -190,6 +190,9 @@ void cv::gpu::Stream::enqueueCopy(const GpuMat& src, GpuMat& dst) { devcopy(src,
void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val)
{
CV_Assert((src.depth() != CV_64F) ||
(TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
typedef void (*set_caller_t)(GpuMat& src, const Scalar& s, cudaStream_t stream);
static const set_caller_t set_callers[] =
{
......@@ -201,6 +204,11 @@ void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val)
void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask)
{
CV_Assert((src.depth() != CV_64F) ||
(TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
CV_Assert(mask.type() == CV_8UC1);
typedef void (*set_caller_t)(GpuMat& src, const Scalar& s, const GpuMat& mask, cudaStream_t stream);
static const set_caller_t set_callers[] =
{
......@@ -212,6 +220,9 @@ void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask)
void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int rtype, double alpha, double beta)
{
CV_Assert((src.depth() != CV_64F && CV_MAT_DEPTH(rtype) != CV_64F) ||
(TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
bool noScale = fabs(alpha-1) < std::numeric_limits<double>::epsilon() && fabs(beta) < std::numeric_limits<double>::epsilon();
if( rtype < 0 )
......
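// The new CV_Assert guard above recurs verbatim across the stream, min/max and threshold
// entry points below. A hypothetical helper (not part of the commit) naming the check once:
void requireNativeDouble(int depth)
{
    using namespace cv::gpu;
    // CV_64F work is allowed only if built with NATIVE_DOUBLE and the device supports it
    CV_Assert(depth != CV_64F ||
        (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
}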
......@@ -626,6 +626,10 @@ namespace
void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
{
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
CV_Assert((src1.depth() != CV_64F) ||
(TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
......@@ -637,6 +641,10 @@ void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream)
{
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
CV_Assert((src1.depth() != CV_64F) ||
(TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
......@@ -648,6 +656,9 @@ void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Str
void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst)
{
CV_Assert((src1.depth() != CV_64F) ||
(TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
......@@ -659,6 +670,9 @@ void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst)
void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream)
{
CV_Assert((src1.depth() != CV_64F) ||
(TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
......@@ -670,6 +684,10 @@ void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst, const Stream& st
void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
{
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
CV_Assert((src1.depth() != CV_64F) ||
(TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
......@@ -681,6 +699,10 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream)
{
CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
CV_Assert((src1.depth() != CV_64F) ||
(TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
......@@ -692,6 +714,9 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Str
void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst)
{
CV_Assert((src1.depth() != CV_64F) ||
(TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
......@@ -703,6 +728,9 @@ void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst)
void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream)
{
CV_Assert((src1.depth() != CV_64F) ||
(TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
static const func_t funcs[] =
{
......@@ -749,6 +777,9 @@ double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double
}
else
{
CV_Assert((src.depth() != CV_64F) ||
(TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
typedef void (*caller_t)(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type,
cudaStream_t stream);
......
......@@ -205,6 +205,9 @@ namespace
void cv::gpu::GpuMat::convertTo( GpuMat& dst, int rtype, double alpha, double beta ) const
{
CV_Assert((depth() != CV_64F && CV_MAT_DEPTH(rtype) != CV_64F) ||
(TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
bool noScale = fabs(alpha-1) < std::numeric_limits<double>::epsilon() && fabs(beta) < std::numeric_limits<double>::epsilon();
if( rtype < 0 )
......@@ -428,6 +431,9 @@ GpuMat& GpuMat::setTo(const Scalar& s, const GpuMat& mask)
{
CV_Assert(mask.type() == CV_8UC1);
CV_Assert((depth() != CV_64F) ||
(TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE)));
CV_DbgAssert(!this->empty());
NppiSize sz;
......
......@@ -393,11 +393,37 @@ namespace cv
}
};
// Compile-time predicates deciding whether the "smart" (vectorized) transform path applies:
// only for single-channel types whose read/write traits request a shift other than 1.
template <typename T, typename D, int scn, int dcn> struct UseSmartUn_
{
static const bool value = false;
};
template <typename T, typename D> struct UseSmartUn_<T, D, 1, 1>
{
static const bool value = device::UnReadWriteTraits<T, D>::shift != 1;
};
template <typename T, typename D> struct UseSmartUn
{
static const bool value = UseSmartUn_<T, D, device::VecTraits<T>::cn, device::VecTraits<D>::cn>::value;
};
template <typename T1, typename T2, typename D, int src1cn, int src2cn, int dstcn> struct UseSmartBin_
{
static const bool value = false;
};
template <typename T1, typename T2, typename D> struct UseSmartBin_<T1, T2, D, 1, 1, 1>
{
static const bool value = device::BinReadWriteTraits<T1, T2, D>::shift != 1;
};
template <typename T1, typename T2, typename D> struct UseSmartBin
{
static const bool value = UseSmartBin_<T1, T2, D, device::VecTraits<T1>::cn, device::VecTraits<T2>::cn, device::VecTraits<D>::cn>::value;
};
template <typename T, typename D, typename UnOp, typename Mask>
static void transform_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, UnOp op, const Mask& mask,
cudaStream_t stream = 0)
{
TransformDispatcher< UseSmartUn<T, D>::value >::call(src, dst, op, mask, stream);
}
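// UseSmartUn/UseSmartBin merely name the long inline conditions that previously sat at the
// call sites; TransformDispatcher is then specialized on the resulting compile-time bool.
// A reduced, host-side sketch of that dispatch idiom (illustrative only -- the real
// dispatcher launches plain or vectorized CUDA kernels):
template <bool UseSmart> struct DispatcherSketch;

template <> struct DispatcherSketch<false>
{
    template <typename T, typename D, typename Op>
    static void call(const T* src, D* dst, int n, Op op)
    {
        for (int i = 0; i < n; ++i)            // plain one-element-per-step path
            dst[i] = op(src[i]);
    }
};

template <> struct DispatcherSketch<true>
{
    template <typename T, typename D, typename Op>
    static void call(const T* src, D* dst, int n, Op op)
    {
        int i = 0;
        for (; i + 4 <= n; i += 4)             // "smart" path: several elements per step
        {
            dst[i + 0] = op(src[i + 0]);
            dst[i + 1] = op(src[i + 1]);
            dst[i + 2] = op(src[i + 2]);
            dst[i + 3] = op(src[i + 3]);
        }
        for (; i < n; ++i)                     // leftover tail
            dst[i] = op(src[i]);
    }
};

// Selection happens entirely at compile time:
//   DispatcherSketch< UseSmartUn<T, D>::value >::call(src, dst, n, op);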
template <typename T, typename D, typename UnOp>
......@@ -416,7 +442,7 @@ namespace cv
static void transform_caller(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst,
BinOp op, const Mask& mask, cudaStream_t stream = 0)
{
TransformDispatcher< UseSmartBin<T1, T2, D>::value >::call(src1, src2, dst, op, mask, stream);
}
template <typename T1, typename T2, typename D, typename BinOp>
......
......@@ -681,3 +681,66 @@ TEST(erode)
GPU_OFF;
}
}
TEST(threshold)
{
Mat src, dst;
gpu::GpuMat d_src, d_dst;
for (int size = 2000; size <= 4000; size += 1000)
{
SUBTEST << "size " << size << ", 8U, THRESH_TRUNC";
gen(src, size, size, CV_8U, 0, 100);
dst.create(size, size, CV_8U);
CPU_ON;
threshold(src, dst, 50.0, 0.0, THRESH_TRUNC);
CPU_OFF;
d_src = src;
d_dst.create(size, size, CV_8U);
GPU_ON;
gpu::threshold(d_src, d_dst, 50.0, 0.0, THRESH_TRUNC);
GPU_OFF;
}
for (int size = 2000; size <= 4000; size += 1000)
{
SUBTEST << "size " << size << ", 8U, THRESH_BINARY";
gen(src, size, size, CV_8U, 0, 100);
dst.create(size, size, CV_8U);
CPU_ON;
threshold(src, dst, 50.0, 0.0, THRESH_BINARY);
CPU_OFF;
d_src = src;
d_dst.create(size, size, CV_8U);
GPU_ON;
gpu::threshold(d_src, d_dst, 50.0, 0.0, THRESH_BINARY);
GPU_OFF;
}
for (int size = 2000; size <= 4000; size += 1000)
{
SUBTEST << "size " << size << ", 32F, THRESH_TRUNC";
gen(src, size, size, CV_32F, 0, 100);
dst.create(size, size, CV_32F);
CPU_ON;
threshold(src, dst, 50.0, 0.0, THRESH_TRUNC);
CPU_OFF;
d_src = src;
d_dst.create(size, size, CV_32F);
GPU_ON;
gpu::threshold(d_src, d_dst, 50.0, 0.0, THRESH_TRUNC);
GPU_OFF;
}
}
\ No newline at end of file
......@@ -384,7 +384,7 @@ void CV_GpuBruteForceMatcherTest::knnMatchTest( const GpuMat& query, const GpuMa
void CV_GpuBruteForceMatcherTest::radiusMatchTest( const GpuMat& query, const GpuMat& train )
{
bool atomics_ok = TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS);
if (!atomics_ok)
{
ts->printf(CvTS::CONSOLE, "\nGlobal atomics support in both the built code and the device is required for radiusMatch (CC >= 1.1)");
......
......@@ -53,7 +53,7 @@ struct CV_GpuMeanShiftTest : public CvTest
void run(int)
{
bool cc12_ok = TargetArchs::builtWith(FEATURE_SET_COMPUTE_12) && DeviceInfo().supports(FEATURE_SET_COMPUTE_12);
if (!cc12_ok)
{
ts->printf(CvTS::CONSOLE, "\nCompute capability 1.2 is required");
......@@ -67,8 +67,8 @@ struct CV_GpuMeanShiftTest : public CvTest
cv::Mat img = cv::imread(std::string(ts->get_data_path()) + "meanshift/cones.png");
cv::Mat img_template;
if (cv::gpu::TargetArchs::builtWith(cv::gpu::FEATURE_SET_COMPUTE_20) &&
    cv::gpu::DeviceInfo().supports(cv::gpu::FEATURE_SET_COMPUTE_20))
img_template = cv::imread(std::string(ts->get_data_path()) + "meanshift/con_result.png");
else
img_template = cv::imread(std::string(ts->get_data_path()) + "meanshift/con_result_CC1X.png");
......@@ -145,7 +145,7 @@ struct CV_GpuMeanShiftProcTest : public CvTest
void run(int)
{
bool cc12_ok = TargetArchs::builtWith(FEATURE_SET_COMPUTE_12) && DeviceInfo().supports(FEATURE_SET_COMPUTE_12);
if (!cc12_ok)
{
ts->printf(CvTS::CONSOLE, "\nCompute capability 1.2 is required");
......@@ -219,8 +219,8 @@ struct CV_GpuMeanShiftProcTest : public CvTest
cv::Mat spmap_template;
cv::FileStorage fs;
if (cv::gpu::TargetArchs::builtWith(cv::gpu::FEATURE_SET_COMPUTE_20) &&
    cv::gpu::DeviceInfo().supports(cv::gpu::FEATURE_SET_COMPUTE_20))
fs.open(std::string(ts->get_data_path()) + "meanshift/spmap.yaml", cv::FileStorage::READ);
else
fs.open(std::string(ts->get_data_path()) + "meanshift/spmap_CC1X.yaml", cv::FileStorage::READ);
......
......@@ -54,7 +54,7 @@ struct CV_GpuMeanShiftSegmentationTest : public CvTest {
{
try
{
bool cc12_ok = TargetArchs::builtWith(FEATURE_SET_COMPUTE_12) && DeviceInfo().supports(FEATURE_SET_COMPUTE_12);
if (!cc12_ok)
{
ts->printf(CvTS::CONSOLE, "\nCompute capability 1.2 is required");
......@@ -77,7 +77,7 @@ struct CV_GpuMeanShiftSegmentationTest : public CvTest {
{
stringstream path;
path << ts->get_data_path() << "meanshift/cones_segmented_sp10_sr10_minsize" << minsize;
if (TargetArchs::builtWith(FEATURE_SET_COMPUTE_20) && DeviceInfo().supports(FEATURE_SET_COMPUTE_20))
path << ".png";
else
path << "_CC1X.png";
......
......@@ -66,21 +66,24 @@ void CV_GpuMatOpConvertToTest::run(int /* start_from */)
{
const Size img_size(67, 35);
const char* types_str[] = {"CV_8U", "CV_8S", "CV_16U", "CV_16S", "CV_32S", "CV_32F", "CV_64F"};

bool passed = true;

try
{
    // CV_8U..CV_64F are the contiguous depth codes 0..6, so depths can be iterated
    // directly; CV_64F is included only when the device supports native doubles.
    int lastType = CV_32F;

    if (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))
        lastType = CV_64F;

    for (int i = 0; i <= lastType && passed; ++i)
    {
        for (int j = 0; j <= lastType && passed; ++j)
        {
            for (int c = 1; c < 5 && passed; ++c)
            {
                const int src_type = CV_MAKETYPE(i, c);
                const int dst_type = j;

                cv::RNG rng(*ts->get_rng());
......
......@@ -126,7 +126,12 @@ void CV_GpuMatOpCopyToTest::run( int /* start_from */)
try
{
int lastType = CV_32F;

if (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))
    lastType = CV_64F;

for (int i = 0; i <= lastType; i++)
{
Mat cpumat(rows, cols, i);
cpumat.setTo(Scalar::all(127));
......
......@@ -101,7 +101,12 @@ void CV_GpuMatOpSetToTest::run( int /* start_from */)
rng.fill(cpumask, RNG::UNIFORM, cv::Scalar::all(0.0), cv::Scalar(1.5));
cv::gpu::GpuMat gpumask(cpumask);
int lastType = CV_32F;

if (TargetArchs::builtWith(NATIVE_DOUBLE) && DeviceInfo().supports(NATIVE_DOUBLE))
    lastType = CV_64F;

for (int i = 0; i <= lastType; i++)
{
for (int cn = 1; cn <= 4; ++cn)
{
......