Commit b08f6082 authored by Vladislav Vinogradov

Added linear filters for int and float source types to the gpu module.

Refactored the gpu module.
parent ea040ce7
@@ -50,7 +50,7 @@ namespace cv
     // Simple lightweight structure that encapsulates image ptr on device, its pitch and its sizes.
     // It is intended to pass to nvcc-compiled code. GpuMat depends on headers that nvcc can't compile
-    template<typename T = unsigned char>
+    template <typename T>
     struct DevMem2D_
     {
         typedef T elem_t;
@@ -60,16 +60,21 @@ namespace cv
         int rows;
         T* ptr;
         size_t step;
+        size_t elem_step;

-        DevMem2D_() : cols(0), rows(0), ptr(0), step(0) {}
+        DevMem2D_() : cols(0), rows(0), ptr(0), step(0), elem_step(0) {}
         DevMem2D_(int rows_, int cols_, T *ptr_, size_t step_)
-            : cols(cols_), rows(rows_), ptr(ptr_), step(step_) {}
+            : cols(cols_), rows(rows_), ptr(ptr_), step(step_), elem_step(step_ / sizeof(T)) {}
+
+        template <typename U>
+        explicit DevMem2D_(const DevMem2D_<U>& d)
+            : cols(d.cols), rows(d.rows), ptr((T*)d.ptr), step(d.step), elem_step(d.step / sizeof(T)) {}

         size_t elemSize() const { return elem_size; }
     };

-    typedef DevMem2D_<> DevMem2D;
+    typedef DevMem2D_<unsigned char> DevMem2D;
     typedef DevMem2D_<float> DevMem2Df;
     typedef DevMem2D_<int> DevMem2Di;
 }
...
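
The new elem_step field caches the row pitch in elements rather than bytes, so device code can index a typed row without dividing step by sizeof(T) at every use. A minimal sketch of the intended access pattern, using a hypothetical kernel that is not part of this commit:

    // Hypothetical kernel illustrating DevMem2D_<T>::elem_step; not from this commit.
    template <typename T>
    __global__ void scaleBy(const cv::gpu::DevMem2D_<T> src, cv::gpu::DevMem2D_<T> dst, T factor)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (x < src.cols && y < src.rows)
        {
            // elem_step is the pitch in elements, so rows are indexed directly.
            dst.ptr[y * dst.elem_step + x] = src.ptr[y * src.elem_step + x] * factor;
        }
    }
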
@@ -636,7 +636,7 @@ namespace cv
     //! returns the separable filter engine with the specified filters
     CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
-        const Ptr<BaseColumnFilter_GPU>& columnFilter, bool rowFilterFirst = true);
+        const Ptr<BaseColumnFilter_GPU>& columnFilter);

     //! returns horizontal 1D box filter
     //! supports only CV_8UC1 source type and CV_32FC1 sum type
@@ -658,7 +658,7 @@ namespace cv
     //! only MORPH_ERODE and MORPH_DILATE are supported
     //! supports CV_8UC1 and CV_8UC4 types
     //! kernel must have CV_8UC1 type, one rows and cols == ksize.width * ksize.height
-    CV_EXPORTS Ptr<BaseFilter_GPU> getMorphologyFilter_GPU(int op, int type, const GpuMat& kernel, const Size& ksize,
+    CV_EXPORTS Ptr<BaseFilter_GPU> getMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Size& ksize,
         Point anchor=Point(-1,-1));

     //! returns morphological filter engine. Only MORPH_ERODE and MORPH_DILATE are supported.
@@ -667,25 +667,24 @@ namespace cv
     //! returns 2D filter with the specified kernel
     //! supports CV_8UC1 and CV_8UC4 types
-    //! kernel must have CV_8UC1 type, one rows and cols == ksize.width * ksize.height
-    CV_EXPORTS Ptr<BaseFilter_GPU> getLinearFilter_GPU(int srcType, int dstType, const GpuMat& kernel, const Size& ksize,
-        Point anchor = Point(-1, -1), int nDivisor = 1);
+    CV_EXPORTS Ptr<BaseFilter_GPU> getLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, const Size& ksize,
+        Point anchor = Point(-1, -1));

     //! returns the non-separable linear filter engine
     CV_EXPORTS Ptr<FilterEngine_GPU> createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel,
         const Point& anchor = Point(-1,-1));

     //! returns the primitive row filter with the specified kernel
-    CV_EXPORTS Ptr<BaseRowFilter_GPU> getLinearRowFilter_GPU(int srcType, int bufType, const GpuMat& rowKernel,
-        int anchor = -1, int nDivisor = 1);
+    CV_EXPORTS Ptr<BaseRowFilter_GPU> getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel,
+        int anchor = -1);

     //! returns the primitive column filter with the specified kernel
-    CV_EXPORTS Ptr<BaseColumnFilter_GPU> getLinearColumnFilter_GPU(int bufType, int dstType, const GpuMat& columnKernel,
-        int anchor = -1, int nDivisor = 1);
+    CV_EXPORTS Ptr<BaseColumnFilter_GPU> getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel,
+        int anchor = -1);

     //! returns the separable linear filter engine
     CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel,
-        const Mat& columnKernel, const Point& anchor = Point(-1,-1), bool rowFilterFirst = true);
+        const Mat& columnKernel, const Point& anchor = Point(-1,-1));

     //! returns filter engine for the generalized Sobel operator
     CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize);
@@ -720,7 +719,7 @@ namespace cv
     //! applies separable 2D linear filter to the image
     CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY,
-        Point anchor = Point(-1,-1), bool rowFilterFirst = true);
+        Point anchor = Point(-1,-1));

     //! applies generalized Sobel operator to the image
     CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1);
...
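
The API change is consistent across the filter factory functions: kernels are now passed as host-side cv::Mat instead of GpuMat, and the nDivisor and rowFilterFirst parameters are gone. A hedged sketch of calling the reworked separable filter, assuming a CV_32FC1 input already uploaded to the device:

    // Sketch only: smooth a single-channel float image with a 5-tap Gaussian.
    void smooth(const cv::gpu::GpuMat& src, cv::gpu::GpuMat& dst)
    {
        CV_Assert(src.type() == CV_32FC1); // this commit adds int/float source support

        // Kernels are ordinary host-side cv::Mat now.
        cv::Mat kernel = cv::getGaussianKernel(5, 1.5, CV_32F);

        // The rowFilterFirst flag no longer exists in this signature.
        cv::gpu::sepFilter2D(src, dst, CV_32F, kernel, kernel);
    }
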
@@ -316,9 +316,9 @@ void cv::gpu::absdiff(const GpuMat& src, const Scalar& s, GpuMat& dst)
 ////////////////////////////////////////////////////////////////////////
 // compare

-namespace cv { namespace gpu { namespace matrix_operations
+namespace cv { namespace gpu { namespace mathfunc
 {
-    void compare_ne_8u(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst);
+    void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst);
     void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst);
 }}}
@@ -346,7 +346,7 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
         }
         else
         {
-            matrix_operations::compare_ne_8u(src1, src2, dst);
+            mathfunc::compare_ne_8uc4(src1, src2, dst);
         }
     }
     else
@@ -359,7 +359,7 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
         }
         else
         {
-            matrix_operations::compare_ne_32f(src1, src2, dst);
+            mathfunc::compare_ne_32f(src1, src2, dst);
         }
     }
 }
...
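
The "not equal" kernels moved from matrix_operations into mathfunc, and the 8-bit entry point was renamed to reflect that it actually operates on CV_8UC4 data. From user code the dispatch is invisible; a hedged usage sketch:

    // Sketch only: element-wise inequality mask on the GPU.
    // Per OpenCV convention, dst is set to 255 where a and b differ.
    cv::gpu::GpuMat a, b, mask;
    // ... upload two CV_32FC1 images into a and b ...
    cv::gpu::compare(a, b, mask, cv::CMP_NE);
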
@@ -42,6 +42,7 @@
 #include "cuda_shared.hpp"
 #include "saturate_cast.hpp"
+#include "vecmath.hpp"

 using namespace cv::gpu;
@@ -53,16 +54,8 @@ using namespace cv::gpu;
 #define FLT_EPSILON 1.192092896e-07F
 #endif

-namespace imgproc
+namespace imgproc_krnls
 {
-    template<typename T, int N> struct TypeVec {};
-    template<> struct TypeVec<uchar, 3> { typedef uchar3 vec_t; };
-    template<> struct TypeVec<uchar, 4> { typedef uchar4 vec_t; };
-    template<> struct TypeVec<ushort, 3> { typedef ushort3 vec_t; };
-    template<> struct TypeVec<ushort, 4> { typedef ushort4 vec_t; };
-    template<> struct TypeVec<float, 3> { typedef float3 vec_t; };
-    template<> struct TypeVec<float, 4> { typedef float4 vec_t; };
-
     template<typename T> struct ColorChannel {};
     template<> struct ColorChannel<uchar>
     {
@@ -106,7 +99,7 @@ namespace imgproc
 ////////////////// Various 3/4-channel to 3/4-channel RGB transformations /////////////////

-namespace imgproc
+namespace imgproc_krnls
 {
     template <int SRCCN, int DSTCN, typename T>
     __global__ void RGB2RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols, int bidx)
@@ -132,7 +125,7 @@ namespace imgproc
     }
 }

-namespace cv { namespace gpu { namespace improc
+namespace cv { namespace gpu { namespace imgproc
 {
     template <typename T, int SRCCN, int DSTCN>
     void RGB2RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream)
@@ -143,7 +136,7 @@ namespace cv { namespace gpu { namespace improc
         grid.x = divUp(src.cols, threads.x);
         grid.y = divUp(src.rows, threads.y);

-        imgproc::RGB2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.ptr, src.step,
+        imgproc_krnls::RGB2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.ptr, src.step,
             dst.ptr, dst.step, src.rows, src.cols, bidx);

         if (stream == 0)
@@ -189,7 +182,7 @@ namespace cv { namespace gpu { namespace improc
 /////////// Transforming 16-bit (565 or 555) RGB to/from 24/32-bit (888[8]) RGB //////////

-namespace imgproc
+namespace imgproc_krnls
 {
     template <int GREEN_BITS, int DSTCN> struct RGB5x52RGBConverter {};
     template <int DSTCN> struct RGB5x52RGBConverter<5, DSTCN>
@@ -281,7 +274,7 @@ namespace imgproc
     }
 }

-namespace cv { namespace gpu { namespace improc
+namespace cv { namespace gpu { namespace imgproc
 {
     template <int GREEN_BITS, int DSTCN>
     void RGB5x52RGB_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream)
@@ -292,7 +285,7 @@ namespace cv { namespace gpu { namespace improc
         grid.x = divUp(src.cols, threads.x);
         grid.y = divUp(src.rows, threads.y);

-        imgproc::RGB5x52RGB<GREEN_BITS, DSTCN><<<grid, threads, 0, stream>>>(src.ptr, src.step,
+        imgproc_krnls::RGB5x52RGB<GREEN_BITS, DSTCN><<<grid, threads, 0, stream>>>(src.ptr, src.step,
             dst.ptr, dst.step, src.rows, src.cols, bidx);

         if (stream == 0)
@@ -320,7 +313,7 @@ namespace cv { namespace gpu { namespace improc
         grid.x = divUp(src.cols, threads.x);
         grid.y = divUp(src.rows, threads.y);

-        imgproc::RGB2RGB5x5<SRCCN, GREEN_BITS><<<grid, threads, 0, stream>>>(src.ptr, src.step,
+        imgproc_krnls::RGB2RGB5x5<SRCCN, GREEN_BITS><<<grid, threads, 0, stream>>>(src.ptr, src.step,
             dst.ptr, dst.step, src.rows, src.cols, bidx);

         if (stream == 0)
@@ -342,7 +335,7 @@ namespace cv { namespace gpu { namespace improc
 ///////////////////////////////// Grayscale to Color ////////////////////////////////

-namespace imgproc
+namespace imgproc_krnls
 {
     template <int DSTCN, typename T>
     __global__ void Gray2RGB(const uchar* src_, size_t src_step, uchar* dst_, size_t dst_step, int rows, int cols)
@@ -396,7 +389,7 @@ namespace imgproc
     }
 }

-namespace cv { namespace gpu { namespace improc
+namespace cv { namespace gpu { namespace imgproc
 {
     template <typename T, int DSTCN>
     void Gray2RGB_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream)
@@ -407,7 +400,7 @@ namespace cv { namespace gpu { namespace improc
         grid.x = divUp(src.cols, threads.x);
         grid.y = divUp(src.rows, threads.y);

-        imgproc::Gray2RGB<DSTCN, T><<<grid, threads, 0, stream>>>(src.ptr, src.step,
+        imgproc_krnls::Gray2RGB<DSTCN, T><<<grid, threads, 0, stream>>>(src.ptr, src.step,
             dst.ptr, dst.step, src.rows, src.cols);

         if (stream == 0)
@@ -447,7 +440,7 @@ namespace cv { namespace gpu { namespace improc
         grid.x = divUp(src.cols, threads.x);
         grid.y = divUp(src.rows, threads.y);

-        imgproc::Gray2RGB5x5<GREEN_BITS><<<grid, threads, 0, stream>>>(src.ptr, src.step,
+        imgproc_krnls::Gray2RGB5x5<GREEN_BITS><<<grid, threads, 0, stream>>>(src.ptr, src.step,
             dst.ptr, dst.step, src.rows, src.cols);

         if (stream == 0)
@@ -468,7 +461,7 @@ namespace cv { namespace gpu { namespace improc
 ///////////////////////////////// Color to Grayscale ////////////////////////////////

-namespace imgproc
+namespace imgproc_krnls
 {
 #undef R2Y
 #undef G2Y
@@ -550,7 +543,7 @@ namespace imgproc
     }
 }

-namespace cv { namespace gpu { namespace improc
+namespace cv { namespace gpu { namespace imgproc
 {
     template <typename T, int SRCCN>
     void RGB2Gray_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream)
@@ -561,7 +554,7 @@ namespace cv { namespace gpu { namespace improc
         grid.x = divUp(src.cols, threads.x);
         grid.y = divUp(src.rows, threads.y);

-        imgproc::RGB2Gray<SRCCN, T><<<grid, threads, 0, stream>>>(src.ptr, src.step,
+        imgproc_krnls::RGB2Gray<SRCCN, T><<<grid, threads, 0, stream>>>(src.ptr, src.step,
             dst.ptr, dst.step, src.rows, src.cols, bidx);

         if (stream == 0)
@@ -601,7 +594,7 @@ namespace cv { namespace gpu { namespace improc
         grid.x = divUp(src.cols, threads.x);
         grid.y = divUp(src.rows, threads.y);

-        imgproc::RGB5x52Gray<GREEN_BITS><<<grid, threads, 0, stream>>>(src.ptr, src.step,
+        imgproc_krnls::RGB5x52Gray<GREEN_BITS><<<grid, threads, 0, stream>>>(src.ptr, src.step,
             dst.ptr, dst.step, src.rows, src.cols);

         if (stream == 0)
@@ -622,7 +615,7 @@ namespace cv { namespace gpu { namespace improc
 ///////////////////////////////////// RGB <-> YCrCb //////////////////////////////////////

-namespace imgproc
+namespace imgproc_krnls
 {
     __constant__ float cYCrCbCoeffs_f[5];
     __constant__ int cYCrCbCoeffs_i[5];
@@ -721,7 +714,7 @@ namespace imgproc
     }
 }

-namespace cv { namespace gpu { namespace improc
+namespace cv { namespace gpu { namespace imgproc
 {
     template <typename T, int SRCCN, int DSTCN>
     void RGB2YCrCb_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, cudaStream_t stream)
@@ -732,7 +725,7 @@ namespace cv { namespace gpu { namespace improc
         grid.x = divUp(src.cols, threads.x);
         grid.y = divUp(src.rows, threads.y);

-        imgproc::RGB2YCrCb<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.ptr, src.step,
+        imgproc_krnls::RGB2YCrCb<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.ptr, src.step,
             dst.ptr, dst.step, src.rows, src.cols, bidx);

         if (stream == 0)
@@ -748,7 +741,7 @@ namespace cv { namespace gpu { namespace improc
             {RGB2YCrCb_caller<uchar, 4, 3>, RGB2YCrCb_caller<uchar, 4, 4>}
         };

-        cudaSafeCall( cudaMemcpyToSymbol(imgproc::cYCrCbCoeffs_i, coeffs, 5 * sizeof(int)) );
+        cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cYCrCbCoeffs_i, coeffs, 5 * sizeof(int)) );

         RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, stream);
     }
@@ -762,7 +755,7 @@ namespace cv { namespace gpu { namespace improc
             {RGB2YCrCb_caller<ushort, 4, 3>, RGB2YCrCb_caller<ushort, 4, 4>}
         };

-        cudaSafeCall( cudaMemcpyToSymbol(imgproc::cYCrCbCoeffs_i, coeffs, 5 * sizeof(int)) );
+        cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cYCrCbCoeffs_i, coeffs, 5 * sizeof(int)) );

         RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, stream);
     }
@@ -776,7 +769,7 @@ namespace cv { namespace gpu { namespace improc
             {RGB2YCrCb_caller<float, 4, 3>, RGB2YCrCb_caller<float, 4, 4>}
         };

-        cudaSafeCall( cudaMemcpyToSymbol(imgproc::cYCrCbCoeffs_f, coeffs, 5 * sizeof(float)) );
+        cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cYCrCbCoeffs_f, coeffs, 5 * sizeof(float)) );

         RGB2YCrCb_callers[srccn-3][dstcn-3](src, dst, bidx, stream);
     }
@@ -790,7 +783,7 @@ namespace cv { namespace gpu { namespace improc
         grid.x = divUp(src.cols, threads.x);
         grid.y = divUp(src.rows, threads.y);

-        imgproc::YCrCb2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.ptr, src.step,
+        imgproc_krnls::YCrCb2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.ptr, src.step,
             dst.ptr, dst.step, src.rows, src.cols, bidx);

         if (stream == 0)
@@ -806,7 +799,7 @@ namespace cv { namespace gpu { namespace improc
             {YCrCb2RGB_caller<uchar, 4, 3>, YCrCb2RGB_caller<uchar, 4, 4>}
         };

-        cudaSafeCall( cudaMemcpyToSymbol(imgproc::cYCrCbCoeffs_i, coeffs, 4 * sizeof(int)) );
+        cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cYCrCbCoeffs_i, coeffs, 4 * sizeof(int)) );

         YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream);
     }
@@ -820,7 +813,7 @@ namespace cv { namespace gpu { namespace improc
             {YCrCb2RGB_caller<ushort, 4, 3>, YCrCb2RGB_caller<ushort, 4, 4>}
         };

-        cudaSafeCall( cudaMemcpyToSymbol(imgproc::cYCrCbCoeffs_i, coeffs, 4 * sizeof(int)) );
+        cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cYCrCbCoeffs_i, coeffs, 4 * sizeof(int)) );

         YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream);
     }
@@ -834,7 +827,7 @@ namespace cv { namespace gpu { namespace improc
             {YCrCb2RGB_caller<float, 4, 3>, YCrCb2RGB_caller<float, 4, 4>}
         };

-        cudaSafeCall( cudaMemcpyToSymbol(imgproc::cYCrCbCoeffs_f, coeffs, 4 * sizeof(float)) );
+        cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cYCrCbCoeffs_f, coeffs, 4 * sizeof(float)) );

         YCrCb2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, stream);
     }
@@ -842,7 +835,7 @@ namespace cv { namespace gpu { namespace improc
 ////////////////////////////////////// RGB <-> XYZ ///////////////////////////////////////

-namespace imgproc
+namespace imgproc_krnls
 {
     __constant__ float cXYZ_D65f[9];
     __constant__ int cXYZ_D65i[9];
@@ -931,7 +924,7 @@ namespace imgproc
     }
 }

-namespace cv { namespace gpu { namespace improc
+namespace cv { namespace gpu { namespace imgproc
 {
     template <typename T, int SRCCN, int DSTCN>
     void RGB2XYZ_caller(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream)
@@ -942,7 +935,7 @@ namespace cv { namespace gpu { namespace improc
         grid.x = divUp(src.cols, threads.x);
         grid.y = divUp(src.rows, threads.y);

-        imgproc::RGB2XYZ<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.ptr, src.step,
+        imgproc_krnls::RGB2XYZ<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.ptr, src.step,
             dst.ptr, dst.step, src.rows, src.cols);

         if (stream == 0)
@@ -958,7 +951,7 @@ namespace cv { namespace gpu { namespace improc
             {RGB2XYZ_caller<uchar, 4, 3>, RGB2XYZ_caller<uchar, 4, 4>}
         };

-        cudaSafeCall( cudaMemcpyToSymbol(imgproc::cXYZ_D65i, coeffs, 9 * sizeof(int)) );
+        cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cXYZ_D65i, coeffs, 9 * sizeof(int)) );

         RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, stream);
     }
@@ -972,7 +965,7 @@ namespace cv { namespace gpu { namespace improc
             {RGB2XYZ_caller<ushort, 4, 3>, RGB2XYZ_caller<ushort, 4, 4>}
         };

-        cudaSafeCall( cudaMemcpyToSymbol(imgproc::cXYZ_D65i, coeffs, 9 * sizeof(int)) );
+        cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cXYZ_D65i, coeffs, 9 * sizeof(int)) );

         RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, stream);
     }
@@ -986,7 +979,7 @@ namespace cv { namespace gpu { namespace improc
             {RGB2XYZ_caller<float, 4, 3>, RGB2XYZ_caller<float, 4, 4>}
         };

-        cudaSafeCall( cudaMemcpyToSymbol(imgproc::cXYZ_D65f, coeffs, 9 * sizeof(float)) );
+        cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cXYZ_D65f, coeffs, 9 * sizeof(float)) );

         RGB2XYZ_callers[srccn-3][dstcn-3](src, dst, stream);
     }
@@ -1000,7 +993,7 @@ namespace cv { namespace gpu { namespace improc
         grid.x = divUp(src.cols, threads.x);
         grid.y = divUp(src.rows, threads.y);

-        imgproc::XYZ2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.ptr, src.step,
+        imgproc_krnls::XYZ2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.ptr, src.step,
             dst.ptr, dst.step, src.rows, src.cols);

         if (stream == 0)
@@ -1016,7 +1009,7 @@ namespace cv { namespace gpu { namespace improc
             {XYZ2RGB_caller<uchar, 4, 3>, XYZ2RGB_caller<uchar, 4, 4>}
         };

-        cudaSafeCall( cudaMemcpyToSymbol(imgproc::cXYZ_D65i, coeffs, 9 * sizeof(int)) );
+        cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cXYZ_D65i, coeffs, 9 * sizeof(int)) );

         XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, stream);
     }
@@ -1030,7 +1023,7 @@ namespace cv { namespace gpu { namespace improc
             {XYZ2RGB_caller<ushort, 4, 3>, XYZ2RGB_caller<ushort, 4, 4>}
         };

-        cudaSafeCall( cudaMemcpyToSymbol(imgproc::cXYZ_D65i, coeffs, 9 * sizeof(int)) );
+        cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cXYZ_D65i, coeffs, 9 * sizeof(int)) );

         XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, stream);
     }
@@ -1044,7 +1037,7 @@ namespace cv { namespace gpu { namespace improc
             {XYZ2RGB_caller<float, 4, 3>, XYZ2RGB_caller<float, 4, 4>}
         };

-        cudaSafeCall( cudaMemcpyToSymbol(imgproc::cXYZ_D65f, coeffs, 9 * sizeof(float)) );
+        cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cXYZ_D65f, coeffs, 9 * sizeof(float)) );

         XYZ2RGB_callers[srccn-3][dstcn-3](src, dst, stream);
     }
@@ -1052,7 +1045,7 @@ namespace cv { namespace gpu { namespace improc
 ////////////////////////////////////// RGB <-> HSV ///////////////////////////////////////

-namespace imgproc
+namespace imgproc_krnls
 {
     __constant__ int cHsvDivTable[256];
@@ -1229,7 +1222,7 @@ namespace imgproc
     }
 }

-namespace cv { namespace gpu { namespace improc
+namespace cv { namespace gpu { namespace imgproc
 {
     template <typename T, int SRCCN, int DSTCN>
     void RGB2HSV_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream)
@@ -1241,10 +1234,10 @@ namespace cv { namespace gpu { namespace improc
         grid.y = divUp(src.rows, threads.y);

         if (hrange == 180)
-            imgproc::RGB2HSV<SRCCN, DSTCN, 180, T><<<grid, threads, 0, stream>>>(src.ptr, src.step,
+            imgproc_krnls::RGB2HSV<SRCCN, DSTCN, 180, T><<<grid, threads, 0, stream>>>(src.ptr, src.step,
                 dst.ptr, dst.step, src.rows, src.cols, bidx);
         else
-            imgproc::RGB2HSV<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.ptr, src.step,
+            imgproc_krnls::RGB2HSV<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.ptr, src.step,
                 dst.ptr, dst.step, src.rows, src.cols, bidx);

         if (stream == 0)
@@ -1295,7 +1288,7 @@ namespace cv { namespace gpu { namespace improc
             4352, 4334, 4316, 4298, 4281, 4263, 4246, 4229,
             4212, 4195, 4178, 4161, 4145, 4128, 4112, 4096
         };

-        cudaSafeCall( cudaMemcpyToSymbol(imgproc::cHsvDivTable, div_table, sizeof(div_table)) );
+        cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cHsvDivTable, div_table, sizeof(div_table)) );

         RGB2HSV_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream);
     }
@@ -1323,10 +1316,10 @@ namespace cv { namespace gpu { namespace improc
         grid.y = divUp(src.rows, threads.y);

         if (hrange == 180)
-            imgproc::HSV2RGB<SRCCN, DSTCN, 180, T><<<grid, threads, 0, stream>>>(src.ptr, src.step,
+            imgproc_krnls::HSV2RGB<SRCCN, DSTCN, 180, T><<<grid, threads, 0, stream>>>(src.ptr, src.step,
                 dst.ptr, dst.step, src.rows, src.cols, bidx);
         else
-            imgproc::HSV2RGB<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.ptr, src.step,
+            imgproc_krnls::HSV2RGB<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.ptr, src.step,
                 dst.ptr, dst.step, src.rows, src.cols, bidx);

         if (stream == 0)
@@ -1345,7 +1338,7 @@ namespace cv { namespace gpu { namespace improc
         static const int sector_data[][3] =
             {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};

-        cudaSafeCall( cudaMemcpyToSymbol(imgproc::cHsvSectorData, sector_data, sizeof(sector_data)) );
+        cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cHsvSectorData, sector_data, sizeof(sector_data)) );

         HSV2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream);
     }
@@ -1362,7 +1355,7 @@ namespace cv { namespace gpu { namespace improc
         static const int sector_data[][3] =
             {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};

-        cudaSafeCall( cudaMemcpyToSymbol(imgproc::cHsvSectorData, sector_data, sizeof(sector_data)) );
+        cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cHsvSectorData, sector_data, sizeof(sector_data)) );

         HSV2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream);
     }
@@ -1370,7 +1363,7 @@ namespace cv { namespace gpu { namespace improc
 /////////////////////////////////////// RGB <-> HLS ////////////////////////////////////////

-namespace imgproc
+namespace imgproc_krnls
 {
     template<typename T, int HR> struct RGB2HLSConvertor;
     template<int HR> struct RGB2HLSConvertor<float, HR>
@@ -1541,7 +1534,7 @@ namespace imgproc
     }
 }

-namespace cv { namespace gpu { namespace improc
+namespace cv { namespace gpu { namespace imgproc
 {
     template <typename T, int SRCCN, int DSTCN>
     void RGB2HLS_caller(const DevMem2D& src, const DevMem2D& dst, int bidx, int hrange, cudaStream_t stream)
@@ -1553,10 +1546,10 @@ namespace cv { namespace gpu { namespace improc
         grid.y = divUp(src.rows, threads.y);

         if (hrange == 180)
-            imgproc::RGB2HLS<SRCCN, DSTCN, 180, T><<<grid, threads, 0, stream>>>(src.ptr, src.step,
+            imgproc_krnls::RGB2HLS<SRCCN, DSTCN, 180, T><<<grid, threads, 0, stream>>>(src.ptr, src.step,
                 dst.ptr, dst.step, src.rows, src.cols, bidx);
         else
-            imgproc::RGB2HLS<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.ptr, src.step,
+            imgproc_krnls::RGB2HLS<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.ptr, src.step,
                 dst.ptr, dst.step, src.rows, src.cols, bidx);

         if (stream == 0)
@@ -1598,10 +1591,10 @@ namespace cv { namespace gpu { namespace improc
         grid.y = divUp(src.rows, threads.y);

         if (hrange == 180)
-            imgproc::HLS2RGB<SRCCN, DSTCN, 180, T><<<grid, threads, 0, stream>>>(src.ptr, src.step,
+            imgproc_krnls::HLS2RGB<SRCCN, DSTCN, 180, T><<<grid, threads, 0, stream>>>(src.ptr, src.step,
                 dst.ptr, dst.step, src.rows, src.cols, bidx);
         else
-            imgproc::HLS2RGB<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.ptr, src.step,
+            imgproc_krnls::HLS2RGB<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.ptr, src.step,
                 dst.ptr, dst.step, src.rows, src.cols, bidx);

         if (stream == 0)
@@ -1620,7 +1613,7 @@ namespace cv { namespace gpu { namespace improc
         static const int sector_data[][3]=
             {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};

-        cudaSafeCall( cudaMemcpyToSymbol(imgproc::cHlsSectorData, sector_data, sizeof(sector_data)) );
+        cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cHlsSectorData, sector_data, sizeof(sector_data)) );

         HLS2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream);
     }
@@ -1637,7 +1630,7 @@ namespace cv { namespace gpu { namespace improc
         static const int sector_data[][3]=
             {{1,3,0}, {1,0,2}, {3,0,1}, {0,2,1}, {0,1,3}, {2,1,0}};

-        cudaSafeCall( cudaMemcpyToSymbol(imgproc::cHlsSectorData, sector_data, sizeof(sector_data)) );
+        cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cHlsSectorData, sector_data, sizeof(sector_data)) );

         HLS2RGB_callers[srccn-3][dstcn-3](src, dst, bidx, hrange, stream);
     }
...
@@ -54,20 +54,18 @@ using namespace cv::gpu;
 #define SHRT_MAX 32767
 #endif

-template <typename T>
-struct TypeLimits {};
-
-template <>
-struct TypeLimits<short>
-{
-    static __device__ short max() {return SHRT_MAX;}
-};
-
-template <>
-struct TypeLimits<float>
-{
-    static __device__ float max() {return FLT_MAX;}
-};
+namespace csbp_krnls
+{
+    template <typename T> struct TypeLimits;
+
+    template <> struct TypeLimits<short>
+    {
+        static __device__ short max() {return SHRT_MAX;}
+    };
+
+    template <> struct TypeLimits<float>
+    {
+        static __device__ float max() {return FLT_MAX;}
+    };
+}

 ///////////////////////////////////////////////////////////////
 /////////////////////// load constants ////////////////////////
...
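
The csbp_krnls::TypeLimits trait moved into that kernel namespace gives device code a per-type "infinity" for disparity costs. A hedged sketch of the pattern it enables (the clamp function is hypothetical, not part of this commit):

    // Hypothetical use: clamp an accumulated cost to the message type's maximum.
    template <typename T>
    __device__ T clampCost(float val)
    {
        float cap = csbp_krnls::TypeLimits<T>::max();
        return (T)(val < cap ? val : cap);
    }
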
@@ -58,19 +58,8 @@ namespace cv
         static inline int divUp(int a, int b) { return (a % b == 0) ? a/b : a/b + 1; }

-        namespace matrix_operations
-        {
-            extern "C" void copy_to_with_mask(const DevMem2D& src, DevMem2D dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
-
-            extern "C" void set_to_without_mask (DevMem2D dst, int depth, const double *scalar, int channels, const cudaStream_t & stream = 0);
-            extern "C" void set_to_with_mask (DevMem2D dst, int depth, const double *scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
-
-            extern "C" void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, int channels, double alpha, double beta, const cudaStream_t & stream = 0);
-        }
-
         template<class T>
-        inline void uploadConstant(const char* name, const T& value) { cudaSafeCall( cudaMemcpyToSymbol(name, &value, sizeof(T)) ); }
+        static inline void uploadConstant(const char* name, const T& value) { cudaSafeCall( cudaMemcpyToSymbol(name, &value, sizeof(T)) ); }
     }
 }
...
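
uploadConstant is now file-local (static) and remains the helper .cu files use to push a host value into a __constant__ symbol by name. A hedged sketch, assuming it lives in cv::gpu as the surrounding namespaces suggest and using a hypothetical symbol:

    // Hypothetical __constant__ scalar in some kernel translation unit.
    __constant__ float cThreshold;

    void setThreshold(float value)
    {
        // Copies sizeof(float) bytes from host memory to the named device symbol.
        cv::gpu::uploadConstant("cThreshold", value);
    }
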
@@ -43,6 +43,7 @@
 #include "opencv2/gpu/devmem2d.hpp"
 #include "saturate_cast.hpp"
 #include "safe_call.hpp"
+#include "cuda_shared.hpp"

 using namespace cv::gpu;
@@ -50,6 +51,227 @@ using namespace cv::gpu;
 #define FLT_MAX 3.402823466e+30F
 #endif

+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Linear filters
+
+#define MAX_KERNEL_SIZE 16
+
+namespace filter_krnls
+{
+    __constant__ float cLinearKernel[MAX_KERNEL_SIZE];
+}
+
+namespace cv { namespace gpu { namespace filters
+{
+    void loadLinearKernel(const float kernel[], int ksize)
+    {
+        cudaSafeCall( cudaMemcpyToSymbol(filter_krnls::cLinearKernel, kernel, ksize * sizeof(float)) );
+    }
+}}}
+
+namespace filter_krnls
+{
+    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int KERNEL_SIZE, typename T, typename D>
+    __global__ void linearRowFilter(const T* src, size_t src_step, D* dst, size_t dst_step, int anchor, int width, int height)
+    {
+        __shared__ T smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3];
+
+        const int blockStartX = blockDim.x * blockIdx.x;
+        const int blockStartY = blockDim.y * blockIdx.y;
+
+        const int threadX = blockStartX + threadIdx.x;
+        const int prevThreadX = threadX - blockDim.x;
+        const int nextThreadX = threadX + blockDim.x;
+
+        const int threadY = blockStartY + threadIdx.y;
+
+        T* sDataRow = smem + threadIdx.y * blockDim.x * 3;
+
+        if (threadY < height)
+        {
+            const T* rowSrc = src + threadY * src_step;
+
+            sDataRow[threadIdx.x + blockDim.x] = threadX < width ? rowSrc[threadX] : 0;
+            sDataRow[threadIdx.x] = prevThreadX >= 0 ? rowSrc[prevThreadX] : 0;
+            sDataRow[(blockDim.x << 1) + threadIdx.x] = nextThreadX < width ? rowSrc[nextThreadX] : 0;
+
+            __syncthreads();
+
+            if (threadX < width)
+            {
+                float sum = 0;
+
+                sDataRow += threadIdx.x + blockDim.x - anchor;
+
+                #pragma unroll
+                for(int i = 0; i < KERNEL_SIZE; ++i)
+                    sum += cLinearKernel[i] * sDataRow[i];
+
+                dst[threadY * dst_step + threadX] = saturate_cast<D>(sum);
+            }
+        }
+    }
+}
+
+namespace cv { namespace gpu { namespace filters
+{
+    template <int KERNEL_SIZE, typename T, typename D>
+    void linearRowFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor)
+    {
+        const int BLOCK_DIM_X = 16;
+        const int BLOCK_DIM_Y = 16;
+
+        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
+        dim3 blocks(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
+
+        filter_krnls::linearRowFilter<BLOCK_DIM_X, BLOCK_DIM_Y, KERNEL_SIZE><<<blocks, threads>>>(src.ptr, src.elem_step,
+            dst.ptr, dst.elem_step, anchor, src.cols, src.rows);
+
+        cudaSafeCall( cudaThreadSynchronize() );
+    }
+
+    template <typename T, typename D>
+    inline void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
+    {
+        typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor);
+        static const caller_t callers[] =
+            {linearRowFilter_caller<0 , T, D>, linearRowFilter_caller<1 , T, D>,
+             linearRowFilter_caller<2 , T, D>, linearRowFilter_caller<3 , T, D>,
+             linearRowFilter_caller<4 , T, D>, linearRowFilter_caller<5 , T, D>,
+             linearRowFilter_caller<6 , T, D>, linearRowFilter_caller<7 , T, D>,
+             linearRowFilter_caller<8 , T, D>, linearRowFilter_caller<9 , T, D>,
+             linearRowFilter_caller<10, T, D>, linearRowFilter_caller<11, T, D>,
+             linearRowFilter_caller<12, T, D>, linearRowFilter_caller<13, T, D>,
+             linearRowFilter_caller<14, T, D>, linearRowFilter_caller<15, T, D>};
+
+        loadLinearKernel(kernel, ksize);
+
+        callers[ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor);
+    }
+
+    void linearRowFilter_gpu_32s32s(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
+    {
+        linearRowFilter_gpu<int, int>(src, dst, kernel, ksize, anchor);
+    }
+    void linearRowFilter_gpu_32s32f(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
+    {
+        linearRowFilter_gpu<int, float>(src, dst, kernel, ksize, anchor);
+    }
+    void linearRowFilter_gpu_32f32s(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
+    {
+        linearRowFilter_gpu<float, int>(src, dst, kernel, ksize, anchor);
+    }
+    void linearRowFilter_gpu_32f32f(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
+    {
+        linearRowFilter_gpu<float, float>(src, dst, kernel, ksize, anchor);
+    }
+}}}
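
Each *_gpu entry point dispatches through a table of kernel-size-specialized callers, so the convolution loop is fully unrolled at compile time for every supported kernel width. A hedged host-side sketch of driving the row pass directly (inside the module these DevMem2D views come from GpuMat):

    // Sketch only: 3-tap row smoothing, CV_32FC1 source and destination.
    // src and dst are pitched device buffers of identical size wrapped as DevMem2D.
    void smoothRows(const cv::gpu::DevMem2D& src, const cv::gpu::DevMem2D& dst)
    {
        const float kernel[3] = { 0.25f, 0.5f, 0.25f };
        cv::gpu::filters::linearRowFilter_gpu_32f32f(src, dst, kernel, 3, 1); // anchor = center tap
    }
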
+
+namespace filter_krnls
+{
+    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int KERNEL_SIZE, typename T, typename D>
+    __global__ void linearColumnFilter(const T* src, size_t src_step, D* dst, size_t dst_step, int anchor, int width, int height)
+    {
+        __shared__ T smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3];
+
+        const int blockStartX = blockDim.x * blockIdx.x;
+        const int blockStartY = blockDim.y * blockIdx.y;
+
+        const int threadX = blockStartX + threadIdx.x;
+
+        const int threadY = blockStartY + threadIdx.y;
+        const int prevThreadY = threadY - blockDim.y;
+        const int nextThreadY = threadY + blockDim.y;
+
+        const int smem_step = blockDim.x;
+
+        T* sDataColumn = smem + threadIdx.x;
+
+        if (threadX < width)
+        {
+            const T* colSrc = src + threadX;
+
+            sDataColumn[(threadIdx.y + blockDim.y) * smem_step] = threadY < height ? colSrc[threadY * src_step] : 0;
+            sDataColumn[threadIdx.y * smem_step] = prevThreadY >= 0 ? colSrc[prevThreadY * src_step] : 0;
+            sDataColumn[(threadIdx.y + (blockDim.y << 1)) * smem_step] = nextThreadY < height ? colSrc[nextThreadY * src_step] : 0;
+
+            __syncthreads();
+
+            if (threadY < height)
+            {
+                float sum = 0;
+
+                sDataColumn += (threadIdx.y + blockDim.y - anchor) * smem_step;
+
+                #pragma unroll
+                for(int i = 0; i < KERNEL_SIZE; ++i)
+                    sum += cLinearKernel[i] * sDataColumn[i * smem_step];
+
+                dst[threadY * dst_step + threadX] = saturate_cast<D>(sum);
+            }
+        }
+    }
+}
+
+namespace cv { namespace gpu { namespace filters
+{
+    template <int KERNEL_SIZE, typename T, typename D>
+    void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor)
+    {
+        const int BLOCK_DIM_X = 16;
+        const int BLOCK_DIM_Y = 16;
+
+        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
+        dim3 blocks(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
+
+        filter_krnls::linearColumnFilter<BLOCK_DIM_X, BLOCK_DIM_Y, KERNEL_SIZE><<<blocks, threads>>>(src.ptr, src.elem_step,
+            dst.ptr, dst.elem_step, anchor, src.cols, src.rows);
+
+        cudaSafeCall( cudaThreadSynchronize() );
+    }
+
+    template <typename T, typename D>
+    inline void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
+    {
+        typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor);
+        static const caller_t callers[] =
+            {linearColumnFilter_caller<0 , T, D>, linearColumnFilter_caller<1 , T, D>,
+             linearColumnFilter_caller<2 , T, D>, linearColumnFilter_caller<3 , T, D>,
+             linearColumnFilter_caller<4 , T, D>, linearColumnFilter_caller<5 , T, D>,
+             linearColumnFilter_caller<6 , T, D>, linearColumnFilter_caller<7 , T, D>,
+             linearColumnFilter_caller<8 , T, D>, linearColumnFilter_caller<9 , T, D>,
+             linearColumnFilter_caller<10, T, D>, linearColumnFilter_caller<11, T, D>,
+             linearColumnFilter_caller<12, T, D>, linearColumnFilter_caller<13, T, D>,
+             linearColumnFilter_caller<14, T, D>, linearColumnFilter_caller<15, T, D>};
+
+        loadLinearKernel(kernel, ksize);
+
+        callers[ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor);
+    }
+
+    void linearColumnFilter_gpu_32s32s(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
+    {
+        linearColumnFilter_gpu<int, int>(src, dst, kernel, ksize, anchor);
+    }
+    void linearColumnFilter_gpu_32s32f(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
+    {
+        linearColumnFilter_gpu<int, float>(src, dst, kernel, ksize, anchor);
+    }
+    void linearColumnFilter_gpu_32f32s(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
+    {
+        linearColumnFilter_gpu<float, int>(src, dst, kernel, ksize, anchor);
+    }
+    void linearColumnFilter_gpu_32f32f(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
+    {
+        linearColumnFilter_gpu<float, float>(src, dst, kernel, ksize, anchor);
+    }
+}}}
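
A row pass and a column pass compose into a full separable filter: the row stage writes an intermediate buffer that the column stage consumes. A hedged sketch of the two-stage composition (buffer management is simplified; the FilterEngine in the gpu module handles this for real):

    // Sketch only: separable convolution, float in / float out.
    void sepFilter32f(const cv::gpu::DevMem2D& src, const cv::gpu::DevMem2D& buf,
                      const cv::gpu::DevMem2D& dst,
                      const float rowKernel[], int rowKsize,
                      const float colKernel[], int colKsize)
    {
        using namespace cv::gpu::filters;
        // Each call reloads the constant-memory kernel before launching.
        linearRowFilter_gpu_32f32f(src, buf, rowKernel, rowKsize, rowKsize / 2);
        linearColumnFilter_gpu_32f32f(buf, dst, colKernel, colKsize, colKsize / 2);
    }
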
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+// Bilateral filters
+
 namespace bf_krnls
 {
     __constant__ float* ctable_color;
...
@@ -45,7 +45,7 @@
 using namespace cv::gpu;

 /////////////////////////////////// Remap ///////////////////////////////////////////////

-namespace imgproc
+namespace imgproc_krnls
 {
     texture<unsigned char, 2, cudaReadModeNormalizedFloat> tex_remap;
@@ -123,7 +123,7 @@ namespace imgproc
     }
 }

-namespace cv { namespace gpu { namespace improc
+namespace cv { namespace gpu { namespace imgproc
 {
     void remap_gpu_1c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst)
     {
@@ -132,15 +132,15 @@ namespace cv { namespace gpu { namespace improc
         grid.x = divUp(dst.cols, threads.x);
         grid.y = divUp(dst.rows, threads.y);

-        imgproc::tex_remap.filterMode = cudaFilterModeLinear;
-        imgproc::tex_remap.addressMode[0] = imgproc::tex_remap.addressMode[1] = cudaAddressModeWrap;
+        imgproc_krnls::tex_remap.filterMode = cudaFilterModeLinear;
+        imgproc_krnls::tex_remap.addressMode[0] = imgproc_krnls::tex_remap.addressMode[1] = cudaAddressModeWrap;
         cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
-        cudaSafeCall( cudaBindTexture2D(0, imgproc::tex_remap, src.ptr, desc, src.cols, src.rows, src.step) );
+        cudaSafeCall( cudaBindTexture2D(0, imgproc_krnls::tex_remap, src.ptr, desc, src.cols, src.rows, src.step) );

-        imgproc::remap_1c<<<grid, threads>>>(xmap.ptr, ymap.ptr, xmap.step, dst.ptr, dst.step, dst.cols, dst.rows);
+        imgproc_krnls::remap_1c<<<grid, threads>>>(xmap.ptr, ymap.ptr, xmap.step, dst.ptr, dst.step, dst.cols, dst.rows);

         cudaSafeCall( cudaThreadSynchronize() );
-        cudaSafeCall( cudaUnbindTexture(imgproc::tex_remap) );
+        cudaSafeCall( cudaUnbindTexture(imgproc_krnls::tex_remap) );
     }

     void remap_gpu_3c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst)
@@ -150,7 +150,7 @@ namespace cv { namespace gpu { namespace improc
         grid.x = divUp(dst.cols, threads.x);
         grid.y = divUp(dst.rows, threads.y);

-        imgproc::remap_3c<<<grid, threads>>>(src.ptr, src.step, xmap.ptr, ymap.ptr, xmap.step, dst.ptr, dst.step, dst.cols, dst.rows);
+        imgproc_krnls::remap_3c<<<grid, threads>>>(src.ptr, src.step, xmap.ptr, ymap.ptr, xmap.step, dst.ptr, dst.step, dst.cols, dst.rows);

         cudaSafeCall( cudaThreadSynchronize() );
     }
@@ -159,7 +159,7 @@ namespace cv { namespace gpu { namespace improc
 /////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////

-namespace imgproc
+namespace imgproc_krnls
 {
     texture<uchar4, 2> tex_meanshift;
@@ -254,7 +254,7 @@ namespace imgproc
     }
 }

-namespace cv { namespace gpu { namespace improc
+namespace cv { namespace gpu { namespace imgproc
 {
     extern "C" void meanShiftFiltering_gpu(const DevMem2D& src, DevMem2D dst, int sp, int sr, int maxIter, float eps)
     {
@@ -264,11 +264,11 @@ namespace cv { namespace gpu { namespace improc
         grid.y = divUp(src.rows, threads.y);

         cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
-        cudaSafeCall( cudaBindTexture2D( 0, imgproc::tex_meanshift, src.ptr, desc, src.cols, src.rows, src.step ) );
+        cudaSafeCall( cudaBindTexture2D( 0, imgproc_krnls::tex_meanshift, src.ptr, desc, src.cols, src.rows, src.step ) );

-        imgproc::meanshift_kernel<<< grid, threads >>>( dst.ptr, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
+        imgproc_krnls::meanshift_kernel<<< grid, threads >>>( dst.ptr, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
         cudaSafeCall( cudaThreadSynchronize() );
-        cudaSafeCall( cudaUnbindTexture( imgproc::tex_meanshift ) );
+        cudaSafeCall( cudaUnbindTexture( imgproc_krnls::tex_meanshift ) );
     }

     extern "C" void meanShiftProc_gpu(const DevMem2D& src, DevMem2D dstr, DevMem2D dstsp, int sp, int sr, int maxIter, float eps)
     {
@@ -278,17 +278,17 @@ namespace cv { namespace gpu { namespace improc
         grid.y = divUp(src.rows, threads.y);

         cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
-        cudaSafeCall( cudaBindTexture2D( 0, imgproc::tex_meanshift, src.ptr, desc, src.cols, src.rows, src.step ) );
+        cudaSafeCall( cudaBindTexture2D( 0, imgproc_krnls::tex_meanshift, src.ptr, desc, src.cols, src.rows, src.step ) );

-        imgproc::meanshiftproc_kernel<<< grid, threads >>>( dstr.ptr, dstr.step, dstsp.ptr, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
+        imgproc_krnls::meanshiftproc_kernel<<< grid, threads >>>( dstr.ptr, dstr.step, dstsp.ptr, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
         cudaSafeCall( cudaThreadSynchronize() );
-        cudaSafeCall( cudaUnbindTexture( imgproc::tex_meanshift ) );
+        cudaSafeCall( cudaUnbindTexture( imgproc_krnls::tex_meanshift ) );
     }
 }}}
/////////////////////////////////// drawColorDisp /////////////////////////////////////////////// /////////////////////////////////// drawColorDisp ///////////////////////////////////////////////
namespace imgproc namespace imgproc_krnls
{ {
template <typename T> template <typename T>
__device__ unsigned int cvtPixel(T d, int ndisp, float S = 1, float V = 1) __device__ unsigned int cvtPixel(T d, int ndisp, float S = 1, float V = 1)
...@@ -391,7 +391,7 @@ namespace imgproc ...@@ -391,7 +391,7 @@ namespace imgproc
} }
} }
namespace cv { namespace gpu { namespace improc namespace cv { namespace gpu { namespace imgproc
{ {
void drawColorDisp_gpu(const DevMem2D& src, const DevMem2D& dst, int ndisp, const cudaStream_t& stream) void drawColorDisp_gpu(const DevMem2D& src, const DevMem2D& dst, int ndisp, const cudaStream_t& stream)
{ {
...@@ -400,7 +400,7 @@ namespace cv { namespace gpu { namespace improc ...@@ -400,7 +400,7 @@ namespace cv { namespace gpu { namespace improc
grid.x = divUp(src.cols, threads.x << 2); grid.x = divUp(src.cols, threads.x << 2);
grid.y = divUp(src.rows, threads.y); grid.y = divUp(src.rows, threads.y);
imgproc::drawColorDisp<<<grid, threads, 0, stream>>>(src.ptr, src.step, dst.ptr, dst.step, src.cols, src.rows, ndisp); imgproc_krnls::drawColorDisp<<<grid, threads, 0, stream>>>(src.ptr, src.step, dst.ptr, dst.step, src.cols, src.rows, ndisp);
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
...@@ -413,7 +413,7 @@ namespace cv { namespace gpu { namespace improc ...@@ -413,7 +413,7 @@ namespace cv { namespace gpu { namespace improc
grid.x = divUp(src.cols, threads.x << 1); grid.x = divUp(src.cols, threads.x << 1);
grid.y = divUp(src.rows, threads.y); grid.y = divUp(src.rows, threads.y);
imgproc::drawColorDisp<<<grid, threads, 0, stream>>>(src.ptr, src.step / sizeof(short), dst.ptr, dst.step, src.cols, src.rows, ndisp); imgproc_krnls::drawColorDisp<<<grid, threads, 0, stream>>>(src.ptr, src.step / sizeof(short), dst.ptr, dst.step, src.cols, src.rows, ndisp);
if (stream == 0) if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() ); cudaSafeCall( cudaThreadSynchronize() );
...@@ -422,7 +422,7 @@ namespace cv { namespace gpu { namespace improc ...@@ -422,7 +422,7 @@ namespace cv { namespace gpu { namespace improc
/////////////////////////////////// reprojectImageTo3D /////////////////////////////////////////////// /////////////////////////////////// reprojectImageTo3D ///////////////////////////////////////////////
namespace imgproc namespace imgproc_krnls
{ {
__constant__ float cq[16]; __constant__ float cq[16];
...@@ -457,7 +457,7 @@ namespace imgproc ...@@ -457,7 +457,7 @@ namespace imgproc
} }
} }
namespace cv { namespace gpu { namespace imgproc
{
    template <typename T>
    inline void reprojectImageTo3D_caller(const DevMem2D_<T>& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream)
    {
@@ -467,9 +467,9 @@ namespace cv { namespace gpu { namespace imgproc
        grid.x = divUp(disp.cols, threads.x);
        grid.y = divUp(disp.rows, threads.y);

        cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cq, q, 16 * sizeof(float)) );

        imgproc_krnls::reprojectImageTo3D<<<grid, threads, 0, stream>>>(disp.ptr, disp.step / sizeof(T), xyzw.ptr, xyzw.step / sizeof(float), disp.rows, disp.cols);

        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
...
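The reprojectImageTo3D_caller above shows the launch pattern these files use throughout: a small parameter block is copied to __constant__ memory with cudaMemcpyToSymbol, the kernel is launched on the caller's stream, and cudaThreadSynchronize() is issued only for the default stream. A self-contained sketch of that pattern; the kernel and the names c_q/use_q are illustrative, not part of this commit:

__constant__ float c_q[16];                       // illustrative constant-memory parameter block

__global__ void use_q(const float* src, float* dst, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        dst[i] = c_q[0] * src[i] + c_q[3];        // read the uploaded parameters
}

void use_q_caller(const float* src, float* dst, int n, const float* q, cudaStream_t stream)
{
    cudaSafeCall( cudaMemcpyToSymbol(c_q, q, 16 * sizeof(float)) );
    dim3 threads(256);
    dim3 grid(divUp(n, threads.x));
    use_q<<<grid, threads, 0, stream>>>(src, dst, n);
    if (stream == 0)                              // default stream: block until the kernel finishes
        cudaSafeCall( cudaThreadSynchronize() );
}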
@@ -41,6 +41,9 @@
//M*/

#include "cuda_shared.hpp"
#include "saturate_cast.hpp"
#include "transform.hpp"
#include "vecmath.hpp"

using namespace cv::gpu;
@@ -48,6 +51,9 @@ using namespace cv::gpu;
#define CV_PI 3.1415926535897932384626433832795f
#endif
//////////////////////////////////////////////////////////////////////////////////////
// Cart <-> Polar
namespace mathfunc_krnls
{
    struct Nothing
@@ -143,8 +149,8 @@ namespace cv { namespace gpu { namespace mathfunc
        const float scale = angleInDegrees ? (float)(180.0f / CV_PI) : 1.f;

        mathfunc_krnls::cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(
            x.ptr, x.elem_step, y.ptr, y.elem_step,
            mag.ptr, mag.elem_step, angle.ptr, angle.elem_step, scale, x.cols, x.rows);

        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
@@ -191,8 +197,8 @@ namespace cv { namespace gpu { namespace mathfunc
        const float scale = angleInDegrees ? (float)(CV_PI / 180.0f) : 1.0f;

        mathfunc_krnls::polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.ptr, mag.elem_step,
            angle.ptr, angle.elem_step, scale, x.ptr, x.elem_step, y.ptr, y.elem_step, mag.cols, mag.rows);

        if (stream == 0)
            cudaSafeCall( cudaThreadSynchronize() );
@@ -210,3 +216,37 @@ namespace cv { namespace gpu { namespace mathfunc
        callers[mag.ptr == 0](mag, angle, x, y, angleInDegrees, stream);
    }
}}}
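The cartToPolar/polarToCart changes above replace every step / sizeof(float) with elem_step, the element-pitch field DevMem2D_ now carries, so the division by the element size happens once at construction instead of at every call site. A minimal sketch of the idea, with Plain2D standing in for the real struct:

// Minimal sketch, assuming elem_step stores the row pitch in elements
// rather than bytes. Plain2D is an illustrative stand-in, not the
// module's DevMem2D_.
template <typename T>
struct Plain2D
{
    T* ptr;
    size_t step;       // row pitch in bytes
    size_t elem_step;  // row pitch in elements == step / sizeof(T)
};

template <typename T>
__device__ T* row(const Plain2D<T>& m, int y)
{
    return m.ptr + y * m.elem_step;   // no per-access division by sizeof(T)
}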
//////////////////////////////////////////////////////////////////////////////////////
// Compare
namespace mathfunc_krnls
{
template <typename T1, typename T2>
struct NotEqual
{
__device__ uchar operator()(const T1& src1, const T2& src2, int, int)
{
return static_cast<uchar>(static_cast<int>(src1 != src2) * 255);
}
};
}
namespace cv { namespace gpu { namespace mathfunc
{
template <typename T1, typename T2>
inline void compare_ne(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst)
{
mathfunc_krnls::NotEqual<T1, T2> op;
transform(static_cast< DevMem2D_<T1> >(src1), static_cast< DevMem2D_<T2> >(src2), dst, op, 0);
}
void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst)
{
compare_ne<uint, uint>(src1, src2, dst);
}
void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst)
{
compare_ne<float, float>(src1, src2, dst);
}
}}}
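compare_ne_8uc4 reinterprets each CV_8UC4 pixel as one 32-bit uint, so a single word comparison tests all four channels at once and any differing byte yields 255 in the mask. A hypothetical call site, assuming pitch-allocated device images of identical size (all names below are assumptions):

void mask_changed_pixels(uchar* d_a, uchar* d_b, uchar* d_mask,
                         int rows, int cols, size_t pitch)
{
    // cols counts pixels; each CV_8UC4 pixel is compared as one uint.
    DevMem2D a(rows, cols, d_a, pitch);
    DevMem2D b(rows, cols, d_b, pitch);
    DevMem2D mask(rows, cols, d_mask, pitch);
    cv::gpu::mathfunc::compare_ne_8uc4(a, b, mask);  // mask = 255 where any channel differs
}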
@@ -47,73 +47,84 @@
#include "saturate_cast.hpp"

using namespace cv::gpu;
using namespace cv::gpu::matrix_operations;
namespace matop_krnls
{
    template <typename T> struct shift_and_sizeof;
    template <> struct shift_and_sizeof<char> { enum { shift = 0 }; };
    template <> struct shift_and_sizeof<unsigned char> { enum { shift = 0 }; };
    template <> struct shift_and_sizeof<short> { enum { shift = 1 }; };
    template <> struct shift_and_sizeof<unsigned short> { enum { shift = 1 }; };
    template <> struct shift_and_sizeof<int> { enum { shift = 2 }; };
    template <> struct shift_and_sizeof<float> { enum { shift = 2 }; };
    template <> struct shift_and_sizeof<double> { enum { shift = 3 }; };

    template <typename T, typename DT, size_t src_elem_size, size_t dst_elem_size>
    struct ReadWriteTraits
    {
        enum {shift=1};

        typedef T read_type;
        typedef DT write_type;
    };

    template <typename T, typename DT>
    struct ReadWriteTraits<T, DT, 1, 1>
    {
        enum {shift=4};

        typedef char4 read_type;
        typedef char4 write_type;
    };

    template <typename T, typename DT>
    struct ReadWriteTraits<T, DT, 2, 1>
    {
        enum {shift=4};

        typedef short4 read_type;
        typedef char4 write_type;
    };

    template <typename T, typename DT>
    struct ReadWriteTraits<T, DT, 4, 1>
    {
        enum {shift=4};

        typedef int4 read_type;
        typedef char4 write_type;
    };

    template <typename T, typename DT>
    struct ReadWriteTraits<T, DT, 1, 2>
    {
        enum {shift=2};

        typedef char2 read_type;
        typedef short2 write_type;
    };

    template <typename T, typename DT>
    struct ReadWriteTraits<T, DT, 2, 2>
    {
        enum {shift=2};

        typedef short2 read_type;
        typedef short2 write_type;
    };

    template <typename T, typename DT>
    struct ReadWriteTraits<T, DT, 4, 2>
    {
        enum {shift=2};

        typedef int2 read_type;
        typedef short2 write_type;
    };
}
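ReadWriteTraits selects a vector word so that each thread of the convert_to kernel below moves several elements per memory transaction: 1-byte-to-1-byte conversions go four at a time through char4 (shift = 4), 2-byte destinations two at a time through the *2 types, and everything else falls back to scalar access (shift = 1). A toy kernel showing the payoff of the 1-byte/1-byte case (illustrative only, not part of this commit):

// Illustrative only: copy four uchar pixels per thread through one char4
// load/store, the same access pattern ReadWriteTraits selects for the
// 1-byte/1-byte case above. n4 is the element count divided by 4.
__global__ void copy_x4(const char4* src, char4* dst, int n4)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n4)
        dst[i] = src[i];   // one 32-bit transaction instead of four 8-bit ones
}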
///////////////////////////////////////////////////////////////////////////
////////////////////////////////// CopyTo /////////////////////////////////
///////////////////////////////////////////////////////////////////////////

namespace matop_krnls
{
    template<typename T>
    __global__ void copy_to_with_mask(T * mat_src, T * mat_dst, const unsigned char * mask, int cols, int rows, int step_mat, int step_mask, int channels)
    {
        size_t x = blockIdx.x * blockDim.x + threadIdx.x;
        size_t y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -125,13 +136,62 @@ namespace mat_operators
            mat_dst[idx] = mat_src[idx];
        }
    }
}
namespace cv { namespace gpu { namespace matrix_operations
{
typedef void (*CopyToFunc)(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream);
template<typename T>
void copy_to_with_mask_run(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream)
{
dim3 threadsPerBlock(16,16, 1);
dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1);
if (stream == 0)
{
::matop_krnls::copy_to_with_mask<T><<<numBlocks,threadsPerBlock>>>
((T*)mat_src.ptr, (T*)mat_dst.ptr, (unsigned char*)mask.ptr, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
cudaSafeCall ( cudaThreadSynchronize() );
}
else
{
::matop_krnls::copy_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>
((T*)mat_src.ptr, (T*)mat_dst.ptr, (unsigned char*)mask.ptr, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
}
}
void copy_to_with_mask(const DevMem2D& mat_src, DevMem2D mat_dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream)
{
static CopyToFunc tab[8] =
{
copy_to_with_mask_run<unsigned char>,
copy_to_with_mask_run<char>,
copy_to_with_mask_run<unsigned short>,
copy_to_with_mask_run<short>,
copy_to_with_mask_run<int>,
copy_to_with_mask_run<float>,
copy_to_with_mask_run<double>,
0
};
CopyToFunc func = tab[depth];
if (func == 0) cv::gpu::error("Unsupported copyTo operation", __FILE__, __LINE__);
func(mat_src, mat_dst, mask, channels, stream);
}
}}}
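The grids above are sized with divUp so a partially filled tile still gets a block. A sketch of that helper as it presumably looks in the module's shared header (cuda_shared.hpp); the definition here is an assumption:

// Rounding-up integer division used for grid sizing (sketch; the real
// definition lives in the module's shared headers).
static inline int divUp(int total, int grain)
{
    return (total + grain - 1) / grain;   // e.g. divUp(1000, 16) == 63
}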
///////////////////////////////////////////////////////////////////////////
////////////////////////////////// SetTo //////////////////////////////////
///////////////////////////////////////////////////////////////////////////
namespace matop_krnls
{
    __constant__ double scalar_d[4];

    template<typename T>
    __global__ void set_to_without_mask(T * mat, int cols, int rows, int step, int channels)
    {
        size_t x = blockIdx.x * blockDim.x + threadIdx.x;
        size_t y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -144,7 +204,7 @@ namespace mat_operators
    }

    template<typename T>
    __global__ void set_to_with_mask(T * mat, const unsigned char * mask, int cols, int rows, int step, int channels, int step_mask)
    {
        size_t x = blockIdx.x * blockDim.x + threadIdx.x;
        size_t y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -156,71 +216,105 @@ namespace mat_operators
            mat[idx] = scalar_d[ x % channels ];
        }
    }
}
namespace cv { namespace gpu { namespace matrix_operations
{
    typedef void (*SetToFunc_with_mask)(const DevMem2D& mat, const DevMem2D& mask, int channels, const cudaStream_t & stream);
    typedef void (*SetToFunc_without_mask)(const DevMem2D& mat, int channels, const cudaStream_t & stream);

    template <typename T>
    void set_to_with_mask_run(const DevMem2D& mat, const DevMem2D& mask, int channels, const cudaStream_t & stream)
    {
        dim3 threadsPerBlock(32, 8, 1);
        dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
        if (stream == 0)
        {
            ::matop_krnls::set_to_with_mask<T><<<numBlocks,threadsPerBlock>>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step);
            cudaSafeCall ( cudaThreadSynchronize() );
        }
        else
        {
            ::matop_krnls::set_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>((T*)mat.ptr, (unsigned char *)mask.ptr, mat.cols, mat.rows, mat.step, channels, mask.step);
        }
    }

    template <typename T>
    void set_to_without_mask_run(const DevMem2D& mat, int channels, const cudaStream_t & stream)
    {
        dim3 threadsPerBlock(32, 8, 1);
        dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
        if (stream == 0)
        {
            matop_krnls::set_to_without_mask<T><<<numBlocks,threadsPerBlock>>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels);
            cudaSafeCall ( cudaThreadSynchronize() );
        }
        else
        {
            matop_krnls::set_to_without_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>((T*)mat.ptr, mat.cols, mat.rows, mat.step, channels);
        }
    }

    void set_to_without_mask(DevMem2D mat, int depth, const double *scalar, int channels, const cudaStream_t & stream)
    {
        cudaSafeCall( cudaMemcpyToSymbol(matop_krnls::scalar_d, scalar, sizeof(double) * 4));

        static SetToFunc_without_mask tab[8] =
        {
            set_to_without_mask_run<unsigned char>,
            set_to_without_mask_run<char>,
            set_to_without_mask_run<unsigned short>,
            set_to_without_mask_run<short>,
            set_to_without_mask_run<int>,
            set_to_without_mask_run<float>,
            set_to_without_mask_run<double>,
            0
        };

        SetToFunc_without_mask func = tab[depth];
        if (func == 0)
            cv::gpu::error("Unsupported setTo operation", __FILE__, __LINE__);
        func(mat, channels, stream);
    }

    void set_to_with_mask(DevMem2D mat, int depth, const double * scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream)
    {
        cudaSafeCall( cudaMemcpyToSymbol(matop_krnls::scalar_d, scalar, sizeof(double) * 4));

        static SetToFunc_with_mask tab[8] =
        {
            set_to_with_mask_run<unsigned char>,
            set_to_with_mask_run<char>,
            set_to_with_mask_run<unsigned short>,
            set_to_with_mask_run<short>,
            set_to_with_mask_run<int>,
            set_to_with_mask_run<float>,
            set_to_with_mask_run<double>,
            0
        };

        SetToFunc_with_mask func = tab[depth];
        if (func == 0)
            cv::gpu::error("Unsupported setTo operation", __FILE__, __LINE__);
        func(mat, mask, channels, stream);
    }
}}}
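A hypothetical host-side call of the SetTo entry point above: the scalar is always packed into four doubles, and the kernel cycles through the first channels of them via scalar_d[x % channels]. All names below are assumptions for illustration:

void fill_bgr(uchar* d_img, int rows, int cols, size_t pitch)
{
    // Hypothetical call site: fill a CV_8UC3 image with BGR = (255, 0, 0),
    // synchronously on the default stream.
    double scalar[4] = { 255.0, 0.0, 0.0, 0.0 };
    DevMem2D img(rows, cols, d_img, pitch);
    cv::gpu::matrix_operations::set_to_without_mask(img, CV_8U, scalar, 3, 0);
}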
///////////////////////////////////////////////////////////////////////////
//////////////////////////////// ConvertTo ////////////////////////////////
///////////////////////////////////////////////////////////////////////////
namespace matop_krnls
{
    template <typename T, typename DT>
    __global__ static void convert_to(uchar* srcmat, size_t src_step, uchar* dstmat, size_t dst_step, size_t width, size_t height, double alpha, double beta)
    {
        typedef typename ReadWriteTraits<T, DT, sizeof(T), sizeof(DT)>::read_type read_type;
        typedef typename ReadWriteTraits<T, DT, sizeof(T), sizeof(DT)>::write_type write_type;
@@ -253,253 +347,63 @@ namespace mat_operators
                dst[(x * shift) + i] = saturate_cast<DT>(alpha * src[(x * shift) + i] + beta);
            }
        }
    }
}
namespace cv { namespace gpu { namespace matrix_operations
{
    typedef void (*CvtFunc)(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream);

    template<typename T, typename DT>
    void cvt_(const DevMem2D& src, DevMem2D& dst, size_t width, size_t height, double alpha, double beta, const cudaStream_t & stream)
    {
        const int shift = ::matop_krnls::ReadWriteTraits<T, DT, sizeof(T), sizeof(DT)>::shift;

        dim3 block(32, 8);
        dim3 grid(divUp(width, block.x * shift), divUp(height, block.y));
        if (stream == 0)
        {
            matop_krnls::convert_to<T, DT><<<grid, block>>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta);
            cudaSafeCall( cudaThreadSynchronize() );
        }
        else
        {
            matop_krnls::convert_to<T, DT><<<grid, block, 0, stream>>>(src.ptr, src.step, dst.ptr, dst.step, width, height, alpha, beta);
        }
    }

    void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, int channels, double alpha, double beta, const cudaStream_t & stream)
    {
        static CvtFunc tab[8][8] =
        {
            {cvt_<uchar, uchar>, cvt_<uchar, schar>, cvt_<uchar, ushort>, cvt_<uchar, short>,
            cvt_<uchar, int>, cvt_<uchar, float>, cvt_<uchar, double>, 0},

            {cvt_<schar, uchar>, cvt_<schar, schar>, cvt_<schar, ushort>, cvt_<schar, short>,
            cvt_<schar, int>, cvt_<schar, float>, cvt_<schar, double>, 0},

            {cvt_<ushort, uchar>, cvt_<ushort, schar>, cvt_<ushort, ushort>, cvt_<ushort, short>,
            cvt_<ushort, int>, cvt_<ushort, float>, cvt_<ushort, double>, 0},

            {cvt_<short, uchar>, cvt_<short, schar>, cvt_<short, ushort>, cvt_<short, short>,
            cvt_<short, int>, cvt_<short, float>, cvt_<short, double>, 0},

            {cvt_<int, uchar>, cvt_<int, schar>, cvt_<int, ushort>,
            cvt_<int, short>, cvt_<int, int>, cvt_<int, float>, cvt_<int, double>, 0},

            {cvt_<float, uchar>, cvt_<float, schar>, cvt_<float, ushort>,
            cvt_<float, short>, cvt_<float, int>, cvt_<float, float>, cvt_<float, double>, 0},

            {cvt_<double, uchar>, cvt_<double, schar>, cvt_<double, ushort>,
            cvt_<double, short>, cvt_<double, int>, cvt_<double, float>, cvt_<double, double>, 0},

            {0,0,0,0,0,0,0,0}
        };

        CvtFunc func = tab[sdepth][ddepth];
        if (func == 0)
            cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);
        func(src, dst, src.cols * channels, src.rows, alpha, beta, stream);
    }
}}}
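The shift from ReadWriteTraits feeds straight into the grid: for a uchar-to-uchar conversion shift is 4, so a 1024-column single-channel image needs divUp(1024, 32 * 4) = 8 blocks across instead of 32. A hypothetical call performing the common 8U-to-32F rescale (the wrapper name is an assumption):

void to_float01(const DevMem2D& src8u, DevMem2D& dst32f)
{
    // Hypothetical call site: dst = src * (1/255) as CV_32F, single channel,
    // synchronous on the default stream.
    cv::gpu::matrix_operations::convert_to(src8u, CV_8U, dst32f, CV_32F, 1, 1.0 / 255.0, 0.0, 0);
}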
@@ -49,124 +49,206 @@ namespace cv
{
    namespace gpu
    {
        template<typename _Tp> static __device__ _Tp saturate_cast(uchar v) { return _Tp(v); }
        template<typename _Tp> static __device__ _Tp saturate_cast(schar v) { return _Tp(v); }
        template<typename _Tp> static __device__ _Tp saturate_cast(ushort v) { return _Tp(v); }
        template<typename _Tp> static __device__ _Tp saturate_cast(short v) { return _Tp(v); }
        template<typename _Tp> static __device__ _Tp saturate_cast(uint v) { return _Tp(v); }
        template<typename _Tp> static __device__ _Tp saturate_cast(int v) { return _Tp(v); }
        template<typename _Tp> static __device__ _Tp saturate_cast(float v) { return _Tp(v); }
        template<typename _Tp> static __device__ _Tp saturate_cast(double v) { return _Tp(v); }

        template<> static __device__ uchar saturate_cast<uchar>(schar v)
        { return (uchar)max((int)v, 0); }
        template<> static __device__ uchar saturate_cast<uchar>(ushort v)
        { return (uchar)min((uint)v, (uint)UCHAR_MAX); }
        template<> static __device__ uchar saturate_cast<uchar>(int v)
        { return (uchar)((uint)v <= UCHAR_MAX ? v : v > 0 ? UCHAR_MAX : 0); }
        template<> static __device__ uchar saturate_cast<uchar>(uint v)
        { return (uchar)min(v, (uint)UCHAR_MAX); }
        template<> static __device__ uchar saturate_cast<uchar>(short v)
        { return saturate_cast<uchar>((uint)v); }
        template<> static __device__ uchar saturate_cast<uchar>(float v)
        { int iv = __float2int_rn(v); return saturate_cast<uchar>(iv); }
        template<> static __device__ uchar saturate_cast<uchar>(double v)
        {
        #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
            int iv = __double2int_rn(v); return saturate_cast<uchar>(iv);
        #else
            return saturate_cast<uchar>((float)v);
        #endif
        }

        template<> static __device__ schar saturate_cast<schar>(uchar v)
        { return (schar)min((int)v, SCHAR_MAX); }
        template<> static __device__ schar saturate_cast<schar>(ushort v)
        { return (schar)min((uint)v, (uint)SCHAR_MAX); }
        template<> static __device__ schar saturate_cast<schar>(int v)
        {
            return (schar)((uint)(v-SCHAR_MIN) <= (uint)UCHAR_MAX ?
                v : v > 0 ? SCHAR_MAX : SCHAR_MIN);
        }
        template<> static __device__ schar saturate_cast<schar>(short v)
        { return saturate_cast<schar>((int)v); }
        template<> static __device__ schar saturate_cast<schar>(uint v)
        { return (schar)min(v, (uint)SCHAR_MAX); }
        template<> static __device__ schar saturate_cast<schar>(float v)
        { int iv = __float2int_rn(v); return saturate_cast<schar>(iv); }
        template<> static __device__ schar saturate_cast<schar>(double v)
        {
        #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
            int iv = __double2int_rn(v); return saturate_cast<schar>(iv);
        #else
            return saturate_cast<schar>((float)v);
        #endif
        }

        template<> static __device__ ushort saturate_cast<ushort>(schar v)
        { return (ushort)max((int)v, 0); }
        template<> static __device__ ushort saturate_cast<ushort>(short v)
        { return (ushort)max((int)v, 0); }
        template<> static __device__ ushort saturate_cast<ushort>(int v)
        { return (ushort)((uint)v <= (uint)USHRT_MAX ? v : v > 0 ? USHRT_MAX : 0); }
        template<> static __device__ ushort saturate_cast<ushort>(uint v)
        { return (ushort)min(v, (uint)USHRT_MAX); }
        template<> static __device__ ushort saturate_cast<ushort>(float v)
        { int iv = __float2int_rn(v); return saturate_cast<ushort>(iv); }
        template<> static __device__ ushort saturate_cast<ushort>(double v)
        {
        #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
            int iv = __double2int_rn(v); return saturate_cast<ushort>(iv);
        #else
            return saturate_cast<ushort>((float)v);
        #endif
        }

        template<> static __device__ short saturate_cast<short>(ushort v)
        { return (short)min((int)v, SHRT_MAX); }
        template<> static __device__ short saturate_cast<short>(int v)
        {
            return (short)((uint)(v - SHRT_MIN) <= (uint)USHRT_MAX ?
                v : v > 0 ? SHRT_MAX : SHRT_MIN);
        }
        template<> static __device__ short saturate_cast<short>(uint v)
        { return (short)min(v, (uint)SHRT_MAX); }
        template<> static __device__ short saturate_cast<short>(float v)
        { int iv = __float2int_rn(v); return saturate_cast<short>(iv); }
        template<> static __device__ short saturate_cast<short>(double v)
        {
        #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
            int iv = __double2int_rn(v); return saturate_cast<short>(iv);
        #else
            return saturate_cast<short>((float)v);
        #endif
        }

        template<> static __device__ int saturate_cast<int>(float v) { return __float2int_rn(v); }
        template<> static __device__ int saturate_cast<int>(double v)
        {
        #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
            return __double2int_rn(v);
        #else
            return saturate_cast<int>((float)v);
        #endif
        }
        template<> static __device__ uint saturate_cast<uint>(float v){ return __float2uint_rn(v); }
        template<> static __device__ uint saturate_cast<uint>(double v)
        {
        #if defined (__CUDA_ARCH__) && __CUDA_ARCH__ >= 130
            return __double2uint_rn(v);
        #else
            return saturate_cast<uint>((float)v);
        #endif
        }

        template<typename _Tp> static __device__ _Tp saturate_cast(uchar4 v) { return _Tp(v); }
        template<typename _Tp> static __device__ _Tp saturate_cast(char4 v) { return _Tp(v); }
        template<typename _Tp> static __device__ _Tp saturate_cast(ushort4 v) { return _Tp(v); }
        template<typename _Tp> static __device__ _Tp saturate_cast(short4 v) { return _Tp(v); }
        template<typename _Tp> static __device__ _Tp saturate_cast(uint4 v) { return _Tp(v); }
        template<typename _Tp> static __device__ _Tp saturate_cast(int4 v) { return _Tp(v); }
        template<typename _Tp> static __device__ _Tp saturate_cast(float4 v) { return _Tp(v); }

        template<> static __device__ uchar4 saturate_cast<uchar4>(char4 v)
        { return make_uchar4(saturate_cast<uchar>(v.x), saturate_cast<uchar>(v.y), saturate_cast<uchar>(v.z), saturate_cast<uchar>(v.w)); }
        template<> static __device__ uchar4 saturate_cast<uchar4>(ushort4 v)
        { return make_uchar4(saturate_cast<uchar>(v.x), saturate_cast<uchar>(v.y), saturate_cast<uchar>(v.z), saturate_cast<uchar>(v.w)); }
        template<> static __device__ uchar4 saturate_cast<uchar4>(short4 v)
        { return make_uchar4(saturate_cast<uchar>(v.x), saturate_cast<uchar>(v.y), saturate_cast<uchar>(v.z), saturate_cast<uchar>(v.w)); }
        template<> static __device__ uchar4 saturate_cast<uchar4>(uint4 v)
        { return make_uchar4(saturate_cast<uchar>(v.x), saturate_cast<uchar>(v.y), saturate_cast<uchar>(v.z), saturate_cast<uchar>(v.w)); }
        template<> static __device__ uchar4 saturate_cast<uchar4>(int4 v)
        { return make_uchar4(saturate_cast<uchar>(v.x), saturate_cast<uchar>(v.y), saturate_cast<uchar>(v.z), saturate_cast<uchar>(v.w)); }
        template<> static __device__ uchar4 saturate_cast<uchar4>(float4 v)
        { return make_uchar4(saturate_cast<uchar>(v.x), saturate_cast<uchar>(v.y), saturate_cast<uchar>(v.z), saturate_cast<uchar>(v.w)); }

        template<> static __device__ char4 saturate_cast<char4>(uchar4 v)
        { return make_char4(saturate_cast<char>(v.x), saturate_cast<char>(v.y), saturate_cast<char>(v.z), saturate_cast<char>(v.w)); }
        template<> static __device__ char4 saturate_cast<char4>(ushort4 v)
        { return make_char4(saturate_cast<char>(v.x), saturate_cast<char>(v.y), saturate_cast<char>(v.z), saturate_cast<char>(v.w)); }
        template<> static __device__ char4 saturate_cast<char4>(short4 v)
        { return make_char4(saturate_cast<char>(v.x), saturate_cast<char>(v.y), saturate_cast<char>(v.z), saturate_cast<char>(v.w)); }
        template<> static __device__ char4 saturate_cast<char4>(uint4 v)
        { return make_char4(saturate_cast<char>(v.x), saturate_cast<char>(v.y), saturate_cast<char>(v.z), saturate_cast<char>(v.w)); }
        template<> static __device__ char4 saturate_cast<char4>(int4 v)
        { return make_char4(saturate_cast<char>(v.x), saturate_cast<char>(v.y), saturate_cast<char>(v.z), saturate_cast<char>(v.w)); }
        template<> static __device__ char4 saturate_cast<char4>(float4 v)
        { return make_char4(saturate_cast<char>(v.x), saturate_cast<char>(v.y), saturate_cast<char>(v.z), saturate_cast<char>(v.w)); }

        template<> static __device__ ushort4 saturate_cast<ushort4>(uchar4 v)
        { return make_ushort4(v.x, v.y, v.z, v.w); }
        template<> static __device__ ushort4 saturate_cast<ushort4>(char4 v)
        { return make_ushort4(saturate_cast<ushort>(v.x), saturate_cast<ushort>(v.y), saturate_cast<ushort>(v.z), saturate_cast<ushort>(v.w)); }
        template<> static __device__ ushort4 saturate_cast<ushort4>(short4 v)
        { return make_ushort4(saturate_cast<ushort>(v.x), saturate_cast<ushort>(v.y), saturate_cast<ushort>(v.z), saturate_cast<ushort>(v.w)); }
        template<> static __device__ ushort4 saturate_cast<ushort4>(uint4 v)
        { return make_ushort4(saturate_cast<ushort>(v.x), saturate_cast<ushort>(v.y), saturate_cast<ushort>(v.z), saturate_cast<ushort>(v.w)); }
        template<> static __device__ ushort4 saturate_cast<ushort4>(int4 v)
        { return make_ushort4(saturate_cast<ushort>(v.x), saturate_cast<ushort>(v.y), saturate_cast<ushort>(v.z), saturate_cast<ushort>(v.w)); }
        template<> static __device__ ushort4 saturate_cast<ushort4>(float4 v)
        { return make_ushort4(saturate_cast<ushort>(v.x), saturate_cast<ushort>(v.y), saturate_cast<ushort>(v.z), saturate_cast<ushort>(v.w)); }

        template<> static __device__ short4 saturate_cast<short4>(uchar4 v)
        { return make_short4(v.x, v.y, v.z, v.w); }
        template<> static __device__ short4 saturate_cast<short4>(char4 v)
        { return make_short4(v.x, v.y, v.z, v.w); }
        template<> static __device__ short4 saturate_cast<short4>(ushort4 v)
        { return make_short4(saturate_cast<short>(v.x), saturate_cast<short>(v.y), saturate_cast<short>(v.z), saturate_cast<short>(v.w)); }
        template<> static __device__ short4 saturate_cast<short4>(uint4 v)
        { return make_short4(saturate_cast<short>(v.x), saturate_cast<short>(v.y), saturate_cast<short>(v.z), saturate_cast<short>(v.w)); }
        template<> static __device__ short4 saturate_cast<short4>(int4 v)
        { return make_short4(saturate_cast<short>(v.x), saturate_cast<short>(v.y), saturate_cast<short>(v.z), saturate_cast<short>(v.w)); }
        template<> static __device__ short4 saturate_cast<short4>(float4 v)
        { return make_short4(saturate_cast<short>(v.x), saturate_cast<short>(v.y), saturate_cast<short>(v.z), saturate_cast<short>(v.w)); }

        template<> static __device__ uint4 saturate_cast<uint4>(uchar4 v)
        { return make_uint4(v.x, v.y, v.z, v.w); }
        template<> static __device__ uint4 saturate_cast<uint4>(char4 v)
        { return make_uint4(saturate_cast<uint>(v.x), saturate_cast<uint>(v.y), saturate_cast<uint>(v.z), saturate_cast<uint>(v.w)); }
        template<> static __device__ uint4 saturate_cast<uint4>(ushort4 v)
        { return make_uint4(v.x, v.y, v.z, v.w); }
        template<> static __device__ uint4 saturate_cast<uint4>(short4 v)
        { return make_uint4(saturate_cast<uint>(v.x), saturate_cast<uint>(v.y), saturate_cast<uint>(v.z), saturate_cast<uint>(v.w)); }
        template<> static __device__ uint4 saturate_cast<uint4>(int4 v)
        { return make_uint4(saturate_cast<uint>(v.x), saturate_cast<uint>(v.y), saturate_cast<uint>(v.z), saturate_cast<uint>(v.w)); }
        template<> static __device__ uint4 saturate_cast<uint4>(float4 v)
        { return make_uint4(saturate_cast<uint>(v.x), saturate_cast<uint>(v.y), saturate_cast<uint>(v.z), saturate_cast<uint>(v.w)); }

        template<> static __device__ int4 saturate_cast<int4>(uchar4 v)
        { return make_int4(v.x, v.y, v.z, v.w); }
        template<> static __device__ int4 saturate_cast<int4>(char4 v)
        { return make_int4(v.x, v.y, v.z, v.w); }
        template<> static __device__ int4 saturate_cast<int4>(ushort4 v)
        { return make_int4(v.x, v.y, v.z, v.w); }
        template<> static __device__ int4 saturate_cast<int4>(short4 v)
        { return make_int4(v.x, v.y, v.z, v.w); }
        template<> static __device__ int4 saturate_cast<int4>(uint4 v)
        { return make_int4(saturate_cast<int>(v.x), saturate_cast<int>(v.y), saturate_cast<int>(v.z), saturate_cast<int>(v.w)); }
        template<> static __device__ int4 saturate_cast<int4>(float4 v)
        { return make_int4(saturate_cast<int>(v.x), saturate_cast<int>(v.y), saturate_cast<int>(v.z), saturate_cast<int>(v.w)); }
    }
}
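The specializations above clamp to the destination range instead of wrapping. A short device-side illustration of the expected results (the function is illustrative, not part of this commit):

// Illustrative device snippet; each conversion clamps rather than wraps.
__device__ void saturate_cast_examples()
{
    uchar a = cv::gpu::saturate_cast<uchar>(300);    // 300   -> 255
    uchar b = cv::gpu::saturate_cast<uchar>(-5);     // -5    -> 0
    schar c = cv::gpu::saturate_cast<schar>(200);    // 200   -> 127
    short d = cv::gpu::saturate_cast<short>(70000);  // 70000 -> 32767
    (void)a; (void)b; (void)c; (void)d;
}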
...
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_TRANSFORM_HPP__
#define __OPENCV_GPU_TRANSFORM_HPP__
#include "cuda_shared.hpp"
#include "saturate_cast.hpp"
#include "vecmath.hpp"
namespace cv { namespace gpu { namespace algo_krnls
{
template <typename T, typename D, typename UnOp>
static __global__ void transform(const T* src, size_t src_step,
D* dst, size_t dst_step, int width, int height, UnOp op)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < width && y < height)
{
T src_data = src[y * src_step + x];
dst[y * dst_step + x] = op(src_data, x, y);
}
}
template <typename T1, typename T2, typename D, typename BinOp>
static __global__ void transform(const T1* src1, size_t src1_step, const T2* src2, size_t src2_step,
D* dst, size_t dst_step, int width, int height, BinOp op)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < width && y < height)
{
T1 src1_data = src1[y * src1_step + x];
T2 src2_data = src2[y * src2_step + x];
dst[y * dst_step + x] = op(src1_data, src2_data, x, y);
}
}
}}}
namespace cv
{
namespace gpu
{
template <typename T, typename D, typename UnOp>
static void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, UnOp op, cudaStream_t stream)
{
dim3 threads(16, 16, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
algo_krnls::transform<<<grid, threads, 0, stream>>>(src.ptr, src.elem_step,
dst.ptr, dst.elem_step, src.cols, src.rows, op);
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
}
template <typename T1, typename T2, typename D, typename BinOp>
static void transform(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, BinOp op, cudaStream_t stream)
{
dim3 threads(16, 16, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(src1.cols, threads.x);
grid.y = divUp(src1.rows, threads.y);
algo_krnls::transform<<<grid, threads, 0, stream>>>(src1.ptr, src1.elem_step,
src2.ptr, src2.elem_step, dst.ptr, dst.elem_step, src1.cols, src1.rows, op);
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
}
}
}
#endif // __OPENCV_GPU_TRANSFORM_HPP__
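transform is the generic engine the new compare_ne plugs into: any functor whose operator() matches op(src, x, y) (or op(src1, src2, x, y) for the binary overload) can be run over a DevMem2D_ without writing a dedicated kernel. A sketch of a unary use; the Scale functor and scale_by_two wrapper are illustrative, not part of this commit:

// Illustrative functor for the unary transform overload: op(src, x, y).
struct Scale
{
    float s;
    __device__ float operator()(float v, int, int) const { return v * s; }
};

// Hypothetical call site: dst = src * 2, synchronous on the default stream.
void scale_by_two(const DevMem2Df& src, const DevMem2Df& dst)
{
    Scale op = { 2.0f };
    cv::gpu::transform(src, dst, op, 0);
}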
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_VECMATH_HPP__
#define __OPENCV_GPU_VECMATH_HPP__
#include "cuda_shared.hpp"
namespace cv
{
namespace gpu
{
template<typename T, int N> struct TypeVec;
template<typename T> struct TypeVec<T, 1> { typedef T vec_t; };
template<> struct TypeVec<unsigned char, 2> { typedef uchar2 vec_t; };
template<> struct TypeVec<uchar2, 2> { typedef uchar2 vec_t; };
            template<> struct TypeVec<unsigned char, 3> { typedef uchar3 vec_t; };
            template<> struct TypeVec<uchar3, 3> { typedef uchar3 vec_t; };
            template<> struct TypeVec<unsigned char, 4> { typedef uchar4 vec_t; };
template<> struct TypeVec<uchar4, 4> { typedef uchar4 vec_t; };
template<> struct TypeVec<char, 2> { typedef char2 vec_t; };
template<> struct TypeVec<char2, 2> { typedef char2 vec_t; };
template<> struct TypeVec<char, 3> { typedef char3 vec_t; };
template<> struct TypeVec<char3, 3> { typedef char3 vec_t; };
template<> struct TypeVec<char, 4> { typedef char4 vec_t; };
template<> struct TypeVec<char4, 4> { typedef char4 vec_t; };
template<> struct TypeVec<unsigned short, 2> { typedef ushort2 vec_t; };
template<> struct TypeVec<ushort2, 2> { typedef ushort2 vec_t; };
template<> struct TypeVec<unsigned short, 3> { typedef ushort3 vec_t; };
template<> struct TypeVec<ushort3, 3> { typedef ushort3 vec_t; };
template<> struct TypeVec<unsigned short, 4> { typedef ushort4 vec_t; };
template<> struct TypeVec<ushort4, 4> { typedef ushort4 vec_t; };
template<> struct TypeVec<short, 2> { typedef short2 vec_t; };
template<> struct TypeVec<short2, 2> { typedef short2 vec_t; };
template<> struct TypeVec<short, 3> { typedef short3 vec_t; };
template<> struct TypeVec<short3, 3> { typedef short3 vec_t; };
template<> struct TypeVec<short, 4> { typedef short4 vec_t; };
template<> struct TypeVec<short4, 4> { typedef short4 vec_t; };
template<> struct TypeVec<unsigned int, 2> { typedef uint2 vec_t; };
template<> struct TypeVec<uint2, 2> { typedef uint2 vec_t; };
template<> struct TypeVec<unsigned int, 3> { typedef uint3 vec_t; };
template<> struct TypeVec<uint3, 3> { typedef uint3 vec_t; };
template<> struct TypeVec<unsigned int, 4> { typedef uint4 vec_t; };
template<> struct TypeVec<uint4, 4> { typedef uint4 vec_t; };
template<> struct TypeVec<int, 2> { typedef int2 vec_t; };
template<> struct TypeVec<int2, 2> { typedef int2 vec_t; };
template<> struct TypeVec<int, 3> { typedef int3 vec_t; };
template<> struct TypeVec<int3, 3> { typedef int3 vec_t; };
template<> struct TypeVec<int, 4> { typedef int4 vec_t; };
template<> struct TypeVec<int4, 4> { typedef int4 vec_t; };
template<> struct TypeVec<float, 2> { typedef float2 vec_t; };
template<> struct TypeVec<float2, 2> { typedef float2 vec_t; };
template<> struct TypeVec<float, 3> { typedef float3 vec_t; };
template<> struct TypeVec<float3, 3> { typedef float3 vec_t; };
template<> struct TypeVec<float, 4> { typedef float4 vec_t; };
template<> struct TypeVec<float4, 4> { typedef float4 vec_t; };
static __device__ uchar4 operator+(const uchar4& a, const uchar4& b)
{
return make_uchar4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
}
static __device__ uchar4 operator-(const uchar4& a, const uchar4& b)
{
return make_uchar4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
}
static __device__ uchar4 operator*(const uchar4& a, const uchar4& b)
{
return make_uchar4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
}
static __device__ uchar4 operator/(const uchar4& a, const uchar4& b)
{
return make_uchar4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
}
template <typename T>
static __device__ uchar4 operator*(const uchar4& a, T s)
{
return make_uchar4(a.x * s, a.y * s, a.z * s, a.w * s);
}
template <typename T>
static __device__ uchar4 operator*(T s, const uchar4& a)
{
return a * s;
}
}
}
#endif // __OPENCV_GPU_VECMATH_HPP__
\ No newline at end of file
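TypeVec maps an element type plus a channel count to the corresponding CUDA vector type, which lets kernels stay generic over C1..C4 images, and the operators above make uchar4 usable directly in arithmetic. An illustrative kernel built on both (not part of this commit):

// Illustrative use of TypeVec and the uchar4 operators above: elementwise
// sum of two cn-channel images stored as vector types.
template <typename T, int cn>
__global__ void add_vectors(const typename cv::gpu::TypeVec<T, cn>::vec_t* a,
                            const typename cv::gpu::TypeVec<T, cn>::vec_t* b,
                            typename cv::gpu::TypeVec<T, cn>::vec_t* sum, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        sum[i] = a[i] + b[i];   // for T = uchar, cn = 4 this resolves to operator+ above
}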
@@ -69,6 +69,22 @@ void cv::gpu::Stream::enqueueConvert(const GpuMat& /*src*/, GpuMat& /*dst*/, int

#include "opencv2/gpu/stream_accessor.hpp"
namespace cv
{
namespace gpu
{
namespace matrix_operations
{
void copy_to_with_mask(const DevMem2D& src, DevMem2D dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
void set_to_without_mask (DevMem2D dst, int depth, const double *scalar, int channels, const cudaStream_t & stream = 0);
void set_to_with_mask (DevMem2D dst, int depth, const double *scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, int channels, double alpha, double beta, const cudaStream_t & stream = 0);
}
}
}
struct Stream::Impl
{
    cudaStream_t stream;
...
@@ -49,18 +49,18 @@ using namespace cv::gpu;
#if !defined (HAVE_CUDA)

Ptr<FilterEngine_GPU> cv::gpu::createFilter2D_GPU(const Ptr<BaseFilter_GPU>) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>&, const Ptr<BaseColumnFilter_GPU>&) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<BaseRowFilter_GPU> cv::gpu::getRowSumFilter_GPU(int, int, int, int) { throw_nogpu(); return Ptr<BaseRowFilter_GPU>(0); }
Ptr<BaseColumnFilter_GPU> cv::gpu::getColumnSumFilter_GPU(int, int, int, int) { throw_nogpu(); return Ptr<BaseColumnFilter_GPU>(0); }
Ptr<BaseFilter_GPU> cv::gpu::getBoxFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createBoxFilter_GPU(int, int, const Size&, const Point&) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<BaseFilter_GPU> cv::gpu::getMorphologyFilter_GPU(int, int, const Mat&, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int, int, const Mat&, const Point&, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<BaseFilter_GPU> cv::gpu::getLinearFilter_GPU(int, int, const Mat&, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createLinearFilter_GPU(int, int, const Mat&, const Point&) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int, int, const Mat&, int) { throw_nogpu(); return Ptr<BaseRowFilter_GPU>(0); }
Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int, int, const Mat&, int) { throw_nogpu(); return Ptr<BaseColumnFilter_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int, int, const Mat&, const Mat&, const Point&) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int, int, int, int, int) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<FilterEngine_GPU> cv::gpu::createGaussianFilter_GPU(int, Size, double, double) { throw_nogpu(); return Ptr<FilterEngine_GPU>(0); }
Ptr<BaseFilter_GPU> cv::gpu::getMaxFilter_GPU(int, int, const Size&, Point) { throw_nogpu(); return Ptr<BaseFilter_GPU>(0); }
@@ -71,7 +71,7 @@ void cv::gpu::erode( const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_nogpu(); }
void cv::gpu::dilate( const GpuMat&, GpuMat&, const Mat&, Point, int) { throw_nogpu(); }
void cv::gpu::morphologyEx( const GpuMat&, GpuMat&, int, const Mat&, Point, int) { throw_nogpu(); }
void cv::gpu::filter2D(const GpuMat&, GpuMat&, int, const Mat&, Point) { throw_nogpu(); } void cv::gpu::filter2D(const GpuMat&, GpuMat&, int, const Mat&, Point) { throw_nogpu(); }
void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, Point, bool) { throw_nogpu(); } void cv::gpu::sepFilter2D(const GpuMat&, GpuMat&, int, const Mat&, const Mat&, Point) { throw_nogpu(); }
void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, int, double) { throw_nogpu(); } void cv::gpu::Sobel(const GpuMat&, GpuMat&, int, int, int, int, double) { throw_nogpu(); }
void cv::gpu::Scharr(const GpuMat&, GpuMat&, int, int, int, double) { throw_nogpu(); } void cv::gpu::Scharr(const GpuMat&, GpuMat&, int, int, int, double) { throw_nogpu(); }
void cv::gpu::GaussianBlur(const GpuMat&, GpuMat&, Size, double, double) { throw_nogpu(); } void cv::gpu::GaussianBlur(const GpuMat&, GpuMat&, Size, double, double) { throw_nogpu(); }
...@@ -164,28 +164,10 @@ Ptr<FilterEngine_GPU> cv::gpu::createFilter2D_GPU(const Ptr<BaseFilter_GPU> filt ...@@ -164,28 +164,10 @@ Ptr<FilterEngine_GPU> cv::gpu::createFilter2D_GPU(const Ptr<BaseFilter_GPU> filt
namespace namespace
{ {
struct RowColumnFilterApply class SeparableFilterEngine_GPU : public FilterEngine_GPU
{
void operator()(Ptr<BaseRowFilter_GPU>& rowFilter, Ptr<BaseColumnFilter_GPU>& columnFilter,
GpuMat& srcROI, GpuMat& dstROI, GpuMat& dstBufROI)
{
(*rowFilter)(srcROI, dstBufROI);
(*columnFilter)(dstBufROI, dstROI);
}
};
struct ColumnRowFilterApply
{
void operator()(Ptr<BaseRowFilter_GPU>& rowFilter, Ptr<BaseColumnFilter_GPU>& columnFilter,
GpuMat& srcROI, GpuMat& dstROI, GpuMat& dstBufROI)
{
(*columnFilter)(srcROI, dstBufROI);
(*rowFilter)(dstBufROI, dstROI);
}
};
class SeparableFilterEngine_GPU_base : public FilterEngine_GPU
{ {
public: public:
SeparableFilterEngine_GPU_base(const Ptr<BaseRowFilter_GPU>& rowFilter_, SeparableFilterEngine_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter_,
const Ptr<BaseColumnFilter_GPU>& columnFilter_) : const Ptr<BaseColumnFilter_GPU>& columnFilter_) :
rowFilter(rowFilter_), columnFilter(columnFilter_) rowFilter(rowFilter_), columnFilter(columnFilter_)
{ {
...@@ -208,6 +190,9 @@ namespace ...@@ -208,6 +190,9 @@ namespace
srcROI = src(roi); srcROI = src(roi);
dstROI = dst(roi); dstROI = dst(roi);
dstBufROI = dstBuf(roi); dstBufROI = dstBuf(roi);
(*rowFilter)(srcROI, dstBufROI);
(*columnFilter)(dstBufROI, dstROI);
} }
Ptr<BaseRowFilter_GPU> rowFilter; Ptr<BaseRowFilter_GPU> rowFilter;
...@@ -219,32 +204,12 @@ namespace ...@@ -219,32 +204,12 @@ namespace
GpuMat dstROI; GpuMat dstROI;
GpuMat dstBufROI; GpuMat dstBufROI;
}; };
template <typename FA>
class SeparableFilterEngine_GPU : public SeparableFilterEngine_GPU_base
{
public:
SeparableFilterEngine_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter_,
const Ptr<BaseColumnFilter_GPU>& columnFilter_, FA fa_) :
SeparableFilterEngine_GPU_base(rowFilter_, columnFilter_), fa(fa_)
{
}
virtual void apply(const GpuMat& src, GpuMat& dst, Rect roi = Rect(0,0,-1,-1))
{
SeparableFilterEngine_GPU_base::apply(src, dst, roi);
fa(rowFilter, columnFilter, srcROI, dstROI, dstBufROI);
}
FA fa;
};
} }
Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter, Ptr<FilterEngine_GPU> cv::gpu::createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
const Ptr<BaseColumnFilter_GPU>& columnFilter, bool rowFilterFirst) const Ptr<BaseColumnFilter_GPU>& columnFilter)
{ {
if (rowFilterFirst) return Ptr<FilterEngine_GPU>(new SeparableFilterEngine_GPU(rowFilter, columnFilter));
return Ptr<FilterEngine_GPU>(new SeparableFilterEngine_GPU<RowColumnFilterApply>(rowFilter, columnFilter, RowColumnFilterApply()));
return Ptr<FilterEngine_GPU>(new SeparableFilterEngine_GPU<ColumnRowFilterApply>(rowFilter, columnFilter, ColumnRowFilterApply()));
} }
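With the rowFilterFirst flag gone, the engine above always runs the row pass into the buffer and the column pass out of it. A hedged composition sketch, using the int/float filter factories added later in this file (kernel values illustrative; anchor -1 is assumed to resolve to the kernel center via normalizeAnchor; src and dst are assumed allocated CV_32FC1 GpuMats):

using namespace cv;
using namespace cv::gpu;

Mat rowK = (Mat_<float>(1, 3) << -1.f, 0.f, 1.f);   // derivative along x
Mat colK = (Mat_<float>(1, 3) <<  1.f, 2.f, 1.f);   // smoothing along y

Ptr<BaseRowFilter_GPU>    rf = getLinearRowFilter_GPU(CV_32FC1, CV_32FC1, rowK, -1);
Ptr<BaseColumnFilter_GPU> cf = getLinearColumnFilter_GPU(CV_32FC1, CV_32FC1, colK, -1);
Ptr<FilterEngine_GPU> engine  = createSeparableFilter_GPU(rf, cf);

GpuMat src, dst;
engine->apply(src, dst);   // row pass into the buffer, column pass into dst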
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
...@@ -398,7 +363,7 @@ namespace ...@@ -398,7 +363,7 @@ namespace
}; };
} }
Ptr<BaseFilter_GPU> cv::gpu::getMorphologyFilter_GPU(int op, int type, const GpuMat& kernel, const Size& ksize, Point anchor) Ptr<BaseFilter_GPU> cv::gpu::getMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Size& ksize, Point anchor)
{ {
static const nppMorfFilter_t nppMorfFilter_callers[2][5] = static const nppMorfFilter_t nppMorfFilter_callers[2][5] =
{ {
...@@ -408,11 +373,12 @@ Ptr<BaseFilter_GPU> cv::gpu::getMorphologyFilter_GPU(int op, int type, const Gpu ...@@ -408,11 +373,12 @@ Ptr<BaseFilter_GPU> cv::gpu::getMorphologyFilter_GPU(int op, int type, const Gpu
CV_Assert(op == MORPH_ERODE || op == MORPH_DILATE); CV_Assert(op == MORPH_ERODE || op == MORPH_DILATE);
CV_Assert(type == CV_8UC1 || type == CV_8UC4); CV_Assert(type == CV_8UC1 || type == CV_8UC4);
CV_Assert(kernel.type() == CV_8UC1 && kernel.rows == 1 && kernel.cols == ksize.area());
GpuMat gpu_krnl;
normalizeKernel(kernel, gpu_krnl);
normalizeAnchor(anchor, ksize); normalizeAnchor(anchor, ksize);
return Ptr<BaseFilter_GPU>(new NPPMorphFilter(ksize, anchor, kernel, nppMorfFilter_callers[op][CV_MAT_CN(type)])); return Ptr<BaseFilter_GPU>(new NPPMorphFilter(ksize, anchor, gpu_krnl, nppMorfFilter_callers[op][CV_MAT_CN(type)]));
} }
namespace namespace
...@@ -447,10 +413,7 @@ Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int op, int type, cons ...@@ -447,10 +413,7 @@ Ptr<FilterEngine_GPU> cv::gpu::createMorphologyFilter_GPU(int op, int type, cons
Size ksize = kernel.size(); Size ksize = kernel.size();
GpuMat gpu_krnl; Ptr<BaseFilter_GPU> filter2D = getMorphologyFilter_GPU(op, type, kernel, ksize, anchor);
normalizeKernel(kernel, gpu_krnl);
Ptr<BaseFilter_GPU> filter2D = getMorphologyFilter_GPU(op, type, gpu_krnl, ksize, anchor);
return Ptr<FilterEngine_GPU>(new MorphologyFilterEngine_GPU(filter2D, iterations)); return Ptr<FilterEngine_GPU>(new MorphologyFilterEngine_GPU(filter2D, iterations));
} }
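A hedged end-to-end sketch of the morphology path: the structuring element now stays a host Mat until getMorphologyFilter_GPU normalizes and uploads it internally. Default anchor and iteration count are assumed.

using namespace cv;
using namespace cv::gpu;

Mat elem = getStructuringElement(MORPH_RECT, Size(3, 3));
GpuMat src, dst;          // assume CV_8UC1 or CV_8UC4, already uploaded
erode(src, dst, elem);    // builds the NPP morphology filter shown above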
...@@ -575,27 +538,25 @@ namespace ...@@ -575,27 +538,25 @@ namespace
}; };
} }
Ptr<BaseFilter_GPU> cv::gpu::getLinearFilter_GPU(int srcType, int dstType, const GpuMat& kernel, const Size& ksize, Point anchor, int nDivisor) Ptr<BaseFilter_GPU> cv::gpu::getLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, const Size& ksize, Point anchor)
{ {
static const nppFilter2D_t cppFilter2D_callers[] = {0, nppiFilter_8u_C1R, 0, 0, nppiFilter_8u_C4R}; static const nppFilter2D_t cppFilter2D_callers[] = {0, nppiFilter_8u_C1R, 0, 0, nppiFilter_8u_C4R};
CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC4) && dstType == srcType); CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC4) && dstType == srcType);
CV_Assert(kernel.type() == CV_32SC1 && kernel.rows == 1 && kernel.cols == ksize.area());
GpuMat gpu_krnl;
int nDivisor;
normalizeKernel(kernel, gpu_krnl, CV_32S, &nDivisor, true);
normalizeAnchor(anchor, ksize); normalizeAnchor(anchor, ksize);
return Ptr<BaseFilter_GPU>(new NPPLinearFilter(ksize, anchor, kernel, nDivisor, cppFilter2D_callers[CV_MAT_CN(srcType)])); return Ptr<BaseFilter_GPU>(new NPPLinearFilter(ksize, anchor, gpu_krnl, nDivisor, cppFilter2D_callers[CV_MAT_CN(srcType)]));
} }
Ptr<FilterEngine_GPU> cv::gpu::createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, const Point& anchor) Ptr<FilterEngine_GPU> cv::gpu::createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, const Point& anchor)
{ {
Size ksize = kernel.size(); Size ksize = kernel.size();
GpuMat gpu_krnl; Ptr<BaseFilter_GPU> linearFilter = getLinearFilter_GPU(srcType, dstType, kernel, ksize, anchor);
int nDivisor;
normalizeKernel(kernel, gpu_krnl, CV_32S, &nDivisor, true);
Ptr<BaseFilter_GPU> linearFilter = getLinearFilter_GPU(srcType, dstType, gpu_krnl, ksize, anchor, nDivisor);
return createFilter2D_GPU(linearFilter); return createFilter2D_GPU(linearFilter);
} }
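Hedged usage sketch for the non-separable path: normalizeKernel converts an arbitrary floating-point kernel to CV_32S and reports the divisor that NPP applies after the integer convolution, so callers can pass ordinary float kernels directly.

using namespace cv;
using namespace cv::gpu;

Mat box = Mat::ones(3, 3, CV_32FC1) / 9.0;   // simple box kernel
GpuMat src, dst;                             // assume CV_8UC1 or CV_8UC4
filter2D(src, dst, -1, box);                 // ddepth -1: same depth as src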
...@@ -614,11 +575,26 @@ void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& ke ...@@ -614,11 +575,26 @@ void cv::gpu::filter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& ke
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// Separable Linear Filter // Separable Linear Filter
namespace cv { namespace gpu { namespace filters
{
void linearRowFilter_gpu_32s32s(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor);
void linearRowFilter_gpu_32s32f(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor);
void linearRowFilter_gpu_32f32s(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor);
void linearRowFilter_gpu_32f32f(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor);
void linearColumnFilter_gpu_32s32s(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor);
void linearColumnFilter_gpu_32s32f(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor);
void linearColumnFilter_gpu_32f32s(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor);
void linearColumnFilter_gpu_32f32f(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor);
}}}
namespace namespace
{ {
typedef NppStatus (*nppFilter1D_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oROI, typedef NppStatus (*nppFilter1D_t)(const Npp8u * pSrc, Npp32s nSrcStep, Npp8u * pDst, Npp32s nDstStep, NppiSize oROI,
const Npp32s * pKernel, Npp32s nMaskSize, Npp32s nAnchor, Npp32s nDivisor); const Npp32s * pKernel, Npp32s nMaskSize, Npp32s nAnchor, Npp32s nDivisor);
typedef void (*gpuFilter1D_t)(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor);
class NppLinearRowFilter : public BaseRowFilter_GPU class NppLinearRowFilter : public BaseRowFilter_GPU
{ {
public: public:
...@@ -638,20 +614,64 @@ namespace ...@@ -638,20 +614,64 @@ namespace
Npp32s nDivisor; Npp32s nDivisor;
nppFilter1D_t func; nppFilter1D_t func;
}; };
class GpuLinearRowFilter : public BaseRowFilter_GPU
{
public:
GpuLinearRowFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_) :
BaseRowFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_) {}
virtual void operator()(const GpuMat& src, GpuMat& dst)
{
func(src, dst, kernel.ptr<float>(), ksize, anchor);
}
Mat kernel;
gpuFilter1D_t func;
};
} }
Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, const GpuMat& rowKernel, int anchor, int nDivisor) Ptr<BaseRowFilter_GPU> cv::gpu::getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel, int anchor)
{ {
using namespace cv::gpu::filters;
static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterRow_8u_C1R, 0, 0, nppiFilterRow_8u_C4R}; static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterRow_8u_C1R, 0, 0, nppiFilterRow_8u_C4R};
static const gpuFilter1D_t gpuFilter1D_callers[6][6] =
{
{0,0,0,0,0,0},
{0,0,0,0,0,0},
{0,0,0,0,0,0},
{0,0,0,0,0,0},
{0,0,0,0,linearRowFilter_gpu_32s32s, linearRowFilter_gpu_32s32f},
{0,0,0,0,linearRowFilter_gpu_32f32s, linearRowFilter_gpu_32f32f}
};
if ((srcType == CV_8UC1 || srcType == CV_8UC4) && bufType == srcType)
{
GpuMat gpu_row_krnl;
int nDivisor;
normalizeKernel(rowKernel, gpu_row_krnl, CV_32S, &nDivisor, true);
CV_Assert((srcType == CV_8UC1 || srcType == CV_8UC4) && bufType == srcType); int ksize = gpu_row_krnl.cols;
CV_Assert(rowKernel.type() == CV_32SC1 && rowKernel.rows == 1); normalizeAnchor(anchor, ksize);
int ksize = rowKernel.cols; return Ptr<BaseRowFilter_GPU>(new NppLinearRowFilter(ksize, anchor, gpu_row_krnl, nDivisor,
nppFilter1D_callers[CV_MAT_CN(srcType)]));
}
else if ((srcType == CV_32SC1 || srcType == CV_32FC1) && (bufType == CV_32SC1 || bufType == CV_32FC1))
{
Mat temp(rowKernel.size(), CV_32FC1);
rowKernel.convertTo(temp, CV_32FC1);
Mat cont_krnl = temp.reshape(1, 1);
normalizeAnchor(anchor, ksize); int ksize = cont_krnl.cols;
normalizeAnchor(anchor, ksize);
return Ptr<BaseRowFilter_GPU>(new NppLinearRowFilter(ksize, anchor, rowKernel, nDivisor, nppFilter1D_callers[CV_MAT_CN(srcType)])); return Ptr<BaseRowFilter_GPU>(new GpuLinearRowFilter(ksize, anchor, cont_krnl,
gpuFilter1D_callers[CV_MAT_DEPTH(srcType)][CV_MAT_DEPTH(bufType)]));
}
CV_Assert(!"Unsupported types");
return Ptr<BaseRowFilter_GPU>(0);
} }
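The table above is indexed by [source depth][buffer depth]; only the CV_32S/CV_32F combinations are populated, which is exactly the functionality this commit adds. A hedged sketch of the two dispatch paths (kernel values illustrative, anchor -1 assumed to mean kernel center):

using namespace cv;
using namespace cv::gpu;

Mat k = (Mat_<float>(1, 5) << 1.f, 4.f, 6.f, 4.f, 1.f);

// 8-bit path: kernel normalized to CV_32S and run through NPP.
Ptr<BaseRowFilter_GPU> nppRow  = getLinearRowFilter_GPU(CV_8UC4, CV_8UC4, k, -1);

// New path: kernel converted to CV_32F and run by the CUDA kernels declared above.
Ptr<BaseRowFilter_GPU> cudaRow = getLinearRowFilter_GPU(CV_32FC1, CV_32FC1, k, -1);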
namespace namespace
...@@ -675,49 +695,88 @@ namespace ...@@ -675,49 +695,88 @@ namespace
Npp32s nDivisor; Npp32s nDivisor;
nppFilter1D_t func; nppFilter1D_t func;
}; };
class GpuLinearColumnFilter : public BaseColumnFilter_GPU
{
public:
GpuLinearColumnFilter(int ksize_, int anchor_, const Mat& kernel_, gpuFilter1D_t func_) :
BaseColumnFilter_GPU(ksize_, anchor_), kernel(kernel_), func(func_) {}
virtual void operator()(const GpuMat& src, GpuMat& dst)
{
func(src, dst, kernel.ptr<float>(), ksize, anchor);
}
Mat kernel;
gpuFilter1D_t func;
};
} }
Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int dstType, const GpuMat& columnKernel, int anchor, int nDivisor) Ptr<BaseColumnFilter_GPU> cv::gpu::getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel, int anchor)
{ {
using namespace cv::gpu::filters;
static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterColumn_8u_C1R, 0, 0, nppiFilterColumn_8u_C4R}; static const nppFilter1D_t nppFilter1D_callers[] = {0, nppiFilterColumn_8u_C1R, 0, 0, nppiFilterColumn_8u_C4R};
static const gpuFilter1D_t gpuFilter1D_callers[6][6] =
{
{0,0,0,0,0,0},
{0,0,0,0,0,0},
{0,0,0,0,0,0},
{0,0,0,0,0,0},
{0,0,0,0,linearColumnFilter_gpu_32s32s, linearColumnFilter_gpu_32s32f},
{0,0,0,0,linearColumnFilter_gpu_32f32s, linearColumnFilter_gpu_32f32f}
};
if ((bufType == CV_8UC1 || bufType == CV_8UC4) && dstType == bufType)
{
GpuMat gpu_col_krnl;
int nDivisor;
normalizeKernel(columnKernel, gpu_col_krnl, CV_32S, &nDivisor, true);
CV_Assert((bufType == CV_8UC1 || bufType == CV_8UC4) && dstType == bufType); int ksize = gpu_col_krnl.cols;
CV_Assert(columnKernel.type() == CV_32SC1 && columnKernel.rows == 1); normalizeAnchor(anchor, ksize);
int ksize = columnKernel.cols; return Ptr<BaseColumnFilter_GPU>(new NppLinearColumnFilter(ksize, anchor, gpu_col_krnl, nDivisor,
nppFilter1D_callers[CV_MAT_CN(bufType)]));
}
else if ((bufType == CV_32SC1 || bufType == CV_32FC1) && (dstType == CV_32SC1 || dstType == CV_32FC1))
{
Mat temp(columnKernel.size(), CV_32FC1);
columnKernel.convertTo(temp, CV_32FC1);
Mat cont_krnl = temp.reshape(1, 1);
normalizeAnchor(anchor, ksize); int ksize = cont_krnl.cols;
normalizeAnchor(anchor, ksize);
return Ptr<BaseColumnFilter_GPU>(new NppLinearColumnFilter(ksize, anchor, columnKernel, nDivisor, nppFilter1D_callers[CV_MAT_CN(bufType)])); return Ptr<BaseColumnFilter_GPU>(new GpuLinearColumnFilter(ksize, anchor, cont_krnl,
gpuFilter1D_callers[CV_MAT_DEPTH(bufType)][CV_MAT_DEPTH(dstType)]));
}
CV_Assert(!"Unsupported types");
return Ptr<BaseColumnFilter_GPU>(0);
} }
Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, const Mat& columnKernel, Ptr<FilterEngine_GPU> cv::gpu::createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel, const Mat& columnKernel,
const Point& anchor, bool rowFilterFirst) const Point& anchor)
{ {
int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(dstType); int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(dstType);
int cn = CV_MAT_CN(srcType); int cn = CV_MAT_CN(srcType);
int bdepth = std::max(sdepth, ddepth); int bdepth = std::max(sdepth, ddepth);
int bufType = CV_MAKETYPE(bdepth, cn); int bufType = CV_MAKETYPE(bdepth, cn);
GpuMat gpu_row_krnl, gpu_col_krnl; Ptr<BaseRowFilter_GPU> rowFilter = getLinearRowFilter_GPU(srcType, bufType, rowKernel, anchor.x);
int nRowDivisor, nColDivisor; Ptr<BaseColumnFilter_GPU> columnFilter = getLinearColumnFilter_GPU(bufType, dstType, columnKernel, anchor.y);
normalizeKernel(rowKernel, gpu_row_krnl, CV_32S, &nRowDivisor, true);
normalizeKernel(columnKernel, gpu_col_krnl, CV_32S, &nColDivisor, true);
Ptr<BaseRowFilter_GPU> rowFilter = getLinearRowFilter_GPU(srcType, bufType, gpu_row_krnl, anchor.x, nRowDivisor);
Ptr<BaseColumnFilter_GPU> columnFilter = getLinearColumnFilter_GPU(bufType, dstType, gpu_col_krnl, anchor.y, nColDivisor);
return createSeparableFilter_GPU(rowFilter, columnFilter, rowFilterFirst); return createSeparableFilter_GPU(rowFilter, columnFilter);
} }
void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, Point anchor, bool rowFilterFirst) void cv::gpu::sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY, Point anchor)
{ {
if( ddepth < 0 ) if( ddepth < 0 )
ddepth = src.depth(); ddepth = src.depth();
dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels())); dst.create(src.size(), CV_MAKETYPE(ddepth, src.channels()));
Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor, rowFilterFirst); Ptr<FilterEngine_GPU> f = createSeparableLinearFilter_GPU(src.type(), dst.type(), kernelX, kernelY, anchor);
f->apply(src, dst); f->apply(src, dst);
} }
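A hedged sketch exercising the commit's headline feature, a separable filter over a float source:

using namespace cv;
using namespace cv::gpu;

GpuMat src8u, src32f, dst;                  // src8u assumed CV_8UC1, already uploaded
src8u.convertTo(src32f, CV_32F);

Mat kx = (Mat_<float>(1, 3) << -1.f, 0.f, 1.f);
Mat ky = (Mat_<float>(1, 3) <<  1.f, 2.f, 1.f);
sepFilter2D(src32f, dst, CV_32F, kx, ky, Point(-1,-1));   // row pass, then column pass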
...@@ -728,7 +787,7 @@ Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int srcType, int dstType, i ...@@ -728,7 +787,7 @@ Ptr<FilterEngine_GPU> cv::gpu::createDerivFilter_GPU(int srcType, int dstType, i
{ {
Mat kx, ky; Mat kx, ky;
getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F); getDerivKernels(kx, ky, dx, dy, ksize, false, CV_32F);
return createSeparableLinearFilter_GPU(srcType, dstType, kx, ky, Point(-1,-1), dx >= dy); return createSeparableLinearFilter_GPU(srcType, dstType, kx, ky, Point(-1,-1));
} }
void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize, double scale) void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize, double scale)
...@@ -746,7 +805,7 @@ void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, ...@@ -746,7 +805,7 @@ void cv::gpu::Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy,
ky *= scale; ky *= scale;
} }
sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1), dx >= dy); sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1));
} }
void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale) void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, double scale)
...@@ -764,7 +823,7 @@ void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, ...@@ -764,7 +823,7 @@ void cv::gpu::Scharr(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy,
ky *= scale; ky *= scale;
} }
sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1), dx >= dy); sepFilter2D(src, dst, ddepth, kx, ky, Point(-1,-1));
} }
void cv::gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize, double scale) void cv::gpu::Laplacian(const GpuMat& src, GpuMat& dst, int ddepth, int ksize, double scale)
......
...@@ -75,7 +75,7 @@ void cv::gpu::histRange(const GpuMat&, GpuMat*, const GpuMat*) { throw_nogpu(); ...@@ -75,7 +75,7 @@ void cv::gpu::histRange(const GpuMat&, GpuMat*, const GpuMat*) { throw_nogpu();
namespace cv { namespace gpu namespace cv { namespace gpu
{ {
namespace improc namespace imgproc
{ {
void remap_gpu_1c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst); void remap_gpu_1c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst);
void remap_gpu_3c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst); void remap_gpu_3c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst);
...@@ -142,7 +142,7 @@ namespace cv { namespace gpu ...@@ -142,7 +142,7 @@ namespace cv { namespace gpu
void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const GpuMat& ymap) void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const GpuMat& ymap)
{ {
typedef void (*remap_gpu_t)(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst); typedef void (*remap_gpu_t)(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst);
static const remap_gpu_t callers[] = {improc::remap_gpu_1c, 0, improc::remap_gpu_3c}; static const remap_gpu_t callers[] = {imgproc::remap_gpu_1c, 0, imgproc::remap_gpu_3c};
CV_Assert((src.type() == CV_8U || src.type() == CV_8UC3) && xmap.type() == CV_32F && ymap.type() == CV_32F); CV_Assert((src.type() == CV_8U || src.type() == CV_8UC3) && xmap.type() == CV_32F && ymap.type() == CV_32F);
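A hedged usage sketch for the remap wrapper above (identity mapping; map contents are built on the host and uploaded through GpuMat's Mat constructor):

using namespace cv;
using namespace cv::gpu;

Mat xmapHost(480, 640, CV_32FC1), ymapHost(480, 640, CV_32FC1);
for (int y = 0; y < xmapHost.rows; ++y)
    for (int x = 0; x < xmapHost.cols; ++x)
    {
        xmapHost.at<float>(y, x) = (float)x;   // identity map
        ymapHost.at<float>(y, x) = (float)y;
    }

GpuMat src, dst, xmap(xmapHost), ymap(ymapHost);   // src assumed CV_8UC1 or CV_8UC3
remap(src, dst, xmap, ymap);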
...@@ -180,7 +180,7 @@ void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr, ...@@ -180,7 +180,7 @@ void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr,
eps = 1.f; eps = 1.f;
eps = (float)std::max(criteria.epsilon, 0.0); eps = (float)std::max(criteria.epsilon, 0.0);
improc::meanShiftFiltering_gpu(src, dst, sp, sr, maxIter, eps); imgproc::meanShiftFiltering_gpu(src, dst, sp, sr, maxIter, eps);
} }
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
...@@ -207,7 +207,7 @@ void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int ...@@ -207,7 +207,7 @@ void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int
eps = 1.f; eps = 1.f;
eps = (float)std::max(criteria.epsilon, 0.0); eps = (float)std::max(criteria.epsilon, 0.0);
improc::meanShiftProc_gpu(src, dstr, dstsp, sp, sr, maxIter, eps); imgproc::meanShiftProc_gpu(src, dstr, dstsp, sp, sr, maxIter, eps);
} }
//////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////
...@@ -223,7 +223,7 @@ namespace ...@@ -223,7 +223,7 @@ namespace
out = dst; out = dst;
out.create(src.size(), CV_8UC4); out.create(src.size(), CV_8UC4);
improc::drawColorDisp_gpu((DevMem2D_<T>)src, out, ndisp, stream); imgproc::drawColorDisp_gpu((DevMem2D_<T>)src, out, ndisp, stream);
dst = out; dst = out;
} }
...@@ -256,7 +256,7 @@ namespace ...@@ -256,7 +256,7 @@ namespace
void reprojectImageTo3D_caller(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, const cudaStream_t& stream) void reprojectImageTo3D_caller(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, const cudaStream_t& stream)
{ {
xyzw.create(disp.rows, disp.cols, CV_32FC4); xyzw.create(disp.rows, disp.cols, CV_32FC4);
improc::reprojectImageTo3D_gpu((DevMem2D_<T>)disp, xyzw, Q.ptr<float>(), stream); imgproc::reprojectImageTo3D_gpu((DevMem2D_<T>)disp, xyzw, Q.ptr<float>(), stream);
} }
typedef void (*reprojectImageTo3D_caller_t)(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, const cudaStream_t& stream); typedef void (*reprojectImageTo3D_caller_t)(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, const cudaStream_t& stream);
...@@ -313,7 +313,7 @@ namespace ...@@ -313,7 +313,7 @@ namespace
case CV_RGBA2BGR: case CV_RGB2BGR: case CV_BGRA2RGBA: case CV_RGBA2BGR: case CV_RGB2BGR: case CV_BGRA2RGBA:
{ {
typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, cudaStream_t stream); typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, cudaStream_t stream);
static const func_t funcs[] = {improc::RGB2RGB_gpu_8u, 0, improc::RGB2RGB_gpu_16u, 0, 0, improc::RGB2RGB_gpu_32f}; static const func_t funcs[] = {imgproc::RGB2RGB_gpu_8u, 0, imgproc::RGB2RGB_gpu_16u, 0, 0, imgproc::RGB2RGB_gpu_32f};
CV_Assert(scn == 3 || scn == 4); CV_Assert(scn == 3 || scn == 4);
...@@ -338,7 +338,7 @@ namespace ...@@ -338,7 +338,7 @@ namespace
dst.create(sz, CV_8UC2); dst.create(sz, CV_8UC2);
improc::RGB2RGB5x5_gpu(src, scn, dst, green_bits, bidx, stream); imgproc::RGB2RGB5x5_gpu(src, scn, dst, green_bits, bidx, stream);
break; break;
} }
...@@ -356,14 +356,14 @@ namespace ...@@ -356,14 +356,14 @@ namespace
dst.create(sz, CV_MAKETYPE(depth, dcn)); dst.create(sz, CV_MAKETYPE(depth, dcn));
improc::RGB5x52RGB_gpu(src, green_bits, dst, dcn, bidx, stream); imgproc::RGB5x52RGB_gpu(src, green_bits, dst, dcn, bidx, stream);
break; break;
} }
case CV_BGR2GRAY: case CV_BGRA2GRAY: case CV_RGB2GRAY: case CV_RGBA2GRAY: case CV_BGR2GRAY: case CV_BGRA2GRAY: case CV_RGB2GRAY: case CV_RGBA2GRAY:
{ {
typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int bidx, cudaStream_t stream); typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int bidx, cudaStream_t stream);
static const func_t funcs[] = {improc::RGB2Gray_gpu_8u, 0, improc::RGB2Gray_gpu_16u, 0, 0, improc::RGB2Gray_gpu_32f}; static const func_t funcs[] = {imgproc::RGB2Gray_gpu_8u, 0, imgproc::RGB2Gray_gpu_16u, 0, 0, imgproc::RGB2Gray_gpu_32f};
CV_Assert(scn == 3 || scn == 4); CV_Assert(scn == 3 || scn == 4);
...@@ -383,14 +383,14 @@ namespace ...@@ -383,14 +383,14 @@ namespace
dst.create(sz, CV_8UC1); dst.create(sz, CV_8UC1);
improc::RGB5x52Gray_gpu(src, green_bits, dst, stream); imgproc::RGB5x52Gray_gpu(src, green_bits, dst, stream);
break; break;
} }
case CV_GRAY2BGR: case CV_GRAY2BGRA: case CV_GRAY2BGR: case CV_GRAY2BGRA:
{ {
typedef void (*func_t)(const DevMem2D& src, const DevMem2D& dst, int dstcn, cudaStream_t stream); typedef void (*func_t)(const DevMem2D& src, const DevMem2D& dst, int dstcn, cudaStream_t stream);
static const func_t funcs[] = {improc::Gray2RGB_gpu_8u, 0, improc::Gray2RGB_gpu_16u, 0, 0, improc::Gray2RGB_gpu_32f}; static const func_t funcs[] = {imgproc::Gray2RGB_gpu_8u, 0, imgproc::Gray2RGB_gpu_16u, 0, 0, imgproc::Gray2RGB_gpu_32f};
if (dcn <= 0) dcn = 3; if (dcn <= 0) dcn = 3;
...@@ -410,7 +410,7 @@ namespace ...@@ -410,7 +410,7 @@ namespace
dst.create(sz, CV_8UC2); dst.create(sz, CV_8UC2);
improc::Gray2RGB5x5_gpu(src, dst, green_bits, stream); imgproc::Gray2RGB5x5_gpu(src, dst, green_bits, stream);
break; break;
} }
...@@ -419,7 +419,7 @@ namespace ...@@ -419,7 +419,7 @@ namespace
{ {
typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx,
const void* coeffs, cudaStream_t stream); const void* coeffs, cudaStream_t stream);
static const func_t funcs[] = {improc::RGB2YCrCb_gpu_8u, 0, improc::RGB2YCrCb_gpu_16u, 0, 0, improc::RGB2YCrCb_gpu_32f}; static const func_t funcs[] = {imgproc::RGB2YCrCb_gpu_8u, 0, imgproc::RGB2YCrCb_gpu_16u, 0, 0, imgproc::RGB2YCrCb_gpu_32f};
if (dcn <= 0) dcn = 3; if (dcn <= 0) dcn = 3;
CV_Assert((scn == 3 || scn == 4) && (dcn == 3 || dcn == 4)); CV_Assert((scn == 3 || scn == 4) && (dcn == 3 || dcn == 4));
...@@ -456,7 +456,7 @@ namespace ...@@ -456,7 +456,7 @@ namespace
{ {
typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx,
const void* coeffs, cudaStream_t stream); const void* coeffs, cudaStream_t stream);
static const func_t funcs[] = {improc::YCrCb2RGB_gpu_8u, 0, improc::YCrCb2RGB_gpu_16u, 0, 0, improc::YCrCb2RGB_gpu_32f}; static const func_t funcs[] = {imgproc::YCrCb2RGB_gpu_8u, 0, imgproc::YCrCb2RGB_gpu_16u, 0, 0, imgproc::YCrCb2RGB_gpu_32f};
if (dcn <= 0) dcn = 3; if (dcn <= 0) dcn = 3;
...@@ -485,7 +485,7 @@ namespace ...@@ -485,7 +485,7 @@ namespace
{ {
typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn,
const void* coeffs, cudaStream_t stream); const void* coeffs, cudaStream_t stream);
static const func_t funcs[] = {improc::RGB2XYZ_gpu_8u, 0, improc::RGB2XYZ_gpu_16u, 0, 0, improc::RGB2XYZ_gpu_32f}; static const func_t funcs[] = {imgproc::RGB2XYZ_gpu_8u, 0, imgproc::RGB2XYZ_gpu_16u, 0, 0, imgproc::RGB2XYZ_gpu_32f};
if (dcn <= 0) dcn = 3; if (dcn <= 0) dcn = 3;
...@@ -534,7 +534,7 @@ namespace ...@@ -534,7 +534,7 @@ namespace
{ {
typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn,
const void* coeffs, cudaStream_t stream); const void* coeffs, cudaStream_t stream);
static const func_t funcs[] = {improc::XYZ2RGB_gpu_8u, 0, improc::XYZ2RGB_gpu_16u, 0, 0, improc::XYZ2RGB_gpu_32f}; static const func_t funcs[] = {imgproc::XYZ2RGB_gpu_8u, 0, imgproc::XYZ2RGB_gpu_16u, 0, 0, imgproc::XYZ2RGB_gpu_32f};
if (dcn <= 0) dcn = 3; if (dcn <= 0) dcn = 3;
...@@ -584,8 +584,8 @@ namespace ...@@ -584,8 +584,8 @@ namespace
{ {
typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx,
int hrange, cudaStream_t stream); int hrange, cudaStream_t stream);
static const func_t funcs_hsv[] = {improc::RGB2HSV_gpu_8u, 0, 0, 0, 0, improc::RGB2HSV_gpu_32f}; static const func_t funcs_hsv[] = {imgproc::RGB2HSV_gpu_8u, 0, 0, 0, 0, imgproc::RGB2HSV_gpu_32f};
static const func_t funcs_hls[] = {improc::RGB2HLS_gpu_8u, 0, 0, 0, 0, improc::RGB2HLS_gpu_32f}; static const func_t funcs_hls[] = {imgproc::RGB2HLS_gpu_8u, 0, 0, 0, 0, imgproc::RGB2HLS_gpu_32f};
if (dcn <= 0) dcn = 3; if (dcn <= 0) dcn = 3;
...@@ -610,8 +610,8 @@ namespace ...@@ -610,8 +610,8 @@ namespace
{ {
typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx,
int hrange, cudaStream_t stream); int hrange, cudaStream_t stream);
static const func_t funcs_hsv[] = {improc::HSV2RGB_gpu_8u, 0, 0, 0, 0, improc::HSV2RGB_gpu_32f}; static const func_t funcs_hsv[] = {imgproc::HSV2RGB_gpu_8u, 0, 0, 0, 0, imgproc::HSV2RGB_gpu_32f};
static const func_t funcs_hls[] = {improc::HLS2RGB_gpu_8u, 0, 0, 0, 0, improc::HLS2RGB_gpu_32f}; static const func_t funcs_hls[] = {imgproc::HLS2RGB_gpu_8u, 0, 0, 0, 0, imgproc::HLS2RGB_gpu_32f};
if (dcn <= 0) dcn = 3; if (dcn <= 0) dcn = 3;
......
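A hedged sketch of the color-conversion wrapper that dispatches into the renamed imgproc namespace (conversion codes are the standard cv ones; both paths are covered by the 8u function tables above):

using namespace cv;
using namespace cv::gpu;

GpuMat bgr, gray, hsv;               // bgr assumed CV_8UC3, already uploaded
cvtColor(bgr, gray, CV_BGR2GRAY);
cvtColor(bgr, hsv,  CV_BGR2HSV);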
...@@ -77,6 +77,22 @@ namespace cv ...@@ -77,6 +77,22 @@ namespace cv
#else /* !defined (HAVE_CUDA) */ #else /* !defined (HAVE_CUDA) */
namespace cv
{
namespace gpu
{
namespace matrix_operations
{
void copy_to_with_mask(const DevMem2D& src, DevMem2D dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
void set_to_without_mask (DevMem2D dst, int depth, const double *scalar, int channels, const cudaStream_t & stream = 0);
void set_to_with_mask (DevMem2D dst, int depth, const double *scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, int channels, double alpha, double beta, const cudaStream_t & stream = 0);
}
}
}
void cv::gpu::GpuMat::upload(const Mat& m) void cv::gpu::GpuMat::upload(const Mat& m)
{ {
CV_DbgAssert(!m.empty()); CV_DbgAssert(!m.empty());
......
...@@ -53,7 +53,6 @@ const char* blacklist[] = ...@@ -53,7 +53,6 @@ const char* blacklist[] =
//"GPU-NppImageMeanStdDev", // different precision //"GPU-NppImageMeanStdDev", // different precision
//"GPU-NppImageExp", // different precision //"GPU-NppImageExp", // different precision
//"GPU-NppImageLog", // different precision //"GPU-NppImageLog", // different precision
//"GPU-NppImageMagnitude", // different precision
"GPU-NppImageCanny", // NPP_TEXTURE_BIND_ERROR "GPU-NppImageCanny", // NPP_TEXTURE_BIND_ERROR
//"GPU-NppImageResize", // different precision //"GPU-NppImageResize", // different precision
...@@ -61,8 +60,8 @@ const char* blacklist[] = ...@@ -61,8 +60,8 @@ const char* blacklist[] =
//"GPU-NppImageWarpPerspective", // different precision //"GPU-NppImageWarpPerspective", // different precision
//"GPU-NppImageIntegral", // different precision //"GPU-NppImageIntegral", // different precision
//"GPU-NppImageSobel", // ??? //"GPU-NppImageSobel", // sign error
//"GPU-NppImageScharr", // ??? //"GPU-NppImageScharr", // sign error
//"GPU-NppImageGaussianBlur", // different precision //"GPU-NppImageGaussianBlur", // different precision
0 0
}; };
......