Commit b08f6082 authored by Vladislav Vinogradov

added linear filters for int and float source types to the gpu module.

refactored gpu module.
parent ea040ce7
......@@ -50,7 +50,7 @@ namespace cv
// Simple lightweight structure that encapsulates an image pointer on the device, its pitch and its sizes.
// It is intended to be passed to nvcc-compiled code, since GpuMat depends on headers that nvcc can't compile.
template<typename T = unsigned char>
template <typename T>
struct DevMem2D_
{
typedef T elem_t;
......@@ -60,16 +60,21 @@ namespace cv
int rows;
T* ptr;
size_t step;
size_t elem_step;
DevMem2D_() : cols(0), rows(0), ptr(0), step(0) {}
DevMem2D_() : cols(0), rows(0), ptr(0), step(0), elem_step(0) {}
DevMem2D_(int rows_, int cols_, T *ptr_, size_t step_)
: cols(cols_), rows(rows_), ptr(ptr_), step(step_) {}
: cols(cols_), rows(rows_), ptr(ptr_), step(step_), elem_step(step_ / sizeof(T)) {}
template <typename U>
explicit DevMem2D_(const DevMem2D_<U>& d)
: cols(d.cols), rows(d.rows), ptr((T*)d.ptr), step(d.step), elem_step(d.step / sizeof(T)) {}
size_t elemSize() const { return elem_size; }
};
typedef DevMem2D_<> DevMem2D;
typedef DevMem2D_<unsigned char> DevMem2D;
typedef DevMem2D_<float> DevMem2Df;
typedef DevMem2D_<int> DevMem2Di;
}
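The new elem_step field stores the pitch in elements rather than bytes, so typed kernels can index rows without dividing by sizeof(T). A minimal hypothetical sketch (the kernel name and scaling are illustrative only):

__global__ void scale(cv::gpu::DevMem2Df img, float s)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < img.cols && y < img.rows)
        img.ptr[y * img.elem_step + x] *= s;   // elem_step == step / sizeof(float)
}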
......
......@@ -636,7 +636,7 @@ namespace cv
//! returns the separable filter engine with the specified filters
CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableFilter_GPU(const Ptr<BaseRowFilter_GPU>& rowFilter,
const Ptr<BaseColumnFilter_GPU>& columnFilter, bool rowFilterFirst = true);
const Ptr<BaseColumnFilter_GPU>& columnFilter);
//! returns horizontal 1D box filter
//! supports only CV_8UC1 source type and CV_32FC1 sum type
......@@ -658,7 +658,7 @@ namespace cv
//! only MORPH_ERODE and MORPH_DILATE are supported
//! supports CV_8UC1 and CV_8UC4 types
//! kernel must have CV_8UC1 type, one row, and cols == ksize.width * ksize.height
CV_EXPORTS Ptr<BaseFilter_GPU> getMorphologyFilter_GPU(int op, int type, const GpuMat& kernel, const Size& ksize,
CV_EXPORTS Ptr<BaseFilter_GPU> getMorphologyFilter_GPU(int op, int type, const Mat& kernel, const Size& ksize,
Point anchor=Point(-1,-1));
//! returns morphological filter engine. Only MORPH_ERODE and MORPH_DILATE are supported.
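A hedged host-side sketch of the revised getMorphologyFilter_GPU call, with the kernel now passed as a host cv::Mat (one row, ksize.width * ksize.height columns); the kernel values are assumptions:

cv::Size ksize(3, 3);
cv::Mat kernel = cv::Mat::ones(1, ksize.width * ksize.height, CV_8UC1);   // 3x3 structuring element
cv::Ptr<cv::gpu::BaseFilter_GPU> dilate =
    cv::gpu::getMorphologyFilter_GPU(cv::MORPH_DILATE, CV_8UC1, kernel, ksize);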
......@@ -667,25 +667,24 @@ namespace cv
//! returns 2D filter with the specified kernel
//! supports CV_8UC1 and CV_8UC4 types
//! kernel must have CV_8UC1 type, one row, and cols == ksize.width * ksize.height
CV_EXPORTS Ptr<BaseFilter_GPU> getLinearFilter_GPU(int srcType, int dstType, const GpuMat& kernel, const Size& ksize,
Point anchor = Point(-1, -1), int nDivisor = 1);
CV_EXPORTS Ptr<BaseFilter_GPU> getLinearFilter_GPU(int srcType, int dstType, const Mat& kernel, const Size& ksize,
Point anchor = Point(-1, -1));
//! returns the non-separable linear filter engine
CV_EXPORTS Ptr<FilterEngine_GPU> createLinearFilter_GPU(int srcType, int dstType, const Mat& kernel,
const Point& anchor = Point(-1,-1));
//! returns the primitive row filter with the specified kernel
CV_EXPORTS Ptr<BaseRowFilter_GPU> getLinearRowFilter_GPU(int srcType, int bufType, const GpuMat& rowKernel,
int anchor = -1, int nDivisor = 1);
CV_EXPORTS Ptr<BaseRowFilter_GPU> getLinearRowFilter_GPU(int srcType, int bufType, const Mat& rowKernel,
int anchor = -1);
//! returns the primitive column filter with the specified kernel
CV_EXPORTS Ptr<BaseColumnFilter_GPU> getLinearColumnFilter_GPU(int bufType, int dstType, const GpuMat& columnKernel,
int anchor = -1, int nDivisor = 1);
CV_EXPORTS Ptr<BaseColumnFilter_GPU> getLinearColumnFilter_GPU(int bufType, int dstType, const Mat& columnKernel,
int anchor = -1);
//! returns the separable linear filter engine
CV_EXPORTS Ptr<FilterEngine_GPU> createSeparableLinearFilter_GPU(int srcType, int dstType, const Mat& rowKernel,
const Mat& columnKernel, const Point& anchor = Point(-1,-1), bool rowFilterFirst = true);
const Mat& columnKernel, const Point& anchor = Point(-1,-1));
//! returns filter engine for the generalized Sobel operator
CV_EXPORTS Ptr<FilterEngine_GPU> createDerivFilter_GPU(int srcType, int dstType, int dx, int dy, int ksize);
......@@ -720,7 +719,7 @@ namespace cv
//! applies separable 2D linear filter to the image
CV_EXPORTS void sepFilter2D(const GpuMat& src, GpuMat& dst, int ddepth, const Mat& kernelX, const Mat& kernelY,
Point anchor = Point(-1,-1), bool rowFilterFirst = true);
Point anchor = Point(-1,-1));
//! applies generalized Sobel operator to the image
CV_EXPORTS void Sobel(const GpuMat& src, GpuMat& dst, int ddepth, int dx, int dy, int ksize = 3, double scale = 1);
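For illustration, a minimal sketch of the simplified sepFilter2D signature (the rowFilterFirst flag is gone; the input matrix and kernel size are hypothetical):

cv::Mat k = cv::getGaussianKernel(7, -1, CV_32F);   // 7-tap kernel, sigma derived from size
cv::gpu::GpuMat src(host32f), dst;                  // host32f: an assumed CV_32FC1 cv::Mat
cv::gpu::sepFilter2D(src, dst, CV_32F, k, k);       // anchor defaults to the kernel center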
......
......@@ -316,9 +316,9 @@ void cv::gpu::absdiff(const GpuMat& src, const Scalar& s, GpuMat& dst)
////////////////////////////////////////////////////////////////////////
// compare
namespace cv { namespace gpu { namespace matrix_operations
namespace cv { namespace gpu { namespace mathfunc
{
void compare_ne_8u(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst);
void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst);
void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst);
}}}
......@@ -346,7 +346,7 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
}
else
{
matrix_operations::compare_ne_8u(src1, src2, dst);
mathfunc::compare_ne_8uc4(src1, src2, dst);
}
}
else
......@@ -359,7 +359,7 @@ void cv::gpu::compare(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, int c
}
else
{
matrix_operations::compare_ne_32f(src1, src2, dst);
mathfunc::compare_ne_32f(src1, src2, dst);
}
}
}
......
......@@ -54,20 +54,18 @@ using namespace cv::gpu;
#define SHRT_MAX 32767
#endif
template <typename T>
struct TypeLimits {};
template <>
struct TypeLimits<short>
{
static __device__ short max() {return SHRT_MAX;}
};
template <>
struct TypeLimits<float>
{
static __device__ float max() {return FLT_MAX;}
};
namespace csbp_krnls
{
template <typename T> struct TypeLimits;
template <> struct TypeLimits<short>
{
static __device__ short max() {return SHRT_MAX;}
};
template <> struct TypeLimits<float>
{
static __device__ float max() {return FLT_MAX;}
};
}
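The traits are consumed from device code; a minimal hypothetical sketch of seeding a running minimum with the type's maximum value (the helper is not part of the commit):

template <typename T>
__device__ T minCost(const T* costs, int n)
{
    T best = csbp_krnls::TypeLimits<T>::max();
    for (int i = 0; i < n; ++i)
        best = costs[i] < best ? costs[i] : best;
    return best;
}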
///////////////////////////////////////////////////////////////
/////////////////////// load constants ////////////////////////
......
......@@ -58,19 +58,8 @@ namespace cv
static inline int divUp(int a, int b) { return (a % b == 0) ? a/b : a/b + 1; }
namespace matrix_operations
{
extern "C" void copy_to_with_mask(const DevMem2D& src, DevMem2D dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
extern "C" void set_to_without_mask (DevMem2D dst, int depth, const double *scalar, int channels, const cudaStream_t & stream = 0);
extern "C" void set_to_with_mask (DevMem2D dst, int depth, const double *scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
extern "C" void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, int channels, double alpha, double beta, const cudaStream_t & stream = 0);
}
template<class T>
inline void uploadConstant(const char* name, const T& value) { cudaSafeCall( cudaMemcpyToSymbol(name, &value, sizeof(T)) ); }
static inline void uploadConstant(const char* name, const T& value) { cudaSafeCall( cudaMemcpyToSymbol(name, &value, sizeof(T)) ); }
}
}
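A hedged usage sketch for the now-static uploadConstant helper; the symbol name is hypothetical, and string-based symbol lookup in cudaMemcpyToSymbol was legal in CUDA toolkits of this era:

// assumes a __constant__ float cScale symbol declared in the compiled .cu file
float scale = 0.5f;
cv::gpu::uploadConstant("cScale", scale);   // copies sizeof(float) bytes to the symbol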
......
......@@ -43,6 +43,7 @@
#include "opencv2/gpu/devmem2d.hpp"
#include "saturate_cast.hpp"
#include "safe_call.hpp"
#include "cuda_shared.hpp"
using namespace cv::gpu;
......@@ -50,6 +51,227 @@ using namespace cv::gpu;
#define FLT_MAX 3.402823466e+38F
#endif
/////////////////////////////////////////////////////////////////////////////////////////////////
// Linear filters
#define MAX_KERNEL_SIZE 16
namespace filter_krnls
{
__constant__ float cLinearKernel[MAX_KERNEL_SIZE];
}
namespace cv { namespace gpu { namespace filters
{
void loadLinearKernel(const float kernel[], int ksize)
{
cudaSafeCall( cudaMemcpyToSymbol(filter_krnls::cLinearKernel, kernel, ksize * sizeof(float)) );
}
}}}
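For illustration, loading a normalized 5-tap box kernel into the constant buffer (ksize must stay within MAX_KERNEL_SIZE; the kernel values are assumptions):

const float boxKernel[5] = { 0.2f, 0.2f, 0.2f, 0.2f, 0.2f };
cv::gpu::filters::loadLinearKernel(boxKernel, 5);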
namespace filter_krnls
{
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int KERNEL_SIZE, typename T, typename D>
__global__ void linearRowFilter(const T* src, size_t src_step, D* dst, size_t dst_step, int anchor, int width, int height)
{
__shared__ T smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3]; // per thread-row cache: [left halo | central block | right halo]
const int blockStartX = blockDim.x * blockIdx.x;
const int blockStartY = blockDim.y * blockIdx.y;
const int threadX = blockStartX + threadIdx.x;
const int prevThreadX = threadX - blockDim.x;
const int nextThreadX = threadX + blockDim.x;
const int threadY = blockStartY + threadIdx.y;
T* sDataRow = smem + threadIdx.y * blockDim.x * 3;
if (threadY < height)
{
const T* rowSrc = src + threadY * src_step;
// Stage the central block plus one block of halo on each side; out-of-range pixels read as 0.
sDataRow[threadIdx.x + blockDim.x] = threadX < width ? rowSrc[threadX] : 0;
sDataRow[threadIdx.x] = prevThreadX >= 0 ? rowSrc[prevThreadX] : 0;
sDataRow[(blockDim.x << 1) + threadIdx.x] = nextThreadX < width ? rowSrc[nextThreadX] : 0;
__syncthreads();
if (threadX < width)
{
float sum = 0;
// Shift to the first kernel tap for this output pixel; anchor offsets into the left halo.
sDataRow += threadIdx.x + blockDim.x - anchor;
#pragma unroll
for(int i = 0; i < KERNEL_SIZE; ++i)
sum += cLinearKernel[i] * sDataRow[i];
dst[threadY * dst_step + threadX] = saturate_cast<D>(sum);
}
}
}
}
namespace cv { namespace gpu { namespace filters
{
template <int KERNEL_SIZE, typename T, typename D>
void linearRowFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor)
{
const int BLOCK_DIM_X = 16;
const int BLOCK_DIM_Y = 16;
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
dim3 blocks(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
filter_krnls::linearRowFilter<BLOCK_DIM_X, BLOCK_DIM_Y, KERNEL_SIZE><<<blocks, threads>>>(src.ptr, src.elem_step,
dst.ptr, dst.elem_step, anchor, src.cols, src.rows);
cudaSafeCall( cudaThreadSynchronize() );
}
template <typename T, typename D>
inline void linearRowFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
{
typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor);
static const caller_t callers[] =
{linearRowFilter_caller<0 , T, D>, linearRowFilter_caller<1 , T, D>,
linearRowFilter_caller<2 , T, D>, linearRowFilter_caller<3 , T, D>,
linearRowFilter_caller<4 , T, D>, linearRowFilter_caller<5 , T, D>,
linearRowFilter_caller<6 , T, D>, linearRowFilter_caller<7 , T, D>,
linearRowFilter_caller<8 , T, D>, linearRowFilter_caller<9 , T, D>,
linearRowFilter_caller<10, T, D>, linearRowFilter_caller<11, T, D>,
linearRowFilter_caller<12, T, D>, linearRowFilter_caller<13, T, D>,
linearRowFilter_caller<14, T, D>, linearRowFilter_caller<15, T, D>};
loadLinearKernel(kernel, ksize);
callers[ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor);
}
void linearRowFilter_gpu_32s32s(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
{
linearRowFilter_gpu<int, int>(src, dst, kernel, ksize, anchor);
}
void linearRowFilter_gpu_32s32f(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
{
linearRowFilter_gpu<int, float>(src, dst, kernel, ksize, anchor);
}
void linearRowFilter_gpu_32f32s(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
{
linearRowFilter_gpu<float, int>(src, dst, kernel, ksize, anchor);
}
void linearRowFilter_gpu_32f32f(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
{
linearRowFilter_gpu<float, float>(src, dst, kernel, ksize, anchor);
}
}}}
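A hedged sketch of how the typed wrappers above would be driven from host code; the function, buffers, and kernel values are hypothetical:

// src and dst are assumed pitched CV_32FC1 device buffers (DevMem2D views) of equal size.
void smoothRows(const cv::gpu::DevMem2D& src, const cv::gpu::DevMem2D& dst)
{
    const float k[3] = { 0.25f, 0.5f, 0.25f };                       // 3-tap binomial kernel (assumed)
    cv::gpu::filters::linearRowFilter_gpu_32f32f(src, dst, k, 3, 1); // anchor = 1 (center tap)
}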
namespace filter_krnls
{
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int KERNEL_SIZE, typename T, typename D>
__global__ void linearColumnFilter(const T* src, size_t src_step, D* dst, size_t dst_step, int anchor, int width, int height)
{
__shared__ T smem[BLOCK_DIM_Y * BLOCK_DIM_X * 3]; // per thread-column cache: [top halo | central block | bottom halo]
const int blockStartX = blockDim.x * blockIdx.x;
const int blockStartY = blockDim.y * blockIdx.y;
const int threadX = blockStartX + threadIdx.x;
const int threadY = blockStartY + threadIdx.y;
const int prevThreadY = threadY - blockDim.y;
const int nextThreadY = threadY + blockDim.y;
const int smem_step = blockDim.x;
T* sDataColumn = smem + threadIdx.x;
if (threadX < width)
{
const T* colSrc = src + threadX;
// Stage the central block plus one block of vertical halo above and below; out-of-range pixels read as 0.
sDataColumn[(threadIdx.y + blockDim.y) * smem_step] = threadY < height ? colSrc[threadY * src_step] : 0;
sDataColumn[threadIdx.y * smem_step] = prevThreadY >= 0 ? colSrc[prevThreadY * src_step] : 0;
sDataColumn[(threadIdx.y + (blockDim.y << 1)) * smem_step] = nextThreadY < height ? colSrc[nextThreadY * src_step] : 0;
__syncthreads();
if (threadY < height)
{
float sum = 0;
// Shift to the first kernel tap for this output pixel; anchor offsets into the top halo.
sDataColumn += (threadIdx.y + blockDim.y - anchor) * smem_step;
#pragma unroll
for(int i = 0; i < KERNEL_SIZE; ++i)
sum += cLinearKernel[i] * sDataColumn[i * smem_step];
dst[threadY * dst_step + threadX] = saturate_cast<D>(sum);
}
}
}
}
namespace cv { namespace gpu { namespace filters
{
template <int KERNEL_SIZE, typename T, typename D>
void linearColumnFilter_caller(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor)
{
const int BLOCK_DIM_X = 16;
const int BLOCK_DIM_Y = 16;
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
dim3 blocks(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
filter_krnls::linearColumnFilter<BLOCK_DIM_X, BLOCK_DIM_Y, KERNEL_SIZE><<<blocks, threads>>>(src.ptr, src.elem_step,
dst.ptr, dst.elem_step, anchor, src.cols, src.rows);
cudaSafeCall( cudaThreadSynchronize() );
}
template <typename T, typename D>
inline void linearColumnFilter_gpu(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
{
typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, int anchor);
static const caller_t callers[] =
{linearColumnFilter_caller<0 , T, D>, linearColumnFilter_caller<1 , T, D>,
linearColumnFilter_caller<2 , T, D>, linearColumnFilter_caller<3 , T, D>,
linearColumnFilter_caller<4 , T, D>, linearColumnFilter_caller<5 , T, D>,
linearColumnFilter_caller<6 , T, D>, linearColumnFilter_caller<7 , T, D>,
linearColumnFilter_caller<8 , T, D>, linearColumnFilter_caller<9 , T, D>,
linearColumnFilter_caller<10, T, D>, linearColumnFilter_caller<11, T, D>,
linearColumnFilter_caller<12, T, D>, linearColumnFilter_caller<13, T, D>,
linearColumnFilter_caller<14, T, D>, linearColumnFilter_caller<15, T, D>};
loadLinearKernel(kernel, ksize);
callers[ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor);
}
void linearColumnFilter_gpu_32s32s(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
{
linearColumnFilter_gpu<int, int>(src, dst, kernel, ksize, anchor);
}
void linearColumnFilter_gpu_32s32f(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
{
linearColumnFilter_gpu<int, float>(src, dst, kernel, ksize, anchor);
}
void linearColumnFilter_gpu_32f32s(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
{
linearColumnFilter_gpu<float, int>(src, dst, kernel, ksize, anchor);
}
void linearColumnFilter_gpu_32f32f(const DevMem2D& src, const DevMem2D& dst, const float kernel[], int ksize, int anchor)
{
linearColumnFilter_gpu<float, float>(src, dst, kernel, ksize, anchor);
}
}}}
/////////////////////////////////////////////////////////////////////////////////////////////////
// Bilateral filters
namespace bf_krnls
{
__constant__ float* ctable_color;
......
......@@ -45,7 +45,7 @@
using namespace cv::gpu;
/////////////////////////////////// Remap ///////////////////////////////////////////////
namespace imgproc
namespace imgproc_krnls
{
texture<unsigned char, 2, cudaReadModeNormalizedFloat> tex_remap;
......@@ -123,7 +123,7 @@ namespace imgproc
}
}
namespace cv { namespace gpu { namespace improc
namespace cv { namespace gpu { namespace imgproc
{
void remap_gpu_1c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst)
{
......@@ -132,15 +132,15 @@ namespace cv { namespace gpu { namespace improc
grid.x = divUp(dst.cols, threads.x);
grid.y = divUp(dst.rows, threads.y);
imgproc::tex_remap.filterMode = cudaFilterModeLinear;
imgproc::tex_remap.addressMode[0] = imgproc::tex_remap.addressMode[1] = cudaAddressModeWrap;
imgproc_krnls::tex_remap.filterMode = cudaFilterModeLinear;
imgproc_krnls::tex_remap.addressMode[0] = imgproc_krnls::tex_remap.addressMode[1] = cudaAddressModeWrap;
cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
cudaSafeCall( cudaBindTexture2D(0, imgproc::tex_remap, src.ptr, desc, src.cols, src.rows, src.step) );
cudaSafeCall( cudaBindTexture2D(0, imgproc_krnls::tex_remap, src.ptr, desc, src.cols, src.rows, src.step) );
imgproc::remap_1c<<<grid, threads>>>(xmap.ptr, ymap.ptr, xmap.step, dst.ptr, dst.step, dst.cols, dst.rows);
imgproc_krnls::remap_1c<<<grid, threads>>>(xmap.ptr, ymap.ptr, xmap.step, dst.ptr, dst.step, dst.cols, dst.rows);
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaUnbindTexture(imgproc::tex_remap) );
cudaSafeCall( cudaUnbindTexture(imgproc_krnls::tex_remap) );
}
void remap_gpu_3c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst)
......@@ -150,7 +150,7 @@ namespace cv { namespace gpu { namespace improc
grid.x = divUp(dst.cols, threads.x);
grid.y = divUp(dst.rows, threads.y);
imgproc::remap_3c<<<grid, threads>>>(src.ptr, src.step, xmap.ptr, ymap.ptr, xmap.step, dst.ptr, dst.step, dst.cols, dst.rows);
imgproc_krnls::remap_3c<<<grid, threads>>>(src.ptr, src.step, xmap.ptr, ymap.ptr, xmap.step, dst.ptr, dst.step, dst.cols, dst.rows);
cudaSafeCall( cudaThreadSynchronize() );
}
......@@ -159,7 +159,7 @@ namespace cv { namespace gpu { namespace improc
/////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////
namespace imgproc
namespace imgproc_krnls
{
texture<uchar4, 2> tex_meanshift;
......@@ -254,7 +254,7 @@ namespace imgproc
}
}
namespace cv { namespace gpu { namespace improc
namespace cv { namespace gpu { namespace imgproc
{
extern "C" void meanShiftFiltering_gpu(const DevMem2D& src, DevMem2D dst, int sp, int sr, int maxIter, float eps)
{
......@@ -264,11 +264,11 @@ namespace cv { namespace gpu { namespace improc
grid.y = divUp(src.rows, threads.y);
cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
cudaSafeCall( cudaBindTexture2D( 0, imgproc::tex_meanshift, src.ptr, desc, src.cols, src.rows, src.step ) );
cudaSafeCall( cudaBindTexture2D( 0, imgproc_krnls::tex_meanshift, src.ptr, desc, src.cols, src.rows, src.step ) );
imgproc::meanshift_kernel<<< grid, threads >>>( dst.ptr, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
imgproc_krnls::meanshift_kernel<<< grid, threads >>>( dst.ptr, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaUnbindTexture( imgproc::tex_meanshift ) );
cudaSafeCall( cudaUnbindTexture( imgproc_krnls::tex_meanshift ) );
}
extern "C" void meanShiftProc_gpu(const DevMem2D& src, DevMem2D dstr, DevMem2D dstsp, int sp, int sr, int maxIter, float eps)
{
......@@ -278,17 +278,17 @@ namespace cv { namespace gpu { namespace improc
grid.y = divUp(src.rows, threads.y);
cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
cudaSafeCall( cudaBindTexture2D( 0, imgproc::tex_meanshift, src.ptr, desc, src.cols, src.rows, src.step ) );
cudaSafeCall( cudaBindTexture2D( 0, imgproc_krnls::tex_meanshift, src.ptr, desc, src.cols, src.rows, src.step ) );
imgproc::meanshiftproc_kernel<<< grid, threads >>>( dstr.ptr, dstr.step, dstsp.ptr, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
imgproc_krnls::meanshiftproc_kernel<<< grid, threads >>>( dstr.ptr, dstr.step, dstsp.ptr, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaUnbindTexture( imgproc::tex_meanshift ) );
cudaSafeCall( cudaUnbindTexture( imgproc_krnls::tex_meanshift ) );
}
}}}
/////////////////////////////////// drawColorDisp ///////////////////////////////////////////////
namespace imgproc
namespace imgproc_krnls
{
template <typename T>
__device__ unsigned int cvtPixel(T d, int ndisp, float S = 1, float V = 1)
......@@ -391,7 +391,7 @@ namespace imgproc
}
}
namespace cv { namespace gpu { namespace improc
namespace cv { namespace gpu { namespace imgproc
{
void drawColorDisp_gpu(const DevMem2D& src, const DevMem2D& dst, int ndisp, const cudaStream_t& stream)
{
......@@ -400,7 +400,7 @@ namespace cv { namespace gpu { namespace improc
grid.x = divUp(src.cols, threads.x << 2);
grid.y = divUp(src.rows, threads.y);
imgproc::drawColorDisp<<<grid, threads, 0, stream>>>(src.ptr, src.step, dst.ptr, dst.step, src.cols, src.rows, ndisp);
imgproc_krnls::drawColorDisp<<<grid, threads, 0, stream>>>(src.ptr, src.step, dst.ptr, dst.step, src.cols, src.rows, ndisp);
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......@@ -413,7 +413,7 @@ namespace cv { namespace gpu { namespace improc
grid.x = divUp(src.cols, threads.x << 1);
grid.y = divUp(src.rows, threads.y);
imgproc::drawColorDisp<<<grid, threads, 0, stream>>>(src.ptr, src.step / sizeof(short), dst.ptr, dst.step, src.cols, src.rows, ndisp);
imgproc_krnls::drawColorDisp<<<grid, threads, 0, stream>>>(src.ptr, src.step / sizeof(short), dst.ptr, dst.step, src.cols, src.rows, ndisp);
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......@@ -422,7 +422,7 @@ namespace cv { namespace gpu { namespace improc
/////////////////////////////////// reprojectImageTo3D ///////////////////////////////////////////////
namespace imgproc
namespace imgproc_krnls
{
__constant__ float cq[16];
......@@ -457,7 +457,7 @@ namespace imgproc
}
}
namespace cv { namespace gpu { namespace improc
namespace cv { namespace gpu { namespace imgproc
{
template <typename T>
inline void reprojectImageTo3D_caller(const DevMem2D_<T>& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream)
......@@ -467,9 +467,9 @@ namespace cv { namespace gpu { namespace improc
grid.x = divUp(disp.cols, threads.x);
grid.y = divUp(disp.rows, threads.y);
cudaSafeCall( cudaMemcpyToSymbol(imgproc::cq, q, 16 * sizeof(float)) );
cudaSafeCall( cudaMemcpyToSymbol(imgproc_krnls::cq, q, 16 * sizeof(float)) );
imgproc::reprojectImageTo3D<<<grid, threads, 0, stream>>>(disp.ptr, disp.step / sizeof(T), xyzw.ptr, xyzw.step / sizeof(float), disp.rows, disp.cols);
imgproc_krnls::reprojectImageTo3D<<<grid, threads, 0, stream>>>(disp.ptr, disp.step / sizeof(T), xyzw.ptr, xyzw.step / sizeof(float), disp.rows, disp.cols);
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......
......@@ -41,6 +41,9 @@
//M*/
#include "cuda_shared.hpp"
#include "saturate_cast.hpp"
#include "transform.hpp"
#include "vecmath.hpp"
using namespace cv::gpu;
......@@ -48,6 +51,9 @@ using namespace cv::gpu;
#define CV_PI 3.1415926535897932384626433832795f
#endif
//////////////////////////////////////////////////////////////////////////////////////
// Cart <-> Polar
namespace mathfunc_krnls
{
struct Nothing
......@@ -143,8 +149,8 @@ namespace cv { namespace gpu { namespace mathfunc
const float scale = angleInDegrees ? (float)(180.0f / CV_PI) : 1.f;
mathfunc_krnls::cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(
x.ptr, x.step / sizeof(float), y.ptr, y.step / sizeof(float),
mag.ptr, mag.step / sizeof(float), angle.ptr, angle.step / sizeof(float), scale, x.cols, x.rows);
x.ptr, x.elem_step, y.ptr, y.elem_step,
mag.ptr, mag.elem_step, angle.ptr, angle.elem_step, scale, x.cols, x.rows);
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......@@ -191,8 +197,8 @@ namespace cv { namespace gpu { namespace mathfunc
const float scale = angleInDegrees ? (float)(CV_PI / 180.0f) : 1.0f;
mathfunc_krnls::polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.ptr, mag.step / sizeof(float),
angle.ptr, angle.step / sizeof(float), scale, x.ptr, x.step / sizeof(float), y.ptr, y.step / sizeof(float), mag.cols, mag.rows);
mathfunc_krnls::polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.ptr, mag.elem_step,
angle.ptr, angle.elem_step, scale, x.ptr, x.elem_step, y.ptr, y.elem_step, mag.cols, mag.rows);
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
......@@ -210,3 +216,37 @@ namespace cv { namespace gpu { namespace mathfunc
callers[mag.ptr == 0](mag, angle, x, y, angleInDegrees, stream);
}
}}}
//////////////////////////////////////////////////////////////////////////////////////
// Compare
namespace mathfunc_krnls
{
template <typename T1, typename T2>
struct NotEqual
{
__device__ uchar operator()(const T1& src1, const T2& src2, int, int)
{
return static_cast<uchar>(static_cast<int>(src1 != src2) * 255);
}
};
}
namespace cv { namespace gpu { namespace mathfunc
{
template <typename T1, typename T2>
inline void compare_ne(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst)
{
mathfunc_krnls::NotEqual<T1, T2> op;
transform(static_cast< DevMem2D_<T1> >(src1), static_cast< DevMem2D_<T2> >(src2), dst, op, 0);
}
void compare_ne_8uc4(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst)
{
compare_ne<uint, uint>(src1, src2, dst);
}
void compare_ne_32f(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst)
{
compare_ne<float, float>(src1, src2, dst);
}
}}}
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#ifndef __OPENCV_GPU_TRANSFORM_HPP__
#define __OPENCV_GPU_TRANSFORM_HPP__
#include "cuda_shared.hpp"
#include "saturate_cast.hpp"
#include "vecmath.hpp"
namespace cv { namespace gpu { namespace algo_krnls
{
template <typename T, typename D, typename UnOp>
static __global__ void transform(const T* src, size_t src_step,
D* dst, size_t dst_step, int width, int height, UnOp op)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < width && y < height)
{
T src_data = src[y * src_step + x];
dst[y * dst_step + x] = op(src_data, x, y);
}
}
template <typename T1, typename T2, typename D, typename BinOp>
static __global__ void transform(const T1* src1, size_t src1_step, const T2* src2, size_t src2_step,
D* dst, size_t dst_step, int width, int height, BinOp op)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
if (x < width && y < height)
{
T1 src1_data = src1[y * src1_step + x];
T2 src2_data = src2[y * src2_step + x];
dst[y * dst_step + x] = op(src1_data, src2_data, x, y);
}
}
}}}
namespace cv
{
namespace gpu
{
template <typename T, typename D, typename UnOp>
static void transform(const DevMem2D_<T>& src, const DevMem2D_<D>& dst, UnOp op, cudaStream_t stream)
{
dim3 threads(16, 16, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(src.cols, threads.x);
grid.y = divUp(src.rows, threads.y);
algo_krnls::transform<<<grid, threads, 0, stream>>>(src.ptr, src.elem_step,
dst.ptr, dst.elem_step, src.cols, src.rows, op);
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
}
template <typename T1, typename T2, typename D, typename BinOp>
static void transform(const DevMem2D_<T1>& src1, const DevMem2D_<T2>& src2, const DevMem2D_<D>& dst, BinOp op, cudaStream_t stream)
{
dim3 threads(16, 16, 1);
dim3 grid(1, 1, 1);
grid.x = divUp(src1.cols, threads.x);
grid.y = divUp(src1.rows, threads.y);
algo_krnls::transform<<<grid, threads, 0, stream>>>(src1.ptr, src1.elem_step,
src2.ptr, src2.elem_step, dst.ptr, dst.elem_step, src1.cols, src1.rows, op);
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
}
}
}
#endif // __OPENCV_GPU_TRANSFORM_HPP__
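For example, a minimal sketch of plugging a custom unary functor into the new transform helper; the functor and wrapper function are hypothetical:

// Per-pixel functor: scale each float pixel; transform passes (value, x, y).
struct Scale
{
    float s;
    __device__ float operator()(float v, int, int) { return v * s; }
};

void scaleImage(const cv::gpu::DevMem2Df& src, const cv::gpu::DevMem2Df& dst, float s)
{
    Scale op;
    op.s = s;
    cv::gpu::transform(src, dst, op, 0);   // stream 0: synchronizes before returning
}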
#ifndef __OPENCV_GPU_VECMATH_HPP__
#define __OPENCV_GPU_VECMATH_HPP__
#include "cuda_shared.hpp"
namespace cv
{
namespace gpu
{
template<typename T, int N> struct TypeVec;
template<typename T> struct TypeVec<T, 1> { typedef T vec_t; };
template<> struct TypeVec<unsigned char, 2> { typedef uchar2 vec_t; };
template<> struct TypeVec<uchar2, 2> { typedef uchar2 vec_t; };
template<> struct TypeVec<unsigned char, 3> { typedef uchar3 vec_t; };
template<> struct TypeVec<uchar3, 3> { typedef uchar3 vec_t; };
template<> struct TypeVec<unsigned char, 4> { typedef uchar4 vec_t; };
template<> struct TypeVec<uchar4, 4> { typedef uchar4 vec_t; };
template<> struct TypeVec<char, 2> { typedef char2 vec_t; };
template<> struct TypeVec<char2, 2> { typedef char2 vec_t; };
template<> struct TypeVec<char, 3> { typedef char3 vec_t; };
template<> struct TypeVec<char3, 3> { typedef char3 vec_t; };
template<> struct TypeVec<char, 4> { typedef char4 vec_t; };
template<> struct TypeVec<char4, 4> { typedef char4 vec_t; };
template<> struct TypeVec<unsigned short, 2> { typedef ushort2 vec_t; };
template<> struct TypeVec<ushort2, 2> { typedef ushort2 vec_t; };
template<> struct TypeVec<unsigned short, 3> { typedef ushort3 vec_t; };
template<> struct TypeVec<ushort3, 3> { typedef ushort3 vec_t; };
template<> struct TypeVec<unsigned short, 4> { typedef ushort4 vec_t; };
template<> struct TypeVec<ushort4, 4> { typedef ushort4 vec_t; };
template<> struct TypeVec<short, 2> { typedef short2 vec_t; };
template<> struct TypeVec<short2, 2> { typedef short2 vec_t; };
template<> struct TypeVec<short, 3> { typedef short3 vec_t; };
template<> struct TypeVec<short3, 3> { typedef short3 vec_t; };
template<> struct TypeVec<short, 4> { typedef short4 vec_t; };
template<> struct TypeVec<short4, 4> { typedef short4 vec_t; };
template<> struct TypeVec<unsigned int, 2> { typedef uint2 vec_t; };
template<> struct TypeVec<uint2, 2> { typedef uint2 vec_t; };
template<> struct TypeVec<unsigned int, 3> { typedef uint3 vec_t; };
template<> struct TypeVec<uint3, 3> { typedef uint3 vec_t; };
template<> struct TypeVec<unsigned int, 4> { typedef uint4 vec_t; };
template<> struct TypeVec<uint4, 4> { typedef uint4 vec_t; };
template<> struct TypeVec<int, 2> { typedef int2 vec_t; };
template<> struct TypeVec<int2, 2> { typedef int2 vec_t; };
template<> struct TypeVec<int, 3> { typedef int3 vec_t; };
template<> struct TypeVec<int3, 3> { typedef int3 vec_t; };
template<> struct TypeVec<int, 4> { typedef int4 vec_t; };
template<> struct TypeVec<int4, 4> { typedef int4 vec_t; };
template<> struct TypeVec<float, 2> { typedef float2 vec_t; };
template<> struct TypeVec<float2, 2> { typedef float2 vec_t; };
template<> struct TypeVec<float, 3> { typedef float3 vec_t; };
template<> struct TypeVec<float3, 3> { typedef float3 vec_t; };
template<> struct TypeVec<float, 4> { typedef float4 vec_t; };
template<> struct TypeVec<float4, 4> { typedef float4 vec_t; };
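Illustrative compile-time use of the trait; the typedef names are hypothetical:

typedef cv::gpu::TypeVec<unsigned char, 4>::vec_t pixel8uc4_t;  // uchar4
typedef cv::gpu::TypeVec<float3, 3>::vec_t pixel32fc3_t;        // float3 (the trait is idempotent on vector types)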
static __device__ uchar4 operator+(const uchar4& a, const uchar4& b)
{
return make_uchar4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
}
static __device__ uchar4 operator-(const uchar4& a, const uchar4& b)
{
return make_uchar4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
}
static __device__ uchar4 operator*(const uchar4& a, const uchar4& b)
{
return make_uchar4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
}
static __device__ uchar4 operator/(const uchar4& a, const uchar4& b)
{
return make_uchar4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
}
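Note that these component-wise operators wrap modulo 256 (integer promotion, then narrowing through make_uchar4). A saturating variant is sketched below as an assumption, relying on saturate_cast<uchar> from saturate_cast.hpp; it is not part of this commit:

static __device__ uchar4 addSaturate(const uchar4& a, const uchar4& b)
{
    return make_uchar4(saturate_cast<uchar>(a.x + b.x), saturate_cast<uchar>(a.y + b.y),
                       saturate_cast<uchar>(a.z + b.z), saturate_cast<uchar>(a.w + b.w));
}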
template <typename T>
static __device__ uchar4 operator*(const uchar4& a, T s)
{
return make_uchar4(a.x * s, a.y * s, a.z * s, a.w * s);
}
template <typename T>
static __device__ uchar4 operator*(T s, const uchar4& a)
{
return a * s;
}
}
}
#endif // __OPENCV_GPU_VECMATH_HPP__
\ No newline at end of file
......@@ -69,6 +69,22 @@ void cv::gpu::Stream::enqueueConvert(const GpuMat& /*src*/, GpuMat& /*dst*/, int
#include "opencv2/gpu/stream_accessor.hpp"
namespace cv
{
namespace gpu
{
namespace matrix_operations
{
void copy_to_with_mask(const DevMem2D& src, DevMem2D dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
void set_to_without_mask (DevMem2D dst, int depth, const double *scalar, int channels, const cudaStream_t & stream = 0);
void set_to_with_mask (DevMem2D dst, int depth, const double *scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, int channels, double alpha, double beta, const cudaStream_t & stream = 0);
}
}
}
struct Stream::Impl
{
cudaStream_t stream;
......
......@@ -75,7 +75,7 @@ void cv::gpu::histRange(const GpuMat&, GpuMat*, const GpuMat*) { throw_nogpu();
namespace cv { namespace gpu
{
namespace improc
namespace imgproc
{
void remap_gpu_1c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst);
void remap_gpu_3c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst);
......@@ -142,7 +142,7 @@ namespace cv { namespace gpu
void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const GpuMat& ymap)
{
typedef void (*remap_gpu_t)(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst);
static const remap_gpu_t callers[] = {improc::remap_gpu_1c, 0, improc::remap_gpu_3c};
static const remap_gpu_t callers[] = {imgproc::remap_gpu_1c, 0, imgproc::remap_gpu_3c};
CV_Assert((src.type() == CV_8U || src.type() == CV_8UC3) && xmap.type() == CV_32F && ymap.type() == CV_32F);
......@@ -180,7 +180,7 @@ void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr,
eps = 1.f;
eps = (float)std::max(criteria.epsilon, 0.0);
improc::meanShiftFiltering_gpu(src, dst, sp, sr, maxIter, eps);
imgproc::meanShiftFiltering_gpu(src, dst, sp, sr, maxIter, eps);
}
////////////////////////////////////////////////////////////////////////
......@@ -207,7 +207,7 @@ void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int
eps = 1.f;
eps = (float)std::max(criteria.epsilon, 0.0);
improc::meanShiftProc_gpu(src, dstr, dstsp, sp, sr, maxIter, eps);
imgproc::meanShiftProc_gpu(src, dstr, dstsp, sp, sr, maxIter, eps);
}
////////////////////////////////////////////////////////////////////////
......@@ -223,7 +223,7 @@ namespace
out = dst;
out.create(src.size(), CV_8UC4);
improc::drawColorDisp_gpu((DevMem2D_<T>)src, out, ndisp, stream);
imgproc::drawColorDisp_gpu((DevMem2D_<T>)src, out, ndisp, stream);
dst = out;
}
......@@ -256,7 +256,7 @@ namespace
void reprojectImageTo3D_caller(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, const cudaStream_t& stream)
{
xyzw.create(disp.rows, disp.cols, CV_32FC4);
improc::reprojectImageTo3D_gpu((DevMem2D_<T>)disp, xyzw, Q.ptr<float>(), stream);
imgproc::reprojectImageTo3D_gpu((DevMem2D_<T>)disp, xyzw, Q.ptr<float>(), stream);
}
typedef void (*reprojectImageTo3D_caller_t)(const GpuMat& disp, GpuMat& xyzw, const Mat& Q, const cudaStream_t& stream);
......@@ -313,7 +313,7 @@ namespace
case CV_RGBA2BGR: case CV_RGB2BGR: case CV_BGRA2RGBA:
{
typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx, cudaStream_t stream);
static const func_t funcs[] = {improc::RGB2RGB_gpu_8u, 0, improc::RGB2RGB_gpu_16u, 0, 0, improc::RGB2RGB_gpu_32f};
static const func_t funcs[] = {imgproc::RGB2RGB_gpu_8u, 0, imgproc::RGB2RGB_gpu_16u, 0, 0, imgproc::RGB2RGB_gpu_32f};
CV_Assert(scn == 3 || scn == 4);
......@@ -338,7 +338,7 @@ namespace
dst.create(sz, CV_8UC2);
improc::RGB2RGB5x5_gpu(src, scn, dst, green_bits, bidx, stream);
imgproc::RGB2RGB5x5_gpu(src, scn, dst, green_bits, bidx, stream);
break;
}
......@@ -356,14 +356,14 @@ namespace
dst.create(sz, CV_MAKETYPE(depth, dcn));
improc::RGB5x52RGB_gpu(src, green_bits, dst, dcn, bidx, stream);
imgproc::RGB5x52RGB_gpu(src, green_bits, dst, dcn, bidx, stream);
break;
}
case CV_BGR2GRAY: case CV_BGRA2GRAY: case CV_RGB2GRAY: case CV_RGBA2GRAY:
{
typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int bidx, cudaStream_t stream);
static const func_t funcs[] = {improc::RGB2Gray_gpu_8u, 0, improc::RGB2Gray_gpu_16u, 0, 0, improc::RGB2Gray_gpu_32f};
static const func_t funcs[] = {imgproc::RGB2Gray_gpu_8u, 0, imgproc::RGB2Gray_gpu_16u, 0, 0, imgproc::RGB2Gray_gpu_32f};
CV_Assert(scn == 3 || scn == 4);
......@@ -383,14 +383,14 @@ namespace
dst.create(sz, CV_8UC1);
improc::RGB5x52Gray_gpu(src, green_bits, dst, stream);
imgproc::RGB5x52Gray_gpu(src, green_bits, dst, stream);
break;
}
case CV_GRAY2BGR: case CV_GRAY2BGRA:
{
typedef void (*func_t)(const DevMem2D& src, const DevMem2D& dst, int dstcn, cudaStream_t stream);
static const func_t funcs[] = {improc::Gray2RGB_gpu_8u, 0, improc::Gray2RGB_gpu_16u, 0, 0, improc::Gray2RGB_gpu_32f};
static const func_t funcs[] = {imgproc::Gray2RGB_gpu_8u, 0, imgproc::Gray2RGB_gpu_16u, 0, 0, imgproc::Gray2RGB_gpu_32f};
if (dcn <= 0) dcn = 3;
......@@ -410,7 +410,7 @@ namespace
dst.create(sz, CV_8UC2);
improc::Gray2RGB5x5_gpu(src, dst, green_bits, stream);
imgproc::Gray2RGB5x5_gpu(src, dst, green_bits, stream);
break;
}
......@@ -419,7 +419,7 @@ namespace
{
typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx,
const void* coeffs, cudaStream_t stream);
static const func_t funcs[] = {improc::RGB2YCrCb_gpu_8u, 0, improc::RGB2YCrCb_gpu_16u, 0, 0, improc::RGB2YCrCb_gpu_32f};
static const func_t funcs[] = {imgproc::RGB2YCrCb_gpu_8u, 0, imgproc::RGB2YCrCb_gpu_16u, 0, 0, imgproc::RGB2YCrCb_gpu_32f};
if (dcn <= 0) dcn = 3;
CV_Assert((scn == 3 || scn == 4) && (dcn == 3 || dcn == 4));
......@@ -456,7 +456,7 @@ namespace
{
typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx,
const void* coeffs, cudaStream_t stream);
static const func_t funcs[] = {improc::YCrCb2RGB_gpu_8u, 0, improc::YCrCb2RGB_gpu_16u, 0, 0, improc::YCrCb2RGB_gpu_32f};
static const func_t funcs[] = {imgproc::YCrCb2RGB_gpu_8u, 0, imgproc::YCrCb2RGB_gpu_16u, 0, 0, imgproc::YCrCb2RGB_gpu_32f};
if (dcn <= 0) dcn = 3;
......@@ -485,7 +485,7 @@ namespace
{
typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn,
const void* coeffs, cudaStream_t stream);
static const func_t funcs[] = {improc::RGB2XYZ_gpu_8u, 0, improc::RGB2XYZ_gpu_16u, 0, 0, improc::RGB2XYZ_gpu_32f};
static const func_t funcs[] = {imgproc::RGB2XYZ_gpu_8u, 0, imgproc::RGB2XYZ_gpu_16u, 0, 0, imgproc::RGB2XYZ_gpu_32f};
if (dcn <= 0) dcn = 3;
......@@ -534,7 +534,7 @@ namespace
{
typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn,
const void* coeffs, cudaStream_t stream);
static const func_t funcs[] = {improc::XYZ2RGB_gpu_8u, 0, improc::XYZ2RGB_gpu_16u, 0, 0, improc::XYZ2RGB_gpu_32f};
static const func_t funcs[] = {imgproc::XYZ2RGB_gpu_8u, 0, imgproc::XYZ2RGB_gpu_16u, 0, 0, imgproc::XYZ2RGB_gpu_32f};
if (dcn <= 0) dcn = 3;
......@@ -584,8 +584,8 @@ namespace
{
typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx,
int hrange, cudaStream_t stream);
static const func_t funcs_hsv[] = {improc::RGB2HSV_gpu_8u, 0, 0, 0, 0, improc::RGB2HSV_gpu_32f};
static const func_t funcs_hls[] = {improc::RGB2HLS_gpu_8u, 0, 0, 0, 0, improc::RGB2HLS_gpu_32f};
static const func_t funcs_hsv[] = {imgproc::RGB2HSV_gpu_8u, 0, 0, 0, 0, imgproc::RGB2HSV_gpu_32f};
static const func_t funcs_hls[] = {imgproc::RGB2HLS_gpu_8u, 0, 0, 0, 0, imgproc::RGB2HLS_gpu_32f};
if (dcn <= 0) dcn = 3;
......@@ -610,8 +610,8 @@ namespace
{
typedef void (*func_t)(const DevMem2D& src, int srccn, const DevMem2D& dst, int dstcn, int bidx,
int hrange, cudaStream_t stream);
static const func_t funcs_hsv[] = {improc::HSV2RGB_gpu_8u, 0, 0, 0, 0, improc::HSV2RGB_gpu_32f};
static const func_t funcs_hls[] = {improc::HLS2RGB_gpu_8u, 0, 0, 0, 0, improc::HLS2RGB_gpu_32f};
static const func_t funcs_hsv[] = {imgproc::HSV2RGB_gpu_8u, 0, 0, 0, 0, imgproc::HSV2RGB_gpu_32f};
static const func_t funcs_hls[] = {imgproc::HLS2RGB_gpu_8u, 0, 0, 0, 0, imgproc::HLS2RGB_gpu_32f};
if (dcn <= 0) dcn = 3;
......
......@@ -77,6 +77,22 @@ namespace cv
#else /* !defined (HAVE_CUDA) */
namespace cv
{
namespace gpu
{
namespace matrix_operations
{
void copy_to_with_mask(const DevMem2D& src, DevMem2D dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
void set_to_without_mask (DevMem2D dst, int depth, const double *scalar, int channels, const cudaStream_t & stream = 0);
void set_to_with_mask (DevMem2D dst, int depth, const double *scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream = 0);
void convert_to(const DevMem2D& src, int sdepth, DevMem2D dst, int ddepth, int channels, double alpha, double beta, const cudaStream_t & stream = 0);
}
}
}
void cv::gpu::GpuMat::upload(const Mat& m)
{
CV_DbgAssert(!m.empty());
......
......@@ -53,7 +53,6 @@ const char* blacklist[] =
//"GPU-NppImageMeanStdDev", // different precision
//"GPU-NppImageExp", // different precision
//"GPU-NppImageLog", // different precision
//"GPU-NppImageMagnitude", // different precision
"GPU-NppImageCanny", // NPP_TEXTURE_BIND_ERROR
//"GPU-NppImageResize", // different precision
......@@ -61,8 +60,8 @@ const char* blacklist[] =
//"GPU-NppImageWarpPerspective", // different precision
//"GPU-NppImageIntegral", // different precision
//"GPU-NppImageSobel", // ???
//"GPU-NppImageScharr", // ???
//"GPU-NppImageSobel", // sign error
//"GPU-NppImageScharr", // sign error
//"GPU-NppImageGaussianBlur", // different precision
0
};
......