implemented gpu::remap for all types

47d68f69 · Vladislav Vinogradov · 78542854 · 47d68f69 · 47d68f69 · 47d68f69
Commit 47d68f69 authored Aug 31, 2011 by Vladislav Vinogradov
12 changed files
--- a/modules/gpu/include/opencv2/gpu/devmem2d.hpp
+++ b/modules/gpu/include/opencv2/gpu/devmem2d.hpp
@@ -66,6 +66,9 @@ namespace cv
        template <typename T> struct DevMem2D_
        {            
+            typedef T elem_type;
+            typedef int index_type;
            int cols;
            int rows;
            T* data;
@@ -79,8 +82,7 @@ namespace cv
            template <typename U>            
            explicit DevMem2D_(const DevMem2D_<U>& d)
                : cols(d.cols), rows(d.rows), data((T*)d.data), step(d.step) {}
-            typedef T elem_type;
            enum { elem_size = sizeof(elem_type) };
            __CV_GPU_HOST_DEVICE__ size_t elemSize() const { return elem_size; }
@@ -89,6 +91,9 @@ namespace cv
 			__CV_GPU_HOST_DEVICE__ operator T*() const { return data; }
+            __CV_GPU_HOST_DEVICE__ T& operator ()(int y, int x) { return ptr(y)[x]; }
+            __CV_GPU_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; }
 #if defined(__DEVCLASES_ADD_THRUST_BEGIN_END__)    
            thrust::device_ptr<T> begin() const { return thrust::device_ptr<T>(data); }
            thrust::device_ptr<T> end() const { return thrust::device_ptr<T>(data) + cols * rows; }
@@ -97,19 +102,24 @@ namespace cv
        template<typename T> struct PtrStep_
        {
+            typedef T elem_type;
+            typedef int index_type;
            T* data;
            size_t step;
            PtrStep_() : data(0), step(0) {}            
            PtrStep_(const DevMem2D_<T>& mem) : data(mem.data), step(mem.step) {}
-            typedef T elem_type;
            enum { elem_size = sizeof(elem_type) };
            __CV_GPU_HOST_DEVICE__ size_t elemSize() const { return elem_size; }
            __CV_GPU_HOST_DEVICE__ T* ptr(int y = 0) { return (T*)( (char*)data + y * step); }
            __CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return (const T*)( (const char*)data + y * step); }
+            __CV_GPU_HOST_DEVICE__ T& operator ()(int y, int x) { return ptr(y)[x]; }
+            __CV_GPU_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; }
 #if defined(__DEVCLASES_ADD_THRUST_BEGIN_END__)    
            thrust::device_ptr<T> begin() const { return thrust::device_ptr<T>(data); }
 #endif
@@ -124,7 +134,10 @@ namespace cv
                PtrStep_<T>::step /= PtrStep_<T>::elem_size;             
            }
            __CV_GPU_HOST_DEVICE__ T* ptr(int y = 0) { return PtrStep_<T>::data + y * PtrStep_<T>::step; }
-            __CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return PtrStep_<T>::data + y * PtrStep_<T>::step; }                    
+            __CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return PtrStep_<T>::data + y * PtrStep_<T>::step; }  
+            __CV_GPU_HOST_DEVICE__ T& operator ()(int y, int x) { return ptr(y)[x]; }
+            __CV_GPU_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; }                  
        };
        typedef DevMem2D_<unsigned char> DevMem2D;

--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -596,8 +596,9 @@ namespace cv
        ////////////////////////////// Image processing //////////////////////////////
        //! DST[x,y] = SRC[xmap[x,y],ymap[x,y]], sampled with the requested interpolation method.
-        //! supports CV_8UC1, CV_8UC3 source types and CV_32FC1 map type
+        //! supports CV_32FC1 map type
-        CV_EXPORTS void remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const GpuMat& ymap);
+        CV_EXPORTS void remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const GpuMat& ymap,
+            int interpolation, int borderMode = BORDER_CONSTANT, const Scalar& borderValue = Scalar());
        //! Does mean shift filtering on GPU.
        CV_EXPORTS void meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr,
@@ -761,10 +762,10 @@ namespace cv
        CV_EXPORTS void upsample(const GpuMat& src, GpuMat &dst, Stream& stream = Stream::Null());
        //! smoothes the source image and downsamples it
-        CV_EXPORTS void pyrDown(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
+        CV_EXPORTS void pyrDown(const GpuMat& src, GpuMat& dst, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null());
        //! upsamples the source image and then smoothes it
-        CV_EXPORTS void pyrUp(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
+        CV_EXPORTS void pyrUp(const GpuMat& src, GpuMat& dst, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null());
        //! performs linear blending of two images
        //! to avoid accuracy errors, the sum of weights shouldn't be very close to zero

--- a/modules/gpu/src/cuda/filters.cu
+++ b/modules/gpu/src/cuda/filters.cu
@@ -242,9 +242,9 @@ namespace filter_krnls
        {
            const T* srcCol = src.ptr() + x;
-            sDataColumn[ threadIdx.y                    * BLOCK_DIM_X] = b.at_low(y - BLOCK_DIM_Y, srcCol);
+            sDataColumn[ threadIdx.y                    * BLOCK_DIM_X] = b.at_low(y - BLOCK_DIM_Y, srcCol, src.step);
-            sDataColumn[(threadIdx.y + BLOCK_DIM_Y)     * BLOCK_DIM_X] = b.at_high(y, srcCol);
+            sDataColumn[(threadIdx.y + BLOCK_DIM_Y)     * BLOCK_DIM_X] = b.at_high(y, srcCol, src.step);
-            sDataColumn[(threadIdx.y + BLOCK_DIM_Y * 2) * BLOCK_DIM_X] = b.at_high(y + BLOCK_DIM_Y, srcCol);
+            sDataColumn[(threadIdx.y + BLOCK_DIM_Y * 2) * BLOCK_DIM_X] = b.at_high(y + BLOCK_DIM_Y, srcCol, src.step);
            __syncthreads();
@@ -273,7 +273,7 @@ namespace cv { namespace gpu { namespace filters
        dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
        dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
-        B<T> b(src.rows, src.step);
+        B<T> b(src.rows);
        if (!b.is_range_safe(-BLOCK_DIM_Y, (grid.y + 1) * BLOCK_DIM_Y - 1))
        {

--- a/modules/gpu/src/cuda/imgproc.cu
+++ b/modules/gpu/src/cuda/imgproc.cu
@@ -44,6 +44,8 @@
 #include "opencv2/gpu/device/border_interpolate.hpp"
 #include "opencv2/gpu/device/vec_traits.hpp"
 #include "opencv2/gpu/device/vec_math.hpp"
+#include "opencv2/gpu/device/saturate_cast.hpp"
+#include "opencv2/gpu/device/utility.hpp"
 using namespace cv::gpu;
 using namespace cv::gpu::device;
@@ -51,109 +53,119 @@ using namespace cv::gpu::device;
 /////////////////////////////////// Remap ///////////////////////////////////////////////
 namespace cv { namespace gpu { namespace imgproc
 {
-    texture<unsigned char, 2, cudaReadModeNormalizedFloat> tex_remap(0, cudaFilterModeLinear, cudaAddressModeWrap);
+    // cudaAddressModeClamp == BrdReplicate
+    /*texture<uchar, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_remap_uchar_LinearFilter(0, cudaFilterModeLinear, cudaAddressModeClamp);
-    __global__ void remap_1c(const float* mapx, const float* mapy, size_t map_step, uchar* out, size_t out_step, int width, int height)
+    __global__ void remap_uchar_LinearFilter(const PtrStepf mapx, const PtrStepf mapy, DevMem2D dst)
    {    
-        int x = blockDim.x * blockIdx.x + threadIdx.x;
+        const int x = blockDim.x * blockIdx.x + threadIdx.x;
-        int y = blockDim.y * blockIdx.y + threadIdx.y;
+        const int y = blockDim.y * blockIdx.y + threadIdx.y;
-        if (x < width && y < height)
-        {
-            int idx = y * (map_step >> 2) + x; /* map_step >> 2  <=> map_step / sizeof(float)*/
-            float xcoo = mapx[idx];
+        if (x < dst.cols && y < dst.rows)
-            float ycoo = mapy[idx];
+        {
+            const float xcoo = mapx.ptr(y)[x];
+            const float ycoo = mapy.ptr(y)[x];
-            out[y * out_step + x] = (unsigned char)(255.f * tex2D(tex_remap, xcoo, ycoo));            
+            dst.ptr(y)[x] = 255.0f * tex2D(tex_remap_uchar_LinearFilter, xcoo, ycoo);            
        }
-    }
+    }*/
-    __global__ void remap_3c(const uchar* src, size_t src_step, const float* mapx, const float* mapy,
+    template <typename Ptr2D, typename T> __global__ void remap(const Ptr2D src, const PtrStepf mapx, const PtrStepf mapy, DevMem2D_<T> dst)
-                             size_t map_step, uchar* dst, size_t dst_step, int width, int height)
+    {
-    {    
        const int x = blockDim.x * blockIdx.x + threadIdx.x;
        const int y = blockDim.y * blockIdx.y + threadIdx.y;
-        if (x < width && y < height)
+        if (x < dst.cols && y < dst.rows)
        {
-            const int idx = y * (map_step >> 2) + x; /* map_step >> 2  <=> map_step / sizeof(float)*/
+            const float xcoo = mapx.ptr(y)[x];
+            const float ycoo = mapy.ptr(y)[x];
-            const float xcoo = mapx[idx];
-            const float ycoo = mapy[idx];
-            uchar3 out = make_uchar3(0, 0, 0);
-            if (xcoo >= 0 && xcoo < width - 1 && ycoo >= 0 && ycoo < height - 1)
+            dst.ptr(y)[x] = saturate_cast<T>(src(ycoo, xcoo));
-            {
-                const int x1 = __float2int_rd(xcoo);
-                const int y1 = __float2int_rd(ycoo);
-                const int x2 = x1 + 1;
-                const int y2 = y1 + 1;
-                uchar src_reg = *(src + y1 * src_step + 3 * x1);
-                out.x += src_reg * (x2 - xcoo) * (y2 - ycoo);
-                src_reg = *(src + y1 * src_step + 3 * x1 + 1);
-                out.y += src_reg * (x2 - xcoo) * (y2 - ycoo);
-                src_reg = *(src + y1 * src_step + 3 * x1 + 2);
-                out.z += src_reg * (x2 - xcoo) * (y2 - ycoo);
-                src_reg = *(src + y1 * src_step + 3 * x2);                
-                out.x += src_reg * (xcoo - x1) * (y2 - ycoo);
-                src_reg = *(src + y1 * src_step + 3 * x2 + 1); 
-                out.y += src_reg * (xcoo - x1) * (y2 - ycoo);
-                src_reg = *(src + y1 * src_step + 3 * x2 + 2); 
-                out.z += src_reg * (xcoo - x1) * (y2 - ycoo);
-                src_reg = *(src + y2 * src_step + 3 * x1);                
-                out.x += src_reg * (x2 - xcoo) * (ycoo - y1);
-                src_reg = *(src + y2 * src_step + 3 * x1 + 1); 
-                out.y += src_reg * (x2 - xcoo) * (ycoo - y1);
-                src_reg = *(src + y2 * src_step + 3 * x1 + 2); 
-                out.z += src_reg * (x2 - xcoo) * (ycoo - y1);
-                src_reg = *(src + y2 * src_step + 3 * x2);                
-                out.x += src_reg * (xcoo - x1) * (ycoo - y1);
-                src_reg = *(src + y2 * src_step + 3 * x2 + 1);  
-                out.y += src_reg * (xcoo - x1) * (ycoo - y1);
-                src_reg = *(src + y2 * src_step + 3 * x2 + 2);  
-                out.z += src_reg * (xcoo - x1) * (ycoo - y1);
-            }
-            /**(uchar3*)(dst + y * dst_step + 3 * x) = out;*/
-            *(dst + y * dst_step + 3 * x) = out.x;
-            *(dst + y * dst_step + 3 * x + 1) = out.y;
-            *(dst + y * dst_step + 3 * x + 2) = out.z;
        }
    }
-    void remap_gpu_1c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst)
+    template <template <typename> class Filter, template <typename> class B, typename T> 
+    void remap_caller(const DevMem2D_<T>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<T>& dst, T borderValue)
    {
-        dim3 threads(16, 16, 1);
+        dim3 block(32, 8);
-        dim3 grid(1, 1, 1);
+        dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-        grid.x = divUp(dst.cols, threads.x);
-        grid.y = divUp(dst.rows, threads.y);
-        TextureBinder tex(&tex_remap, src);
+        B<T> brd(src.rows, src.cols, borderValue);
+        BorderReader< PtrStep_<T>, B<T> > brd_src(src, brd);
+        Filter< BorderReader< PtrStep_<T>, B<T> > > filter_src(brd_src);
-        remap_1c<<<grid, threads>>>(xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);
+        remap<<<grid, block>>>(filter_src, mapx, mapy, dst);
        cudaSafeCall( cudaGetLastError() );
        cudaSafeCall( cudaDeviceSynchronize() );
    }
-    void remap_gpu_3c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst)
+    // Generates an explicit specialization of remap_caller<filter, BrdReplicate> for `type`.
+    // The specialization constructs a TextureBinder for tex_remap_<type>_<filter> over src and
+    // launches the kernel remap_<type>_<filter> on a 16x16 block grid covering dst; the trailing
+    // unnamed `type` parameter (the border value) is not used.
+    // Both names are assembled with "##", which may only join identifier fragments: pasting an
+    // identifier onto "," or "<<<" does not form a valid preprocessing token and is rejected by
+    // conforming preprocessors, so no "##" follows `filter` below.
+#define OPENCV_GPU_IMPLEMENT_REMAP_TEX(type, filter) \
+    template <> void remap_caller<filter, BrdReplicate>(const DevMem2D_<type>& src, const DevMem2Df& mapx, const DevMem2Df& mapy, const DevMem2D_<type>& dst, type) \
+    { \
+        const dim3 block(16, 16); \
+        const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y)); \
+        TextureBinder tex(&tex_remap_ ## type ## _ ## filter, src); \
+        remap_ ## type ## _ ## filter <<<grid, block>>>(mapx, mapy, dst); \
+        cudaSafeCall( cudaGetLastError() ); \
+        cudaSafeCall( cudaDeviceSynchronize() ); \
+    }
+    // Invocation is disabled: the matching texture and kernel definitions are commented out.
+    //OPENCV_GPU_IMPLEMENT_REMAP_TEX(uchar, LinearFilter)
+#undef OPENCV_GPU_IMPLEMENT_REMAP_TEX
+    template <typename T> void remap_gpu(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, 
+        int interpolation, int borderMode, const double borderValue[4])
    {
-        dim3 threads(32, 8, 1);
+        typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D_<T>& dst, T borderValue);
-        dim3 grid(1, 1, 1);
-        grid.x = divUp(dst.cols, threads.x);
-        grid.y = divUp(dst.rows, threads.y);
-        remap_3c<<<grid, threads>>>(src.data, src.step, xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);
+        static const caller_t callers[2][3] = 
-        cudaSafeCall( cudaGetLastError() );
+        {
+            { remap_caller<PointFilter, BrdReflect101>, remap_caller<PointFilter, BrdReplicate>, remap_caller<PointFilter, BrdConstant> },
+            { remap_caller<LinearFilter, BrdReflect101>, remap_caller<LinearFilter, BrdReplicate>, remap_caller<LinearFilter, BrdConstant> }
+        };
-        cudaSafeCall( cudaDeviceSynchronize() );
+        typename VecTraits<T>::elem_type brd[] = {(typename VecTraits<T>::elem_type)borderValue[0], (typename VecTraits<T>::elem_type)borderValue[1], (typename VecTraits<T>::elem_type)borderValue[2], (typename VecTraits<T>::elem_type)borderValue[3]};
+        callers[interpolation][borderMode](static_cast< DevMem2D_<T> >(src), xmap, ymap, static_cast< DevMem2D_<T> >(dst), VecTraits<T>::make(brd));
    }
+    template void remap_gpu<uchar >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
+    template void remap_gpu<uchar2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
+    template void remap_gpu<uchar3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
+    template void remap_gpu<uchar4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
+    template void remap_gpu<schar>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
+    template void remap_gpu<char2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
+    template void remap_gpu<char3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
+    template void remap_gpu<char4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
+    template void remap_gpu<ushort >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
+    template void remap_gpu<ushort2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
+    template void remap_gpu<ushort3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
+    template void remap_gpu<ushort4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
+    template void remap_gpu<short >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
+    template void remap_gpu<short2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
+    template void remap_gpu<short3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
+    template void remap_gpu<short4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
+    template void remap_gpu<uint >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
+    template void remap_gpu<uint2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
+    template void remap_gpu<uint3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
+    template void remap_gpu<uint4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
+    template void remap_gpu<int >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
+    template void remap_gpu<int2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
+    template void remap_gpu<int3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
+    template void remap_gpu<int4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
+    template void remap_gpu<float >(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
+    template void remap_gpu<float2>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
+    template void remap_gpu<float3>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
+    template void remap_gpu<float4>(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);
 /////////////////////////////////// MeanShiftfiltering ///////////////////////////////////////////////
    texture<uchar4, 2> tex_meanshift;
@@ -541,9 +553,9 @@ namespace cv { namespace gpu { namespace imgproc
        }
    }
-    template <typename B>
+    template <typename BR, typename BC>
    __global__ void cornerHarris_kernel(const int cols, const int rows, const int block_size, const float k,
-                                        PtrStep dst, B border_row, B border_col)
+                                        PtrStep dst, BR border_row, BC border_col)
    {
        const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
        const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -561,10 +573,10 @@ namespace cv { namespace gpu { namespace imgproc
            for (int i = ibegin; i < iend; ++i)
            {
-                int y = border_col.idx(i);
+                int y = border_col.idx_row(i);
                for (int j = jbegin; j < jend; ++j)
                {
-                    int x = border_row.idx(j);
+                    int x = border_row.idx_col(j);
                    float dx = tex2D(harrisDxTex, x, y);
                    float dy = tex2D(harrisDyTex, x, y);
                    a += dx * dx;
@@ -596,7 +608,7 @@ namespace cv { namespace gpu { namespace imgproc
        {
        case BORDER_REFLECT101_GPU:
            cornerHarris_kernel<<<grid, threads>>>(
-                    cols, rows, block_size, k, dst, BrdReflect101(cols), BrdReflect101(rows));
+                    cols, rows, block_size, k, dst, BrdRowReflect101<void>(cols), BrdColReflect101<void>(rows));
            break;
        case BORDER_REPLICATE_GPU:
            harrisDxTex.addressMode[0] = cudaAddressModeClamp;
@@ -656,9 +668,9 @@ namespace cv { namespace gpu { namespace imgproc
    }
-    template <typename B>
+    template <typename BR, typename BC>
    __global__ void cornerMinEigenVal_kernel(const int cols, const int rows, const int block_size, 
-                                             PtrStep dst, B border_row, B border_col)
+                                             PtrStep dst, BR border_row, BC border_col)
    {
        const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
        const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
@@ -676,10 +688,10 @@ namespace cv { namespace gpu { namespace imgproc
            for (int i = ibegin; i < iend; ++i)
            {
-                int y = border_col.idx(i);
+                int y = border_col.idx_row(i);
                for (int j = jbegin; j < jend; ++j)
                {
-                    int x = border_row.idx(j);
+                    int x = border_row.idx_col(j);
                    float dx = tex2D(minEigenValDxTex, x, y);
                    float dy = tex2D(minEigenValDyTex, x, y);
                    a += dx * dx;
@@ -713,7 +725,7 @@ namespace cv { namespace gpu { namespace imgproc
        {
        case BORDER_REFLECT101_GPU:
            cornerMinEigenVal_kernel<<<grid, threads>>>(
-                    cols, rows, block_size, dst, BrdReflect101(cols), BrdReflect101(rows));
+                    cols, rows, block_size, dst, BrdRowReflect101<void>(cols), BrdColReflect101<void>(rows));
            break;
        case BORDER_REPLICATE_GPU:
            minEigenValDxTex.addressMode[0] = cudaAddressModeClamp;
@@ -981,7 +993,7 @@ namespace cv { namespace gpu { namespace imgproc
    //////////////////////////////////////////////////////////////////////////
    // pyrDown
-    template <typename T> __global__ void pyrDown(const PtrStep_<T> src, PtrStep_<T> dst, const BrdReflect101 rowBrd, const BrdReflect101 colBrd, int dst_cols)
+    template <typename T, typename B> __global__ void pyrDown(const PtrStep_<T> src, PtrStep_<T> dst, const B b, int dst_cols)
    {
        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
@@ -996,11 +1008,11 @@ namespace cv { namespace gpu { namespace imgproc
        sum = VecTraits<value_type>::all(0);
-        sum = sum + 0.0625f * src.ptr(colBrd.idx(src_y - 2))[rowBrd.idx(x)];
+        sum = sum + 0.0625f * b.at(src_y - 2, x, src.data, src.step);
-        sum = sum + 0.25f   * src.ptr(colBrd.idx(src_y - 1))[rowBrd.idx(x)];
+        sum = sum + 0.25f   * b.at(src_y - 1, x, src.data, src.step);
-        sum = sum + 0.375f  * src.ptr(colBrd.idx(src_y    ))[rowBrd.idx(x)];
+        sum = sum + 0.375f  * b.at(src_y    , x, src.data, src.step);
-        sum = sum + 0.25f   * src.ptr(colBrd.idx(src_y + 1))[rowBrd.idx(x)];
+        sum = sum + 0.25f   * b.at(src_y + 1, x, src.data, src.step);
-        sum = sum + 0.0625f * src.ptr(colBrd.idx(src_y + 2))[rowBrd.idx(x)];
+        sum = sum + 0.0625f * b.at(src_y + 2, x, src.data, src.step);
        smem[2 + threadIdx.x] = sum;
@@ -1010,11 +1022,11 @@ namespace cv { namespace gpu { namespace imgproc
            sum = VecTraits<value_type>::all(0);
-            sum = sum + 0.0625f * src.ptr(colBrd.idx(src_y - 2))[rowBrd.idx(left_x)];
+            sum = sum + 0.0625f * b.at(src_y - 2, left_x, src.data, src.step);
-            sum = sum + 0.25f   * src.ptr(colBrd.idx(src_y - 1))[rowBrd.idx(left_x)];
+            sum = sum + 0.25f   * b.at(src_y - 1, left_x, src.data, src.step);
-            sum = sum + 0.375f  * src.ptr(colBrd.idx(src_y    ))[rowBrd.idx(left_x)];
+            sum = sum + 0.375f  * b.at(src_y    , left_x, src.data, src.step);
-            sum = sum + 0.25f   * src.ptr(colBrd.idx(src_y + 1))[rowBrd.idx(left_x)];
+            sum = sum + 0.25f   * b.at(src_y + 1, left_x, src.data, src.step);
-            sum = sum + 0.0625f * src.ptr(colBrd.idx(src_y + 2))[rowBrd.idx(left_x)];
+            sum = sum + 0.0625f * b.at(src_y + 2, left_x, src.data, src.step);
            smem[threadIdx.x] = sum;
        }
@@ -1025,11 +1037,11 @@ namespace cv { namespace gpu { namespace imgproc
            sum = VecTraits<value_type>::all(0);
-            sum = sum + 0.0625f * src.ptr(colBrd.idx(src_y - 2))[rowBrd.idx(right_x)];
+            sum = sum + 0.0625f * b.at(src_y - 2, right_x, src.data, src.step);
-            sum = sum + 0.25f   * src.ptr(colBrd.idx(src_y - 1))[rowBrd.idx(right_x)];
+            sum = sum + 0.25f   * b.at(src_y - 1, right_x, src.data, src.step);
-            sum = sum + 0.375f  * src.ptr(colBrd.idx(src_y    ))[rowBrd.idx(right_x)];
+            sum = sum + 0.375f  * b.at(src_y    , right_x, src.data, src.step);
-            sum = sum + 0.25f   * src.ptr(colBrd.idx(src_y + 1))[rowBrd.idx(right_x)];
+            sum = sum + 0.25f   * b.at(src_y + 1, right_x, src.data, src.step);
-            sum = sum + 0.0625f * src.ptr(colBrd.idx(src_y + 2))[rowBrd.idx(right_x)];
+            sum = sum + 0.0625f * b.at(src_y + 2, right_x, src.data, src.step);
            smem[4 + threadIdx.x] = sum;
        }
@@ -1055,58 +1067,68 @@ namespace cv { namespace gpu { namespace imgproc
        }
    }
-    template <typename T, int cn> void pyrDown_gpu(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream)
+    template <typename T, template <typename> class B> void pyrDown_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)
    {
        const dim3 block(256);
        const dim3 grid(divUp(src.cols, block.x), dst.rows);
-        BrdReflect101 rowBrd(src.cols);
+        B<T> b(src.rows, src.cols);
-        BrdReflect101 colBrd(src.rows);
-        pyrDown<typename TypeVec<T, cn>::vec_type><<<grid, block, 0, stream>>>(
+        pyrDown<T><<<grid, block, 0, stream>>>(src, dst, b, dst.cols);
-            static_cast< DevMem2D_<typename TypeVec<T, cn>::vec_type> >(src), 
-            static_cast< DevMem2D_<typename TypeVec<T, cn>::vec_type> >(dst), 
-            rowBrd, colBrd, dst.cols);
        cudaSafeCall( cudaGetLastError() );
        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
-    template void pyrDown_gpu<uchar, 1>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template <typename T, int cn> void pyrDown_gpu(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream)
-    template void pyrDown_gpu<uchar, 2>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    {
-    template void pyrDown_gpu<uchar, 3>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+        typedef typename TypeVec<T, cn>::vec_type type;
-    template void pyrDown_gpu<uchar, 4>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+        typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);
+        static const caller_t callers[] = 
+        {
+            pyrDown_caller<type, BrdReflect101>, pyrDown_caller<type, BrdReplicate>, pyrDown_caller<type, BrdConstant>
+        };
+        callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);
+    }
+    template void pyrDown_gpu<uchar, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
+    template void pyrDown_gpu<uchar, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
+    template void pyrDown_gpu<uchar, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
+    template void pyrDown_gpu<uchar, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
-    template void pyrDown_gpu<schar, 1>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrDown_gpu<schar, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
-    template void pyrDown_gpu<schar, 2>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrDown_gpu<schar, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
-    template void pyrDown_gpu<schar, 3>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrDown_gpu<schar, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
-    template void pyrDown_gpu<schar, 4>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrDown_gpu<schar, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
-    template void pyrDown_gpu<ushort, 1>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrDown_gpu<ushort, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
-    template void pyrDown_gpu<ushort, 2>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrDown_gpu<ushort, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
-    template void pyrDown_gpu<ushort, 3>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrDown_gpu<ushort, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
-    template void pyrDown_gpu<ushort, 4>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrDown_gpu<ushort, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
-    template void pyrDown_gpu<short, 1>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrDown_gpu<short, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
-    template void pyrDown_gpu<short, 2>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrDown_gpu<short, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
-    template void pyrDown_gpu<short, 3>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrDown_gpu<short, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
-    template void pyrDown_gpu<short, 4>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrDown_gpu<short, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
-    template void pyrDown_gpu<int, 1>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrDown_gpu<int, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
-    template void pyrDown_gpu<int, 2>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrDown_gpu<int, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
-    template void pyrDown_gpu<int, 3>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrDown_gpu<int, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
-    template void pyrDown_gpu<int, 4>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrDown_gpu<int, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
-    template void pyrDown_gpu<float, 1>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrDown_gpu<float, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
-    template void pyrDown_gpu<float, 2>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrDown_gpu<float, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
-    template void pyrDown_gpu<float, 3>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrDown_gpu<float, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
-    template void pyrDown_gpu<float, 4>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrDown_gpu<float, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    //////////////////////////////////////////////////////////////////////////
    // pyrUp
-    template <typename T> __global__ void pyrUp(const PtrStep_<T> src, DevMem2D_<T> dst, const BrdReflect101 rowBrd, const BrdReflect101 colBrd)
+    template <typename T, typename B> __global__ void pyrUp(const PtrStep_<T> src, DevMem2D_<T> dst, const B b)
    {
        typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type value_type;
@@ -1119,7 +1141,7 @@ namespace cv { namespace gpu { namespace imgproc
        value_type sum;
        if (threadIdx.x < 10 && threadIdx.y < 10)
-            smem1[threadIdx.y][threadIdx.x] = src.ptr(colBrd.idx(blockIdx.y * blockDim.y / 2 + threadIdx.y - 1))[rowBrd.idx(blockIdx.x * blockDim.x / 2 + threadIdx.x - 1)];
+            smem1[threadIdx.y][threadIdx.x] = b.at(blockIdx.y * blockDim.y / 2 + threadIdx.y - 1, blockIdx.x * blockDim.x / 2 + threadIdx.x - 1, src.data, src.step);
        __syncthreads();
@@ -1175,53 +1197,63 @@ namespace cv { namespace gpu { namespace imgproc
            dst.ptr(y)[x] = saturate_cast<T>(4.0f * sum);
    }
-    template <typename T, int cn> void pyrUp_gpu(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream)
+    template <typename T, template <typename> class B> void pyrUp_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, cudaStream_t stream)
    {
        const dim3 block(16, 16);
        const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
-        BrdReflect101 rowBrd(src.cols);
+        B<T> b(src.rows, src.cols);
-        BrdReflect101 colBrd(src.rows);
-        pyrUp<typename TypeVec<T, cn>::vec_type><<<grid, block, 0, stream>>>(
+        pyrUp<T><<<grid, block, 0, stream>>>(src, dst, b);
-            static_cast< DevMem2D_<typename TypeVec<T, cn>::vec_type> >(src), 
-            static_cast< DevMem2D_<typename TypeVec<T, cn>::vec_type> >(dst), 
-            rowBrd, colBrd);
        cudaSafeCall( cudaGetLastError() );
        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
-    template void pyrUp_gpu<uchar, 1>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template <typename T, int cn> void pyrUp_gpu(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream)
-    template void pyrUp_gpu<uchar, 2>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    {
-    template void pyrUp_gpu<uchar, 3>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+        typedef typename TypeVec<T, cn>::vec_type type;
-    template void pyrUp_gpu<uchar, 4>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+        typedef void (*caller_t)(const DevMem2D_<type>& src, const DevMem2D_<type>& dst, cudaStream_t stream);
-    template void pyrUp_gpu<schar, 1>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
-    template void pyrUp_gpu<schar, 2>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+        static const caller_t callers[] = 
-    template void pyrUp_gpu<schar, 3>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+        {
-    template void pyrUp_gpu<schar, 4>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+            pyrUp_caller<type, BrdReflect101>, pyrUp_caller<type, BrdReplicate>, pyrUp_caller<type, BrdConstant>
+        };
-    template void pyrUp_gpu<ushort, 1>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
-    template void pyrUp_gpu<ushort, 2>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+        callers[borderType](static_cast< DevMem2D_<type> >(src), static_cast< DevMem2D_<type> >(dst), stream);
-    template void pyrUp_gpu<ushort, 3>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    }
-    template void pyrUp_gpu<ushort, 4>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrUp_gpu<uchar, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
-    template void pyrUp_gpu<short, 1>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrUp_gpu<uchar, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
-    template void pyrUp_gpu<short, 2>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrUp_gpu<uchar, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
-    template void pyrUp_gpu<short, 3>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrUp_gpu<uchar, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
-    template void pyrUp_gpu<short, 4>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrUp_gpu<schar, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
-    template void pyrUp_gpu<int, 1>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrUp_gpu<schar, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
-    template void pyrUp_gpu<int, 2>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrUp_gpu<schar, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
-    template void pyrUp_gpu<int, 3>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrUp_gpu<schar, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
-    template void pyrUp_gpu<int, 4>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrUp_gpu<ushort, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
-    template void pyrUp_gpu<float, 1>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrUp_gpu<ushort, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
-    template void pyrUp_gpu<float, 2>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrUp_gpu<ushort, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
-    template void pyrUp_gpu<float, 3>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrUp_gpu<ushort, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
-    template void pyrUp_gpu<float, 4>(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template void pyrUp_gpu<short, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
+    template void pyrUp_gpu<short, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
+    template void pyrUp_gpu<short, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
+    template void pyrUp_gpu<short, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
+    template void pyrUp_gpu<int, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
+    template void pyrUp_gpu<int, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
+    template void pyrUp_gpu<int, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
+    template void pyrUp_gpu<int, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
+    template void pyrUp_gpu<float, 1>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
+    template void pyrUp_gpu<float, 2>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
+    template void pyrUp_gpu<float, 3>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
+    template void pyrUp_gpu<float, 4>(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    //////////////////////////////////////////////////////////////////////////
    // buildWarpMaps

--- a/modules/gpu/src/cuda/surf.cu
+++ b/modules/gpu/src/cuda/surf.cu
@@ -675,31 +675,29 @@ namespace cv { namespace gpu { namespace surf
        3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f
    };
-    __device__ __forceinline__ unsigned char calcWin(int i, int j, float centerX, float centerY, float win_offset, float cos_dir, float sin_dir)
+    struct WinReader
    {
-        float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
+        typedef uchar elem_type;
-        float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;
-        return tex2D(imgTex, pixel_x, pixel_y);
+        __device__ __forceinline__ WinReader(float centerX_, float centerY_, float win_offset_, float cos_dir_, float sin_dir_) : 
-    }
+            centerX(centerX_), centerY(centerY_), win_offset(win_offset_), cos_dir(cos_dir_), sin_dir(sin_dir_)
+        {
-    __device__ unsigned char calcPATCH(int i1, int j1, float centerX, float centerY, float win_offset, float cos_dir, float sin_dir, int win_size)
+        }
-    {
-        /* Scale the window to size PATCH_SZ so each pixel's size is s. This
-           makes calculating the gradients with wavelets of size 2s easy */
-        const float icoo = ((float)i1 / (PATCH_SZ + 1)) * win_size;
-        const float jcoo = ((float)j1 / (PATCH_SZ + 1)) * win_size;
-        const int i = __float2int_rd(icoo);
+        __device__ __forceinline__ uchar operator ()(int i, int j) const
-        const int j = __float2int_rd(jcoo);
+        {
+            float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
+            float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;
-        float res = calcWin(i, j, centerX, centerY, win_offset, cos_dir, sin_dir) * (i + 1 - icoo) * (j + 1 - jcoo);
+            return tex2D(imgTex, pixel_x, pixel_y);
-        res += calcWin(i + 1, j, centerX, centerY, win_offset, cos_dir, sin_dir) * (icoo - i) * (j + 1 - jcoo);
+        }
-        res += calcWin(i + 1, j + 1, centerX, centerY, win_offset, cos_dir, sin_dir) * (icoo - i) * (jcoo - j);
-        res += calcWin(i, j + 1, centerX, centerY, win_offset, cos_dir, sin_dir) * (i + 1 - icoo) * (jcoo - j);
-        return saturate_cast<unsigned char>(res);
+        float centerX; 
-    }  
+        float centerY;
+        float win_offset; 
+        float cos_dir; 
+        float sin_dir;
+    };
    __device__ void calc_dx_dy(float s_dx_bin[25], float s_dy_bin[25], 
        const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
@@ -732,7 +730,13 @@ namespace cv { namespace gpu { namespace surf
        const int xIndex = xBlock * 5 + threadIdx.x;
        const int yIndex = yBlock * 5 + threadIdx.y;
-        s_PATCH[threadIdx.y][threadIdx.x] = calcPATCH(yIndex, xIndex, centerX, centerY, win_offset, cos_dir, sin_dir, win_size);
+        const float icoo = ((float)yIndex / (PATCH_SZ + 1)) * win_size;
+        const float jcoo = ((float)xIndex / (PATCH_SZ + 1)) * win_size;
+        LinearFilter<WinReader> filter(WinReader(centerX, centerY, win_offset, cos_dir, sin_dir));
+        s_PATCH[threadIdx.y][threadIdx.x] = filter(icoo, jcoo);
        __syncthreads();
        if (threadIdx.x < 5 && threadIdx.y < 5)

--- a/modules/gpu/src/gpumat.cpp
+++ b/modules/gpu/src/gpumat.cpp
@@ -885,7 +885,7 @@ void cv::gpu::GpuMat::release()
    if( refcount && CV_XADD(refcount, -1) == 1 )
    {
        fastFree(refcount);
-        cudaSafeCall( cudaFree(datastart) );
+        cudaFree(datastart);
    }
    data = datastart = dataend = 0;
    step = rows = cols = 0;

--- a/modules/gpu/src/imgproc.cpp
+++ b/modules/gpu/src/imgproc.cpp
@@ -47,7 +47,7 @@ using namespace cv::gpu;
 #if !defined (HAVE_CUDA)
-void cv::gpu::remap(const GpuMat&, GpuMat&, const GpuMat&, const GpuMat&){ throw_nogpu(); }
+void cv::gpu::remap(const GpuMat&, GpuMat&, const GpuMat&, const GpuMat&, int, int, const Scalar&){ throw_nogpu(); }
 void cv::gpu::meanShiftFiltering(const GpuMat&, GpuMat&, int, int, TermCriteria) { throw_nogpu(); }
 void cv::gpu::meanShiftProc(const GpuMat&, GpuMat&, GpuMat&, int, int, TermCriteria) { throw_nogpu(); }
 void cv::gpu::drawColorDisp(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
@@ -92,8 +92,8 @@ void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool) { throw_nogp
 void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&) { throw_nogpu(); }
 void cv::gpu::downsample(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::upsample(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
-void cv::gpu::pyrDown(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
+void cv::gpu::pyrDown(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
-void cv::gpu::pyrUp(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
+void cv::gpu::pyrUp(const GpuMat&, GpuMat&, int, Stream&) { throw_nogpu(); }
 void cv::gpu::Canny(const GpuMat&, GpuMat&, double, double, int, bool) { throw_nogpu(); }
 void cv::gpu::Canny(const GpuMat&, CannyBuf&, GpuMat&, double, double, int, bool) { throw_nogpu(); }
 void cv::gpu::Canny(const GpuMat&, const GpuMat&, GpuMat&, double, double, bool) { throw_nogpu(); }
@@ -104,39 +104,52 @@ void cv::gpu::CannyBuf::release() { throw_nogpu(); }
 #else /* !defined (HAVE_CUDA) */
+////////////////////////////////////////////////////////////////////////
+// remap
 namespace cv { namespace gpu {  namespace imgproc
 {
-    void remap_gpu_1c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst);
+    template <typename T> void remap_gpu(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, 
-    void remap_gpu_3c(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst);
+                                         int interpolation, int borderMode, const double borderValue[4]);
+}}}
-    extern "C" void meanShiftFiltering_gpu(const DevMem2D& src, DevMem2D dst, int sp, int sr, int maxIter, float eps);
-    extern "C" void meanShiftProc_gpu(const DevMem2D& src, DevMem2D dstr, DevMem2D dstsp, int sp, int sr, int maxIter, float eps);
-    void drawColorDisp_gpu(const DevMem2D& src, const DevMem2D& dst, int ndisp, const cudaStream_t& stream);
+void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const GpuMat& ymap, int interpolation, int borderMode, const Scalar& borderValue)
-    void drawColorDisp_gpu(const DevMem2D_<short>& src, const DevMem2D& dst, int ndisp, const cudaStream_t& stream);
+{
+    using namespace cv::gpu::imgproc;
-    void reprojectImageTo3D_gpu(const DevMem2D& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream);
+    typedef void (*caller_t)(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, const DevMem2D& dst, int interpolation, int borderMode, const double borderValue[4]);;
-    void reprojectImageTo3D_gpu(const DevMem2D_<short>& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream);
+    static const caller_t callers[6][4] = 
-}}}
+    {
+        {remap_gpu<uchar>, remap_gpu<uchar2>, remap_gpu<uchar3>, remap_gpu<uchar4>},
+        {remap_gpu<schar>, remap_gpu<char2>, remap_gpu<char3>, remap_gpu<char4>},
+        {remap_gpu<ushort>, remap_gpu<ushort2>, remap_gpu<ushort3>, remap_gpu<ushort4>},
+        {remap_gpu<short>, remap_gpu<short2>, remap_gpu<short3>, remap_gpu<short4>},
+        {remap_gpu<int>, remap_gpu<int2>, remap_gpu<int3>, remap_gpu<int4>},
+        {remap_gpu<float>, remap_gpu<float2>, remap_gpu<float3>, remap_gpu<float4>}
+    };
-////////////////////////////////////////////////////////////////////////
+    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
-// remap
+    CV_Assert(xmap.type() == CV_32F && ymap.type() == CV_32F && xmap.size() == ymap.size());
-void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const GpuMat& ymap)
+    CV_Assert(interpolation == INTER_NEAREST || interpolation == INTER_LINEAR);
-{
-    typedef void (*remap_gpu_t)(const DevMem2D& src, const DevMem2Df& xmap, const DevMem2Df& ymap, DevMem2D dst);
-    static const remap_gpu_t callers[] = {imgproc::remap_gpu_1c, 0, imgproc::remap_gpu_3c};
-    CV_Assert((src.type() == CV_8U || src.type() == CV_8UC3) && xmap.type() == CV_32F && ymap.type() == CV_32F);
+    CV_Assert(borderMode == BORDER_REFLECT101 || borderMode == BORDER_REPLICATE || borderMode == BORDER_CONSTANT);
+    int gpuBorderType;
+    CV_Assert(tryConvertToGpuBorderType(borderMode, gpuBorderType));
    dst.create(xmap.size(), src.type());
-    callers[src.channels() - 1](src, xmap, ymap, dst);
+    callers[src.depth()][src.channels() - 1](src, xmap, ymap, dst, interpolation, gpuBorderType, borderValue.val);
 }
 ////////////////////////////////////////////////////////////////////////
 // meanShiftFiltering_GPU
+namespace cv { namespace gpu {  namespace imgproc
+{
+    extern "C" void meanShiftFiltering_gpu(const DevMem2D& src, DevMem2D dst, int sp, int sr, int maxIter, float eps);
+}}}
 void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr, TermCriteria criteria)
 {
    if( src.empty() )
@@ -163,6 +176,11 @@ void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr,
 ////////////////////////////////////////////////////////////////////////
 // meanShiftProc_GPU
+namespace cv { namespace gpu {  namespace imgproc
+{
+    extern "C" void meanShiftProc_gpu(const DevMem2D& src, DevMem2D dstr, DevMem2D dstsp, int sp, int sr, int maxIter, float eps);
+}}}
 void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int sp, int sr, TermCriteria criteria)
 {
    if( src.empty() )
@@ -190,6 +208,12 @@ void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int
 ////////////////////////////////////////////////////////////////////////
 // drawColorDisp
+namespace cv { namespace gpu {  namespace imgproc
+{
+    void drawColorDisp_gpu(const DevMem2D& src, const DevMem2D& dst, int ndisp, const cudaStream_t& stream);
+    void drawColorDisp_gpu(const DevMem2D_<short>& src, const DevMem2D& dst, int ndisp, const cudaStream_t& stream);
+}}}
 namespace
 {
    template <typename T>
@@ -215,6 +239,12 @@ void cv::gpu::drawColorDisp(const GpuMat& src, GpuMat& dst, int ndisp, Stream& s
 ////////////////////////////////////////////////////////////////////////
 // reprojectImageTo3D
+namespace cv { namespace gpu {  namespace imgproc
+{
+    void reprojectImageTo3D_gpu(const DevMem2D& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream);
+    void reprojectImageTo3D_gpu(const DevMem2D_<short>& disp, const DevMem2Df& xyzw, const float* q, const cudaStream_t& stream);
+}}}
 namespace
 {
    template <typename T>
@@ -1596,14 +1626,14 @@ void cv::gpu::upsample(const GpuMat& src, GpuMat& dst, Stream& stream)
 namespace cv { namespace gpu { namespace imgproc
 {
-    template <typename T, int cn> void pyrDown_gpu(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template <typename T, int cn> void pyrDown_gpu(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
 }}}
-void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, Stream& stream)
+void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, int borderType, Stream& stream)
 {
    using namespace cv::gpu::imgproc;
-    typedef void (*func_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    typedef void (*func_t)(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    static const func_t funcs[6][4] = 
    {
@@ -1617,9 +1647,13 @@ void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, Stream& stream)
    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
+    CV_Assert(borderType == BORDER_REFLECT101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT);
+    int gpuBorderType;
+    CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));
    dst.create((src.rows + 1) / 2, (src.cols + 1) / 2, src.type());
-    funcs[src.depth()][src.channels() - 1](src, dst, StreamAccessor::getStream(stream));
+    funcs[src.depth()][src.channels() - 1](src, dst, gpuBorderType, StreamAccessor::getStream(stream));
 }
@@ -1628,14 +1662,14 @@ void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, Stream& stream)
 namespace cv { namespace gpu { namespace imgproc
 {
-    template <typename T, int cn> void pyrUp_gpu(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    template <typename T, int cn> void pyrUp_gpu(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
 }}}
-void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst, Stream& stream)
+void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst, int borderType, Stream& stream)
 {
    using namespace cv::gpu::imgproc;
-    typedef void (*func_t)(const DevMem2D& src, const DevMem2D& dst, cudaStream_t stream);
+    typedef void (*func_t)(const DevMem2D& src, const DevMem2D& dst, int borderType, cudaStream_t stream);
    static const func_t funcs[6][4] = 
    {
@@ -1649,9 +1683,13 @@ void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst, Stream& stream)
    CV_Assert(src.depth() <= CV_32F && src.channels() <= 4);
+    CV_Assert(borderType == BORDER_REFLECT101 || borderType == BORDER_REPLICATE || borderType == BORDER_CONSTANT);
+    int gpuBorderType;
+    CV_Assert(tryConvertToGpuBorderType(borderType, gpuBorderType));
    dst.create(src.rows*2, src.cols*2, src.type());
-    funcs[src.depth()][src.channels() - 1](src, dst, StreamAccessor::getStream(stream));
+    funcs[src.depth()][src.channels() - 1](src, dst, gpuBorderType, StreamAccessor::getStream(stream));
 }

--- a/modules/gpu/src/opencv2/gpu/device/border_interpolate.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/border_interpolate.hpp
@@ -48,82 +48,131 @@
 namespace cv { namespace gpu { namespace device
 {
-    struct BrdReflect101 
+    //////////////////////////////////////////////////////////////
+    // BrdConstant
+    template <typename D> struct BrdRowConstant
    {
-        explicit __host__ __device__ __forceinline__ BrdReflect101(int len): last(len - 1) {}
+        typedef D result_type;
+        explicit __host__ __device__ __forceinline__ BrdRowConstant(int width_, const D& val_ = VecTraits<D>::all(0)) : width(width_), val(val_) {}
-        __device__ __forceinline__ int idx_low(int i) const
+        template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const 
        {
-            return abs(i);
+            return x >= 0 ? saturate_cast<D>(data[x]) : val;
        }
-        __device__ __forceinline__ int idx_high(int i) const 
+        template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const 
        {
-            return last - abs(last - i);
+            return x < width ? saturate_cast<D>(data[x]) : val;
        }
-        __device__ __forceinline__ int idx(int i) const
+        template <typename T> __device__ __forceinline__ D at(int x, const T* data) const 
        {
-            return idx_low(idx_high(i));
+            return (x >= 0 && x < width) ? saturate_cast<D>(data[x]) : val;
        }
        __host__ __device__ __forceinline__ bool is_range_safe(int mini, int maxi) const 
        {
-            return -last <= mini && maxi <= 2 * last;
+            return true;
        }
-        const int last;
+        const int width;
+        const D val;
    };
-    template <typename D> struct BrdRowReflect101 : BrdReflect101
+    template <typename D> struct BrdColConstant
    {
-        explicit __host__ __device__ __forceinline__ BrdRowReflect101(int len): BrdReflect101(len) {}
+        typedef D result_type;
-        template <typename T> __device__ __forceinline__ D at_low(int i, const T* data) const 
+        explicit __host__ __device__ __forceinline__ BrdColConstant(int height_, const D& val_ = VecTraits<D>::all(0)) : height(height_), val(val_) {}
+        template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const 
        {
-            return saturate_cast<D>(data[idx_low(i)]);
+            return y >= 0 ? saturate_cast<D>(*(const T*)((const char*)data + y * step)) : val;
        }
-        template <typename T> __device__ __forceinline__ D at_high(int i, const T* data) const 
+        template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const 
        {
-            return saturate_cast<D>(data[idx_high(i)]);
+            return y < height ? saturate_cast<D>(*(const T*)((const char*)data + y * step)) : val;
        }
+        template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const 
+        {
+            return (y >= 0 && y < height) ? saturate_cast<D>(*(const T*)((const char*)data + y * step)) : val;
+        }
+        __host__ __device__ __forceinline__ bool is_range_safe(int mini, int maxi) const 
+        {
+            return true;
+        }
+        const int height;
+        const D val;
    };
-    template <typename D> struct BrdColReflect101 : BrdReflect101
+    template <typename D> struct BrdConstant
    {
-        __host__ __device__ __forceinline__ BrdColReflect101(int len, size_t step): BrdReflect101(len), step(step) {}
+        typedef D result_type;
-        template <typename T> __device__ __forceinline__ D at_low(int i, const T* data) const 
+        __host__ __device__ __forceinline__ BrdConstant(int height_, int width_, const D& val_ = VecTraits<D>::all(0)) : 
+            height(height_), width(width_), val(val_) 
        {
-            return saturate_cast<D>(*(const D*)((const char*)data + idx_low(i)*step));
        }
-        template <typename T> __device__ __forceinline__ D at_high(int i, const T* data) const 
+        template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const
        {
-            return saturate_cast<D>(*(const D*)((const char*)data + idx_high(i)*step));
+            return (x >= 0 && x < width && y >= 0 && y < height) ? saturate_cast<D>(((const T*)((const uchar*)data + y * step))[x]) : val;
        }
-        const size_t step;
+        template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const
+        {
+            return (x >= 0 && x < width && y >= 0 && y < height) ? saturate_cast<D>(src(y, x)) : val;
+        }
+        const int height;
+        const int width;
+        const D val;
    };
-    struct BrdReplicate
+    //////////////////////////////////////////////////////////////
+    // BrdReplicate
+    template <typename D> struct BrdRowReplicate
    {
-        explicit __host__ __device__ __forceinline__ BrdReplicate(int len): last(len - 1) {}
+        typedef D result_type;
+        explicit __host__ __device__ __forceinline__ BrdRowReplicate(int width) : last_col(width - 1) {}
+        template <typename U> __host__ __device__ __forceinline__ BrdRowReplicate(int width, U) : last_col(width - 1) {}
+        __device__ __forceinline__ int idx_col_low(int x) const
+        {
+            return ::max(x, 0);
+        }
+        __device__ __forceinline__ int idx_col_high(int x) const 
+        {
+            return ::min(x, last_col);
+        }
+        __device__ __forceinline__ int idx_col(int x) const
+        {
+            return idx_col_low(idx_col_high(x));
+        }
-        __device__ __forceinline__ int idx_low(int i) const
+        template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const 
        {
-            return ::max(i, 0);
+            return saturate_cast<D>(data[idx_col_low(x)]);
        }
-        __device__ __forceinline__ int idx_high(int i) const 
+        template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const 
        {
-            return ::min(i, last);
+            return saturate_cast<D>(data[idx_col_high(x)]);
        }
-        __device__ __forceinline__ int idx(int i) const
+        template <typename T> __device__ __forceinline__ D at(int x, const T* data) const 
        {
-            return idx_low(idx_high(i));
+            return saturate_cast<D>(data[idx_col(x)]);
        }
        bool is_range_safe(int mini, int maxi) const 
@@ -131,103 +180,328 @@ namespace cv { namespace gpu { namespace device
            return true;
        }
-        const int last;
+        const int last_col;
    };
-    template <typename D> struct BrdRowReplicate : BrdReplicate
+    // Replicate (clamp-to-edge) border handling along the row axis: a row index is clamped
+    // before the element is read, and the value read is converted to D with saturate_cast.
+    template <typename D> struct BrdColReplicate
    {
-        explicit __host__ __device__ __forceinline__ BrdRowReplicate(int len): BrdReplicate(len) {}
+        typedef D result_type;
-        template <typename T> __device__ __forceinline__ D at_low(int i, const T* data) const 
+        // height: number of rows; only the last valid row index (height - 1) is stored.
+        explicit __host__ __device__ __forceinline__ BrdColReplicate(int height) : last_row(height - 1) {}
+        // Same as above; the second (unnamed) argument is ignored.
+        template <typename U> __host__ __device__ __forceinline__ BrdColReplicate(int height, U) : last_row(height - 1) {}
+        // Clamps y from below to 0; the upper bound is not checked.
+        __device__ __forceinline__ int idx_row_low(int y) const
        {
-            return saturate_cast<D>(data[idx_low(i)]);
+            return ::max(y, 0);
        }
-        template <typename T> __device__ __forceinline__ D at_high(int i, const T* data) const 
+        // Clamps y from above to last_row; the lower bound is not checked.
+        __device__ __forceinline__ int idx_row_high(int y) const 
        {
-            return saturate_cast<D>(data[idx_high(i)]);
+            return ::min(y, last_row);
+        }
+        // Clamps y on both sides: the result lies in [0, last_row] when last_row >= 0.
+        __device__ __forceinline__ int idx_row(int y) const
+        {
+            return idx_row_low(idx_row_high(y));
+        }
+        // Reads the T located idx_row_low(y) * step bytes after `data` and converts it to D.
+        template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const 
+        {
+            return saturate_cast<D>(*(const T*)((const char*)data + idx_row_low(y) * step));
+        }
+        // Reads the T located idx_row_high(y) * step bytes after `data` and converts it to D.
+        template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const 
+        {
+            return saturate_cast<D>(*(const T*)((const char*)data + idx_row_high(y) * step));
        }
-    };
+        // Reads the T located idx_row(y) * step bytes after `data` and converts it to D.
+        template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const 
+        {
+            return saturate_cast<D>(*(const T*)((const char*)data + idx_row(y) * step));
+        }
-    template <typename D> struct BrdColReplicate : BrdReplicate
+        // Always true: clamping yields a valid row for any requested index range.
+        // Qualified like the Reflect101 equivalents so it is callable from host and device code.
+        __host__ __device__ __forceinline__ bool is_range_safe(int mini, int maxi) const
+        {
+            return true;
+        }
+        const int last_row;
+    };
+    // Replicate (clamp-to-edge) border handling in both dimensions. Row and column indices
+    // are clamped independently; int and float index overloads are provided.
+    template <typename D> struct BrdReplicate
    {
-        __host__ __device__ __forceinline__ BrdColReplicate(int len, size_t step): BrdReplicate(len), step(step) {}
+        typedef D result_type;
+        // Stores the last valid row (height - 1) and column (width - 1) indices.
+        __host__ __device__ __forceinline__ BrdReplicate(int height, int width) : 
+            last_row(height - 1), last_col(width - 1) 
+        {
+        }
+        // Same as above; the third (unnamed) argument is ignored.
+        template <typename U> 
+        __host__ __device__ __forceinline__ BrdReplicate(int height, int width, U) : 
+            last_row(height - 1), last_col(width - 1) 
+        {
+        }
+        // Lower clamp of a row index to 0.
+        __device__ __forceinline__ int idx_row_low(int y) const
+        {
+            return ::max(y, 0);
+        }
+        __device__ __forceinline__ float idx_row_low(float y) const
+        {
+            return ::fmax(y, 0.0f);
+        }
+        // Upper clamp of a row index to last_row.
+        __device__ __forceinline__ int idx_row_high(int y) const 
+        {
+            return ::min(y, last_row);
+        }
+        __device__ __forceinline__ float idx_row_high(float y) const 
+        {
+            return ::fmin(y, last_row);
+        }
-        template <typename T> __device__ __forceinline__ D at_low(int i, const T* data) const 
+        // Two-sided clamp of a row index: upper clamp first, then lower clamp.
+        __device__ __forceinline__ int idx_row(int y) const
+        {
+            return idx_row_low(idx_row_high(y));
+        }
+        __device__ __forceinline__ float idx_row(float y) const
+        {
+            return idx_row_low(idx_row_high(y));
+        }
+        // Lower clamp of a column index to 0.
+        __device__ __forceinline__ int idx_col_low(int x) const
+        {
+            return ::max(x, 0);
+        }
+        __device__ __forceinline__ float idx_col_low(float x) const
+        {
+            // Float literal, matching idx_row_low(float), so both operands are float.
+            return ::fmax(x, 0.0f);
+        }
+        // Upper clamp of a column index to last_col.
+        __device__ __forceinline__ int idx_col_high(int x) const 
+        {
+            return ::min(x, last_col);
+        }
+        __device__ __forceinline__ float idx_col_high(float x) const 
+        {
+            return ::fmin(x, last_col);
+        }
+        // Two-sided clamp of a column index: upper clamp first, then lower clamp.
+        __device__ __forceinline__ int idx_col(int x) const
+        {
+            return idx_col_low(idx_col_high(x));
+        }
+        __device__ __forceinline__ float idx_col(float x) const
        {
-            return saturate_cast<D>(*(const D*)((const char*)data + idx_low(i)*step));
+            return idx_col_low(idx_col_high(x));
        }
-        template <typename T> __device__ __forceinline__ D at_high(int i, const T* data) const 
+        // Reads element idx_col(x) of the row that starts idx_row(y) * step bytes after `data`
+        // and converts it to D.
+        template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const 
        {
-            return saturate_cast<D>(*(const D*)((const char*)data + idx_high(i)*step));
+            return saturate_cast<D>(((const T*)((const char*)data + idx_row(y) * step))[idx_col(x)]);
        }
-        const size_t step;
+        // Reads src(row, col) with both coordinates clamped and converts the result to D.
+        // The idx_* overload used follows from the type of Ptr2D::index_type.
+        template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const 
+        {
+            return saturate_cast<D>(src(idx_row(y), idx_col(x)));
+        }
+        const int last_row;
+        const int last_col;
    };
-    template <typename D> struct BrdRowConstant
+    //////////////////////////////////////////////////////////////
+    // BrdReflect101
+    template <typename D> struct BrdRowReflect101 // mirrors out-of-range column indices about column 0 and column last_col; reads are converted to D
    {
-        explicit __host__ __device__ __forceinline__ BrdRowConstant(int len_, const D& val_ = VecTraits<D>::all(0)): len(len_), val(val_) {}
+        typedef D result_type; // type returned by at_low / at_high / at
+        explicit __host__ __device__ __forceinline__ BrdRowReflect101(int width) : last_col(width - 1) {} // stores the last valid column index
+        template <typename U> __host__ __device__ __forceinline__ BrdRowReflect101(int width, U) : last_col(width - 1) {} // second (unnamed) argument is ignored
-        template <typename T> __device__ __forceinline__ D at_low(int i, const T* data) const 
+        __device__ __forceinline__ int idx_col_low(int x) const // reflection about column 0: returns |x|
        {
-            return i >= 0 ? saturate_cast<D>(data[i]) : val;
+            return ::abs(x);
        }
-        template <typename T> __device__ __forceinline__ D at_high(int i, const T* data) const 
+        __device__ __forceinline__ int idx_col_high(int x) const // reflection about column last_col
        {
-            return i < len ? saturate_cast<D>(data[i]) : val;
+            return last_col - ::abs(last_col - x);
+        }
+        __device__ __forceinline__ int idx_col(int x) const // applies the high-side reflection first, then the low-side one
+        {
+            return idx_col_low(idx_col_high(x));
+        }
+        template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const // reads data[idx_col_low(x)] converted to D
+        {
+            return saturate_cast<D>(data[idx_col_low(x)]);
+        }
+        template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const // reads data[idx_col_high(x)] converted to D
+        {
+            return saturate_cast<D>(data[idx_col_high(x)]);
+        }
+        template <typename T> __device__ __forceinline__ D at(int x, const T* data) const // reads data[idx_col(x)] converted to D
+        {
+            return saturate_cast<D>(data[idx_col(x)]);
        }
        __host__ __device__ __forceinline__ bool is_range_safe(int mini, int maxi) const // true when [mini, maxi] lies within [-last_col, 2 * last_col]
        {
-            return true;
+            return -last_col <= mini && maxi <= 2 * last_col; // a single reflection maps this interval back into [0, last_col]
        }
-        const int len;
+        const int last_col; // width - 1
-        const D val;
    };
-    template <typename D> struct BrdColConstant
+    // Reflect-101 border handling along the row axis: an out-of-range row index is mirrored
+    // about row 0 or row last_row, and the value read is converted to D with saturate_cast.
+    template <typename D> struct BrdColReflect101
    {
-        __host__ __device__ __forceinline__ BrdColConstant(int len_, size_t step_, const D& val_ = VecTraits<D>::all(0)): len(len_), step(step_), val(val_) {}
+        typedef D result_type;
-        template <typename T> __device__ __forceinline__ D at_low(int i, const T* data) const 
+        // height: number of rows; only the last valid row index (height - 1) is stored.
+        explicit __host__ __device__ __forceinline__ BrdColReflect101(int height) : last_row(height - 1) {}
+        // Same as above; the second (unnamed) argument is ignored.
+        template <typename U> __host__ __device__ __forceinline__ BrdColReflect101(int height, U) : last_row(height - 1) {}
+        // Reflection about row 0: returns |y|.
+        __device__ __forceinline__ int idx_row_low(int y) const
+        {
+            return ::abs(y);
+        }
+        // Reflection about row last_row.
+        __device__ __forceinline__ int idx_row_high(int y) const 
        {
-            return i >= 0 ? saturate_cast<D>(*(const D*)((const char*)data + i*step)) : val;
+            return last_row - ::abs(last_row - y);
        }
-        template <typename T> __device__ __forceinline__ D at_high(int i, const T* data) const 
+        // Applies the high-side reflection first, then the low-side one.
+        __device__ __forceinline__ int idx_row(int y) const
        {
-            return i < len ? saturate_cast<D>(*(const D*)((const char*)data + i*step)) : val;
+            return idx_row_low(idx_row_high(y));
+        }
+        // The source is dereferenced as T (its real element type) and only then converted
+        // to D; reading it as D would fetch the wrong bytes whenever T and D differ.
+        template <typename T> __device__ __forceinline__ D at_low(int y, const T* data, size_t step) const 
+        {
+            return saturate_cast<D>(*(const T*)((const char*)data + idx_row_low(y) * step));
+        }
+        // Reads the T located idx_row_high(y) * step bytes after `data` and converts it to D.
+        template <typename T> __device__ __forceinline__ D at_high(int y, const T* data, size_t step) const 
+        {
+            return saturate_cast<D>(*(const T*)((const char*)data + idx_row_high(y) * step));
+        }
+        // Reads the T located idx_row(y) * step bytes after `data` and converts it to D.
+        template <typename T> __device__ __forceinline__ D at(int y, const T* data, size_t step) const 
+        {
+            return saturate_cast<D>(*(const T*)((const char*)data + idx_row(y) * step));
        }
        __host__ __device__ __forceinline__ bool is_range_safe(int mini, int maxi) const 
        {
-            return true;
+            return -last_row <= mini && maxi <= 2 * last_row;
        }
-        const int len;
+        const int last_row;
-        const size_t step;
-        const D val;
    };
-    template <typename OutT> struct BrdConstant
+    template <typename D> struct BrdReflect101 // 2D variant: mirrors row and column indices independently; reads are converted to D
    {
-        __host__ __device__ __forceinline__ BrdConstant(int w, int h, const OutT &val = VecTraits<OutT>::all(0)) : w(w), h(h), val(val) {}
+        typedef D result_type; // type returned by both at() overloads
+        __host__ __device__ __forceinline__ BrdReflect101(int height, int width) : // stores the last valid row and column indices
+            last_row(height - 1), last_col(width - 1) 
+        {
+        }
+        template <typename U> 
+        __host__ __device__ __forceinline__ BrdReflect101(int height, int width, U) : // third (unnamed) argument is ignored
+            last_row(height - 1), last_col(width - 1) 
+        {
+        }
+        __device__ __forceinline__ int idx_row_low(int y) const // reflection about row 0: returns |y|
+        {
+            return ::abs(y);
+        }
+        __device__ __forceinline__ float idx_row_low(float y) const // float-index counterpart of the int overload
+        {
+            return ::fabs(y);
+        }
+        __device__ __forceinline__ int idx_row_high(int y) const // reflection about row last_row
+        {
+            return last_row - ::abs(last_row - y);
+        }
+        __device__ __forceinline__ float idx_row_high(float y) const // float-index counterpart of the int overload
+        {
+            return last_row - ::fabs(last_row - y);
+        }
+        __device__ __forceinline__ int idx_row(int y) const // high-side reflection first, then low-side
+        {
+            return idx_row_low(idx_row_high(y));
+        }
+        __device__ __forceinline__ float idx_row(float y) const // float-index counterpart of the int overload
+        {
+            return idx_row_low(idx_row_high(y));
+        }
+        __device__ __forceinline__ int idx_col_low(int x) const // reflection about column 0: returns |x|
+        {
+            return ::abs(x);
+        }
+        __device__ __forceinline__ float idx_col_low(float x) const // float-index counterpart of the int overload
+        {
+            return ::fabs(x);
+        }
+        __device__ __forceinline__ int idx_col_high(int x) const // reflection about column last_col
+        {
+            return last_col - ::abs(last_col - x);
+        }
+        __device__ __forceinline__ float idx_col_high(float x) const // float-index counterpart of the int overload
+        {
+            return last_col - ::fabs(last_col - x);
+        }
+        __device__ __forceinline__ int idx_col(int x) const // high-side reflection first, then low-side
+        {
+            return idx_col_low(idx_col_high(x));
+        }
+        __device__ __forceinline__ float idx_col(float x) const // float-index counterpart of the int overload
+        {
+            return idx_col_low(idx_col_high(x));
+        }
+        template <typename T> __device__ __forceinline__ D at(int y, int x, const T* data, size_t step) const // raw-pointer read; `step` is applied as a byte offset per row
+        {
+            return saturate_cast<D>(((const T*)((const char*)data + idx_row(y) * step))[idx_col(x)]); // element idx_col(x) of the row starting idx_row(y) * step bytes after data
+        }
+        template <typename Ptr2D> __device__ __forceinline__ D at(typename Ptr2D::index_type y, typename Ptr2D::index_type x, const Ptr2D& src) const // read through a callable source; the idx_* overload follows from Ptr2D::index_type
+        {
+            return saturate_cast<D>(src(idx_row(y), idx_col(x)));
+        }
+        const int last_row; // height - 1
+        const int last_col; // width - 1
+    };
+    //////////////////////////////////////////////////////////////
+    // BorderReader
+    template <typename Ptr2D, typename B> struct BorderReader // pairs a 2D source with a border policy B and exposes them as one callable
+    {
+        typedef typename B::result_type elem_type; // element type is whatever the border policy returns
+        typedef typename Ptr2D::index_type index_type; // coordinates use the wrapped source's index type
+        __host__ __device__ __forceinline__ BorderReader(const Ptr2D& ptr_, const B& b_) : ptr(ptr_), b(b_) {} // copies both the source and the policy
-        __device__ __forceinline__ OutT at(int x, int y, const uchar* data, int step) const
+        __device__ __forceinline__ elem_type operator ()(index_type y, index_type x) const // (row, column) order; forwards to the policy's at()
        {
-            if (x >= 0 && x <= w - 1 && y >= 0 && y <= h - 1)
+            return b.at(y, x, ptr);
-                return ((const OutT*)(data + y * step))[x];
-            return val;
        }
-        const int w;
+        const Ptr2D ptr; // stored by value
-        const int h;
+        const B b; // stored by value
-        OutT val;
    };
 }}}

--- a/modules/gpu/src/opencv2/gpu/device/utility.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/utility.hpp
@@ -309,7 +309,6 @@ namespace cv {  namespace gpu { namespace device
        U vec1Vals[MAX_LEN / THREAD_DIM];
    };
    ///////////////////////////////////////////////////////////////////////////////
    // Solve linear system
@@ -364,6 +363,60 @@ namespace cv {  namespace gpu { namespace device
        return false;
    }
+    ///////////////////////////////////////////////////////////////////////////////
+    // Filters    
+    // Sampler that accepts float coordinates and returns the source element at the
+    // coordinates rounded with __float2int_rn.
+    template <typename Ptr2D> struct PointFilter
+    {
+        typedef typename Ptr2D::elem_type elem_type;
+        typedef float index_type;
+        // Keeps a copy of the source accessor.
+        explicit __host__ __device__ __forceinline__ PointFilter(const Ptr2D& src_)
+            : src(src_)
+        {
+        }
+        // (y, x) are given in row, column order.
+        __device__ __forceinline__ elem_type operator ()(float y, float x) const
+        {
+            const int row = __float2int_rn(y);
+            const int col = __float2int_rn(x);
+            return src(row, col);
+        }
+        const Ptr2D src;
+    };
+    // Sampler that accepts float coordinates and blends the four source elements
+    // surrounding (y, x) with bilinear weights; accumulation is done in a float vector type.
+    template <typename Ptr2D> struct LinearFilter
+    {
+        typedef typename Ptr2D::elem_type elem_type;
+        typedef float index_type;
+        // Keeps a copy of the source accessor.
+        explicit __host__ __device__ __forceinline__ LinearFilter(const Ptr2D& src_)
+            : src(src_)
+        {
+        }
+        // (y, x) are given in row, column order.
+        __device__ __forceinline__ elem_type operator ()(float y, float x) const
+        {
+            typedef typename TypeVec<float, VecTraits<elem_type>::cn>::vec_type work_type;
+            // Integer corners of the cell containing (y, x); rounding is towards -infinity.
+            const int x1 = __float2int_rd(x);
+            const int y1 = __float2int_rd(y);
+            const int x2 = x1 + 1;
+            const int y2 = y1 + 1;
+            // Per-axis weights: the closer the sample is to a corner, the larger its weight.
+            const float wx1 = x2 - x;
+            const float wx2 = x - x1;
+            const float wy1 = y2 - y;
+            const float wy2 = y - y1;
+            // The four neighbouring samples, first index = row, second = column.
+            elem_type p11 = src(y1, x1);
+            elem_type p12 = src(y1, x2);
+            elem_type p21 = src(y2, x1);
+            elem_type p22 = src(y2, x2);
+            // Accumulate in the same order as the samples were listed.
+            work_type acc = VecTraits<work_type>::all(0);
+            acc = acc + p11 * (wx1 * wy1);
+            acc = acc + p12 * (wx2 * wy1);
+            acc = acc + p21 * (wx1 * wy2);
+            acc = acc + p22 * (wx2 * wy2);
+            return saturate_cast<elem_type>(acc);
+        }
+        const Ptr2D src;
+    };
 }}}
 #endif // __OPENCV_GPU_UTILITY_HPP__
--- a/modules/gpu/src/opencv2/gpu/device/vec_traits.hpp
+++ b/modules/gpu/src/opencv2/gpu/device/vec_traits.hpp
@@ -166,6 +166,7 @@ namespace cv { namespace gpu { namespace device
        enum {cn=1}; \
        static __device__ __host__ __forceinline__ type all(type v) {return v;} \
        static __device__ __host__ __forceinline__ type make(type x) {return x;} \
+        static __device__ __host__ __forceinline__ type make(const type* v) {return *v;} \
    }; \
    template<> struct VecTraits<type ## 1> \
    { \
@@ -173,6 +174,7 @@ namespace cv { namespace gpu { namespace device
        enum {cn=1}; \
        static __device__ __host__ __forceinline__ type ## 1 all(type v) {return make_ ## type ## 1(v);} \
        static __device__ __host__ __forceinline__ type ## 1 make(type x) {return make_ ## type ## 1(x);} \
+        static __device__ __host__ __forceinline__ type ## 1 make(const type* v) {return make_ ## type ## 1(*v);} \
    }; \
    template<> struct VecTraits<type ## 2> \
    { \
@@ -180,6 +182,7 @@ namespace cv { namespace gpu { namespace device
        enum {cn=2}; \
        static __device__ __host__ __forceinline__ type ## 2 all(type v) {return make_ ## type ## 2(v, v);} \
        static __device__ __host__ __forceinline__ type ## 2 make(type x, type y) {return make_ ## type ## 2(x, y);} \
+        static __device__ __host__ __forceinline__ type ## 2 make(const type* v) {return make_ ## type ## 2(v[0], v[1]);} \
    }; \
    template<> struct VecTraits<type ## 3> \
    { \
@@ -187,6 +190,7 @@ namespace cv { namespace gpu { namespace device
        enum {cn=3}; \
        static __device__ __host__ __forceinline__ type ## 3 all(type v) {return make_ ## type ## 3(v, v, v);} \
        static __device__ __host__ __forceinline__ type ## 3 make(type x, type y, type z) {return make_ ## type ## 3(x, y, z);} \
+        static __device__ __host__ __forceinline__ type ## 3 make(const type* v) {return make_ ## type ## 3(v[0], v[1], v[2]);} \
    }; \
    template<> struct VecTraits<type ## 4> \
    { \
@@ -194,6 +198,7 @@ namespace cv { namespace gpu { namespace device
        enum {cn=4}; \
        static __device__ __host__ __forceinline__ type ## 4 all(type v) {return make_ ## type ## 4(v, v, v, v);} \
        static __device__ __host__ __forceinline__ type ## 4 make(type x, type y, type z, type w) {return make_ ## type ## 4(x, y, z, w);} \
+        static __device__ __host__ __forceinline__ type ## 4 make(const type* v) {return make_ ## type ## 4(v[0], v[1], v[2], v[3]);} \
    }; \
    template<> struct VecTraits<type ## 8> \
    { \
@@ -201,10 +206,10 @@ namespace cv { namespace gpu { namespace device
        enum {cn=8}; \
        static __device__ __host__ __forceinline__ type ## 8 all(type v) {return make_ ## type ## 8(v, v, v, v, v, v, v, v);} \
        static __device__ __host__ __forceinline__ type ## 8 make(type a0, type a1, type a2, type a3, type a4, type a5, type a6, type a7) {return make_ ## type ## 8(a0, a1, a2, a3, a4, a5, a6, a7);} \
+        static __device__ __host__ __forceinline__ type ## 8 make(const type* v) {return make_ ## type ## 8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);} \
    };
    OPENCV_GPU_IMPLEMENT_VEC_TRAITS(uchar)
-    OPENCV_GPU_IMPLEMENT_VEC_TRAITS(char)
    OPENCV_GPU_IMPLEMENT_VEC_TRAITS(ushort)
    OPENCV_GPU_IMPLEMENT_VEC_TRAITS(short)
    OPENCV_GPU_IMPLEMENT_VEC_TRAITS(int)
@@ -214,12 +219,61 @@ namespace cv { namespace gpu { namespace device
 #undef OPENCV_GPU_IMPLEMENT_VEC_TRAITS
+    // Traits for plain char treated as a one-component "vector".
+    template<> struct VecTraits<char>
+    {
+        typedef char elem_type;
+        enum {cn=1};
+        // A scalar filled with `value` is the value itself.
+        static __device__ __host__ __forceinline__ char all(char value) {return value;}
+        // Builds the scalar from its single component.
+        static __device__ __host__ __forceinline__ char make(char c0) {return c0;}
+        // Builds the scalar from the first element of `src`.
+        static __device__ __host__ __forceinline__ char make(const char* src) {return src[0];}
+    };
    template<> struct VecTraits<schar> // traits for schar treated as a one-component "vector"
    { 
        typedef schar elem_type; 
        enum {cn=1}; // component count
        static __device__ __host__ __forceinline__ schar all(schar v) {return v;} // value with every component set to v
        static __device__ __host__ __forceinline__ schar make(schar x) {return x;} // build from the single component
+        static __device__ __host__ __forceinline__ schar make(const schar* x) {return *x;} // build from the element x points to
+    };
+    // Traits for the one-component vector type char1; components are handled as schar.
+    template<> struct VecTraits<char1>
+    {
+        typedef schar elem_type;
+        enum {cn=1};
+        // Builds a char1 with its component set to `value`.
+        static __device__ __host__ __forceinline__ char1 all(schar value) {return make_char1(value);}
+        // Builds a char1 from its single component.
+        static __device__ __host__ __forceinline__ char1 make(schar c0) {return make_char1(c0);}
+        // Builds a char1 from the first element of `src`.
+        static __device__ __host__ __forceinline__ char1 make(const schar* src) {return make_char1(src[0]);}
+    };
+    // Traits for the two-component vector type char2; components are handled as schar.
+    template<> struct VecTraits<char2>
+    {
+        typedef schar elem_type;
+        enum {cn=2};
+        // Builds a char2 with both components set to `value`.
+        static __device__ __host__ __forceinline__ char2 all(schar value) {return make_char2(value, value);}
+        // Builds a char2 from its components, in order.
+        static __device__ __host__ __forceinline__ char2 make(schar c0, schar c1) {return make_char2(c0, c1);}
+        // Builds a char2 from the first two elements of `src`.
+        static __device__ __host__ __forceinline__ char2 make(const schar* src) {return make_char2(src[0], src[1]);}
+    };
+    // Traits for the three-component vector type char3; components are handled as schar.
+    template<> struct VecTraits<char3>
+    {
+        typedef schar elem_type;
+        enum {cn=3};
+        // Builds a char3 with all three components set to `value`.
+        static __device__ __host__ __forceinline__ char3 all(schar value) {return make_char3(value, value, value);}
+        // Builds a char3 from its components, in order.
+        static __device__ __host__ __forceinline__ char3 make(schar c0, schar c1, schar c2) {return make_char3(c0, c1, c2);}
+        // Builds a char3 from the first three elements of `src`.
+        static __device__ __host__ __forceinline__ char3 make(const schar* src) {return make_char3(src[0], src[1], src[2]);}
+    };
+    // Traits for the four-component vector type char4; components are handled as schar.
+    template<> struct VecTraits<char4>
+    {
+        typedef schar elem_type;
+        enum {cn=4};
+        // Builds a char4 with all four components set to `value`.
+        static __device__ __host__ __forceinline__ char4 all(schar value) {return make_char4(value, value, value, value);}
+        // Builds a char4 from its components, in order.
+        static __device__ __host__ __forceinline__ char4 make(schar c0, schar c1, schar c2, schar c3) {return make_char4(c0, c1, c2, c3);}
+        // Builds a char4 from the first four elements of `src`.
+        static __device__ __host__ __forceinline__ char4 make(const schar* src) {return make_char4(src[0], src[1], src[2], src[3]);}
+    };
+    // Traits for the eight-component vector type char8; components are handled as schar.
+    template<> struct VecTraits<char8>
+    {
+        typedef schar elem_type;
+        enum {cn=8};
+        // Builds a char8 with all eight components set to `value`.
+        static __device__ __host__ __forceinline__ char8 all(schar value) {return make_char8(value, value, value, value, value, value, value, value);}
+        // Builds a char8 from its components, in order.
+        static __device__ __host__ __forceinline__ char8 make(schar c0, schar c1, schar c2, schar c3, schar c4, schar c5, schar c6, schar c7) {return make_char8(c0, c1, c2, c3, c4, c5, c6, c7);}
+        // Builds a char8 from the first eight elements of `src`.
+        static __device__ __host__ __forceinline__ char8 make(const schar* src) {return make_char8(src[0], src[1], src[2], src[3], src[4], src[5], src[6], src[7]);}
    };
 }}}

--- a/modules/gpu/test/test_imgproc.cpp
+++ b/modules/gpu/test/test_imgproc.cpp
@@ -181,15 +181,18 @@ INSTANTIATE_TEST_CASE_P(ImgProc, Resize, testing::Combine(
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // remap
-struct Remap : testing::TestWithParam< std::tr1::tuple<cv::gpu::DeviceInfo, int> >
+struct Remap : testing::TestWithParam< std::tr1::tuple<cv::gpu::DeviceInfo, int, int, int> >
 {
    cv::gpu::DeviceInfo devInfo;
    int type;
+    int interpolation;
+    int borderType;
    cv::Size size;
    cv::Mat src;
    cv::Mat xmap;
    cv::Mat ymap;
+    cv::Scalar borderValue;
    cv::Mat dst_gold;
@@ -197,43 +200,83 @@ struct Remap : testing::TestWithParam< std::tr1::tuple<cv::gpu::DeviceInfo, int>
    {
        devInfo = std::tr1::get<0>(GetParam());
        type = std::tr1::get<1>(GetParam());
+        interpolation = std::tr1::get<2>(GetParam());
+        borderType = std::tr1::get<3>(GetParam());
        cv::gpu::setDevice(devInfo.deviceID());
        cv::RNG& rng = cvtest::TS::ptr()->get_rng();
-        size = cv::Size(rng.uniform(20, 150), rng.uniform(20, 150));
+        size = cv::Size(rng.uniform(100, 200), rng.uniform(100, 200));
-        src = cvtest::randomMat(rng, size, type, 0.0, 127.0, false);
+        src = cvtest::randomMat(rng, size, type, 0.0, 256.0, false);
-        xmap = cvtest::randomMat(rng, size, CV_32FC1, 0.0, src.cols - 1, false);
-        ymap = cvtest::randomMat(rng, size, CV_32FC1, 0.0, src.rows - 1, false);
+        xmap.create(size, CV_32FC1);
+        ymap.create(size, CV_32FC1);
+        for (int y = 0; y < src.rows; ++y)
+        {
+            float* xmap_row = xmap.ptr<float>(y);
+            float* ymap_row = ymap.ptr<float>(y);
+            for (int x = 0; x < src.cols; ++x)
+            {
+                xmap_row[x] = src.cols - 1 - x;
+                ymap_row[x] = src.rows - 1 - y;
+            }
+        }
+        borderValue[0] = rng.uniform(0.0, 256.0);
+        borderValue[1] = rng.uniform(0.0, 256.0);
+        borderValue[2] = rng.uniform(0.0, 256.0);
+        borderValue[3] = rng.uniform(0.0, 256.0);
-        cv::remap(src, dst_gold, xmap, ymap, cv::INTER_LINEAR, cv::BORDER_WRAP);
+        cv::remap(src, dst_gold, xmap, ymap, interpolation, borderType, borderValue);
    }
 };
 TEST_P(Remap, Accuracy) // runs cv::gpu::remap with the fixture's parameters and compares the result with dst_gold
 {
+    static const char* interpolations_str[] = {"INTER_NEAREST", "INTER_LINEAR", "INTER_CUBIC"}; // indexed directly by the interpolation value; assumes the enum values match this order - TODO confirm
+    static const char* borderTypes_str[] = {"BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", "BORDER_REFLECT_101"}; // indexed directly by the border type value; same assumption
+    const char* interpolationStr = interpolations_str[interpolation]; // no bounds check: relies on the instantiated parameter set
+    const char* borderTypeStr = borderTypes_str[borderType];
    PRINT_PARAM(devInfo);
    PRINT_TYPE(type);
+    PRINT_PARAM(interpolationStr);
+    PRINT_PARAM(borderTypeStr);
    PRINT_PARAM(size);
+    PRINT_PARAM(borderValue);
    cv::Mat dst;
    ASSERT_NO_THROW(
        cv::gpu::GpuMat gpuRes;
-        cv::gpu::remap(cv::gpu::GpuMat(src), gpuRes, cv::gpu::GpuMat(xmap), cv::gpu::GpuMat(ymap));
+        cv::gpu::remap(cv::gpu::GpuMat(src), gpuRes, cv::gpu::GpuMat(xmap), cv::gpu::GpuMat(ymap), interpolation, borderType, borderValue); // same interpolation / border arguments as the cv::remap call that produced dst_gold
        gpuRes.download(dst); // copy the device result into the host matrix dst
    );
-    EXPECT_MAT_SIMILAR(dst_gold, dst, 0.5);
+    EXPECT_MAT_NEAR(dst_gold, dst, 1e-5); // compares against the gold result with a 1e-5 threshold
 }
-INSTANTIATE_TEST_CASE_P(ImgProc, Remap, testing::Combine(
+INSTANTIATE_TEST_CASE_P // instantiates Remap over device x type x interpolation x border type
-                        testing::ValuesIn(devices()), 
+(
-                        testing::Values(CV_8UC1, CV_8UC3)));
+    ImgProc, Remap, testing::Combine
+    (
+        testing::ValuesIn(devices()), 
+        testing::Values // matrix types: 8-bit unsigned and 32-bit float, with 1, 3 and 4 channels
+        (
+            CV_8UC1, CV_8UC3, CV_8UC4,
+            CV_32FC1, CV_32FC3, CV_32FC4
+        ),
+        testing::Values(cv::INTER_NEAREST, cv::INTER_LINEAR), // interpolation modes under test
+        testing::Values(cv::BORDER_REFLECT101, cv::BORDER_REPLICATE, cv::BORDER_CONSTANT) // border modes under test
+    )
+);
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 // copyMakeBorder

--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@@ -79,9 +79,9 @@ TEST(remap)
    Mat src, dst, xmap, ymap;
    gpu::GpuMat d_src, d_dst, d_xmap, d_ymap;
-    for (int size = 1000; size <= 8000; size *= 2)
+    for (int size = 1000; size <= 4000; size *= 2)
    {
-        SUBTEST << "src " << size << " and 8U, 32F maps";
+        SUBTEST << "src " << size << ", 8UC1";
        gen(src, size, size, CV_8UC1, 0, 256);
@@ -101,7 +101,112 @@ TEST(remap)
        dst.create(xmap.size(), src.type());
        CPU_ON;
-        remap(src, dst, xmap, ymap, INTER_LINEAR);
+        remap(src, dst, xmap, ymap, INTER_LINEAR, BORDER_REPLICATE);
+        CPU_OFF;
+        d_src = src;
+        d_xmap = xmap;
+        d_ymap = ymap;
+        d_dst.create(d_xmap.size(), d_src.type());
+        GPU_ON;
+        gpu::remap(d_src, d_dst, d_xmap, d_ymap, INTER_LINEAR, BORDER_REPLICATE);
+        GPU_OFF;
+    }
+    for (int size = 1000; size <= 4000; size *= 2)
+    {
+        SUBTEST << "src " << size << ", 8UC3";
+        gen(src, size, size, CV_8UC3, 0, 256);
+        xmap.create(size, size, CV_32F);
+        ymap.create(size, size, CV_32F);
+        for (int i = 0; i < size; ++i)
+        {
+            float* xmap_row = xmap.ptr<float>(i);
+            float* ymap_row = ymap.ptr<float>(i);
+            for (int j = 0; j < size; ++j)
+            {
+                xmap_row[j] = (j - size * 0.5f) * 0.75f + size * 0.5f;
+                ymap_row[j] = (i - size * 0.5f) * 0.75f + size * 0.5f;
+            }
+        }
+        dst.create(xmap.size(), src.type());
+        CPU_ON;
+        remap(src, dst, xmap, ymap, INTER_LINEAR, BORDER_REPLICATE);
+        CPU_OFF;
+        d_src = src;
+        d_xmap = xmap;
+        d_ymap = ymap;
+        d_dst.create(d_xmap.size(), d_src.type());
+        GPU_ON;
+        gpu::remap(d_src, d_dst, d_xmap, d_ymap, INTER_LINEAR, BORDER_REPLICATE);
+        GPU_OFF;
+    }
+    for (int size = 1000; size <= 4000; size *= 2)
+    {
+        SUBTEST << "src " << size << ", 8UC4";
+        gen(src, size, size, CV_8UC4, 0, 256);
+        xmap.create(size, size, CV_32F);
+        ymap.create(size, size, CV_32F);
+        for (int i = 0; i < size; ++i)
+        {
+            float* xmap_row = xmap.ptr<float>(i);
+            float* ymap_row = ymap.ptr<float>(i);
+            for (int j = 0; j < size; ++j)
+            {
+                xmap_row[j] = (j - size * 0.5f) * 0.75f + size * 0.5f;
+                ymap_row[j] = (i - size * 0.5f) * 0.75f + size * 0.5f;
+            }
+        }
+        dst.create(xmap.size(), src.type());
+        CPU_ON;
+        remap(src, dst, xmap, ymap, INTER_LINEAR, BORDER_REPLICATE);
+        CPU_OFF;
+        d_src = src;
+        d_xmap = xmap;
+        d_ymap = ymap;
+        d_dst.create(d_xmap.size(), d_src.type());
+        GPU_ON;
+        gpu::remap(d_src, d_dst, d_xmap, d_ymap, INTER_LINEAR, BORDER_REPLICATE);
+        GPU_OFF;
+    }
+    for (int size = 1000; size <= 4000; size *= 2)
+    {
+        SUBTEST << "src " << size << ", 16SC3";
+        gen(src, size, size, CV_16SC3, 0, 256);
+        xmap.create(size, size, CV_32F);
+        ymap.create(size, size, CV_32F);
+        for (int i = 0; i < size; ++i)
+        {
+            float* xmap_row = xmap.ptr<float>(i);
+            float* ymap_row = ymap.ptr<float>(i);
+            for (int j = 0; j < size; ++j)
+            {
+                xmap_row[j] = (j - size * 0.5f) * 0.75f + size * 0.5f;
+                ymap_row[j] = (i - size * 0.5f) * 0.75f + size * 0.5f;
+            }
+        }
+        dst.create(xmap.size(), src.type());
+        CPU_ON;
+        remap(src, dst, xmap, ymap, INTER_LINEAR, BORDER_REPLICATE);
        CPU_OFF;
        d_src = src;
@@ -110,7 +215,7 @@ TEST(remap)
        d_dst.create(d_xmap.size(), d_src.type());
        GPU_ON;
-        gpu::remap(d_src, d_dst, d_xmap, d_ymap);
+        gpu::remap(d_src, d_dst, d_xmap, d_ymap, INTER_LINEAR, BORDER_REPLICATE);
        GPU_OFF;
    }
 }