Commit 47d68f69 authored by Vladislav Vinogradov

implemented gpu::remap for all types

parent 78542854
@@ -66,6 +66,9 @@ namespace cv
template <typename T> struct DevMem2D_
{
typedef T elem_type;
typedef int index_type;
int cols;
int rows;
T* data;
@@ -80,7 +83,6 @@ namespace cv
explicit DevMem2D_(const DevMem2D_<U>& d)
: cols(d.cols), rows(d.rows), data((T*)d.data), step(d.step) {}
typedef T elem_type;
enum { elem_size = sizeof(elem_type) };
__CV_GPU_HOST_DEVICE__ size_t elemSize() const { return elem_size; }
@@ -89,6 +91,9 @@ namespace cv
__CV_GPU_HOST_DEVICE__ operator T*() const { return data; }
__CV_GPU_HOST_DEVICE__ T& operator ()(int y, int x) { return ptr(y)[x]; }
__CV_GPU_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; }
#if defined(__DEVCLASES_ADD_THRUST_BEGIN_END__)
thrust::device_ptr<T> begin() const { return thrust::device_ptr<T>(data); }
thrust::device_ptr<T> end() const { return thrust::device_ptr<T>(data) + cols * rows; }
@@ -97,19 +102,24 @@ namespace cv
template<typename T> struct PtrStep_
{
typedef T elem_type;
typedef int index_type;
T* data;
size_t step;
PtrStep_() : data(0), step(0) {}
PtrStep_(const DevMem2D_<T>& mem) : data(mem.data), step(mem.step) {}
typedef T elem_type;
enum { elem_size = sizeof(elem_type) };
__CV_GPU_HOST_DEVICE__ size_t elemSize() const { return elem_size; }
__CV_GPU_HOST_DEVICE__ T* ptr(int y = 0) { return (T*)( (char*)data + y * step); }
__CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return (const T*)( (const char*)data + y * step); }
__CV_GPU_HOST_DEVICE__ T& operator ()(int y, int x) { return ptr(y)[x]; }
__CV_GPU_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; }
#if defined(__DEVCLASES_ADD_THRUST_BEGIN_END__)
thrust::device_ptr<T> begin() const { return thrust::device_ptr<T>(data); }
#endif
@@ -125,6 +135,9 @@ namespace cv
}
__CV_GPU_HOST_DEVICE__ T* ptr(int y = 0) { return PtrStep_<T>::data + y * PtrStep_<T>::step; }
__CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return PtrStep_<T>::data + y * PtrStep_<T>::step; }
__CV_GPU_HOST_DEVICE__ T& operator ()(int y, int x) { return ptr(y)[x]; }
__CV_GPU_HOST_DEVICE__ const T& operator ()(int y, int x) const { return ptr(y)[x]; }
};
typedef DevMem2D_<unsigned char> DevMem2D;
......
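The elem_type and index_type typedefs added above give DevMem2D_ and PtrStep_ a uniform 2-D accessor interface; the generic PointFilter and LinearFilter introduced later in this commit are templated over exactly that contract. A minimal device-side sketch of the contract (the helper readAt is hypothetical, for illustration only):

// Any Ptr2D exposing elem_type, index_type and operator()(y, x) satisfies
// the filter templates; readAt is a hypothetical illustration.
template <typename Ptr2D>
__device__ typename Ptr2D::elem_type readAt(const Ptr2D& src, int y, int x)
{
    return src(y, x); // resolves to ptr(y)[x] with row-step arithmetic
}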
@@ -596,8 +596,9 @@ namespace cv
////////////////////////////// Image processing //////////////////////////////
//! DST[x,y] = SRC[xmap[x,y],ymap[x,y]] with bilinear interpolation.
//! supports CV_8UC1, CV_8UC3 source types and CV_32FC1 map type
CV_EXPORTS void remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const GpuMat& ymap);
//! supports CV_32FC1 map type
CV_EXPORTS void remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const GpuMat& ymap,
int interpolation, int borderMode = BORDER_CONSTANT, const Scalar& borderValue = Scalar());
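A hedged host-side usage sketch of this extended signature (given a host image cv::Mat src; the flip maps are illustrative, mirroring those built in the accuracy test below):

// Sketch: horizontal flip through gpu::remap with the new border arguments.
cv::Mat xmap(src.size(), CV_32FC1), ymap(src.size(), CV_32FC1);
for (int y = 0; y < src.rows; ++y)
    for (int x = 0; x < src.cols; ++x)
    {
        xmap.at<float>(y, x) = (float)(src.cols - 1 - x); // mirror columns
        ymap.at<float>(y, x) = (float)y;                  // keep rows
    }
cv::gpu::GpuMat d_src(src), d_dst;
cv::gpu::remap(d_src, d_dst, cv::gpu::GpuMat(xmap), cv::gpu::GpuMat(ymap),
               cv::INTER_LINEAR, cv::BORDER_REPLICATE, cv::Scalar());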
//! Does mean shift filtering on GPU.
CV_EXPORTS void meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr,
@@ -761,10 +762,10 @@ namespace cv
CV_EXPORTS void upsample(const GpuMat& src, GpuMat &dst, Stream& stream = Stream::Null());
//! smoothes the source image and downsamples it
CV_EXPORTS void pyrDown(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
CV_EXPORTS void pyrDown(const GpuMat& src, GpuMat& dst, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null());
//! upsamples the source image and then smoothes it
CV_EXPORTS void pyrUp(const GpuMat& src, GpuMat& dst, Stream& stream = Stream::Null());
CV_EXPORTS void pyrUp(const GpuMat& src, GpuMat& dst, int borderType = BORDER_DEFAULT, Stream& stream = Stream::Null());
//! performs linear blending of two images
//! to avoid accuracy errors the sum of weights shouldn't be very close to zero
......
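The added borderType parameter is the only interface change to the pyramid functions above; a brief usage sketch (given a host image cv::Mat src):

cv::gpu::GpuMat d_src(src), d_down, d_up;
cv::gpu::pyrDown(d_src, d_down, cv::BORDER_DEFAULT); // smooth, then halve
cv::gpu::pyrUp(d_down, d_up, cv::BORDER_DEFAULT);    // double, then smooth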
@@ -242,9 +242,9 @@ namespace filter_krnls
{
const T* srcCol = src.ptr() + x;
sDataColumn[ threadIdx.y * BLOCK_DIM_X] = b.at_low(y - BLOCK_DIM_Y, srcCol);
sDataColumn[(threadIdx.y + BLOCK_DIM_Y) * BLOCK_DIM_X] = b.at_high(y, srcCol);
sDataColumn[(threadIdx.y + BLOCK_DIM_Y * 2) * BLOCK_DIM_X] = b.at_high(y + BLOCK_DIM_Y, srcCol);
sDataColumn[ threadIdx.y * BLOCK_DIM_X] = b.at_low(y - BLOCK_DIM_Y, srcCol, src.step);
sDataColumn[(threadIdx.y + BLOCK_DIM_Y) * BLOCK_DIM_X] = b.at_high(y, srcCol, src.step);
sDataColumn[(threadIdx.y + BLOCK_DIM_Y * 2) * BLOCK_DIM_X] = b.at_high(y + BLOCK_DIM_Y, srcCol, src.step);
__syncthreads();
@@ -273,7 +273,7 @@ namespace cv { namespace gpu { namespace filters
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y));
B<T> b(src.rows, src.step);
B<T> b(src.rows);
if (!b.is_range_safe(-BLOCK_DIM_Y, (grid.y + 1) * BLOCK_DIM_Y - 1))
{
......
This diff is collapsed.
@@ -675,32 +675,30 @@ namespace cv { namespace gpu { namespace surf
3.695352233989979e-006f, 8.444558261544444e-006f, 1.760426494001877e-005f, 3.34794785885606e-005f, 5.808438800158911e-005f, 9.193058212986216e-005f, 0.0001327334757661447f, 0.0001748319627949968f, 0.0002100782439811155f, 0.0002302826324012131f, 0.0002302826324012131f, 0.0002100782439811155f, 0.0001748319627949968f, 0.0001327334757661447f, 9.193058212986216e-005f, 5.808438800158911e-005f, 3.34794785885606e-005f, 1.760426494001877e-005f, 8.444558261544444e-006f, 3.695352233989979e-006f
};
__device__ __forceinline__ unsigned char calcWin(int i, int j, float centerX, float centerY, float win_offset, float cos_dir, float sin_dir)
struct WinReader
{
float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;
typedef uchar elem_type;
return tex2D(imgTex, pixel_x, pixel_y);
__device__ __forceinline__ WinReader(float centerX_, float centerY_, float win_offset_, float cos_dir_, float sin_dir_) :
centerX(centerX_), centerY(centerY_), win_offset(win_offset_), cos_dir(cos_dir_), sin_dir(sin_dir_)
{
}
__device__ unsigned char calcPATCH(int i1, int j1, float centerX, float centerY, float win_offset, float cos_dir, float sin_dir, int win_size)
__device__ __forceinline__ uchar operator ()(int i, int j) const
{
/* Scale the window to size PATCH_SZ so each pixel's size is s. This
makes calculating the gradients with wavelets of size 2s easy */
const float icoo = ((float)i1 / (PATCH_SZ + 1)) * win_size;
const float jcoo = ((float)j1 / (PATCH_SZ + 1)) * win_size;
const int i = __float2int_rd(icoo);
const int j = __float2int_rd(jcoo);
float res = calcWin(i, j, centerX, centerY, win_offset, cos_dir, sin_dir) * (i + 1 - icoo) * (j + 1 - jcoo);
res += calcWin(i + 1, j, centerX, centerY, win_offset, cos_dir, sin_dir) * (icoo - i) * (j + 1 - jcoo);
res += calcWin(i + 1, j + 1, centerX, centerY, win_offset, cos_dir, sin_dir) * (icoo - i) * (jcoo - j);
res += calcWin(i, j + 1, centerX, centerY, win_offset, cos_dir, sin_dir) * (i + 1 - icoo) * (jcoo - j);
float pixel_x = centerX + (win_offset + j) * cos_dir + (win_offset + i) * sin_dir;
float pixel_y = centerY - (win_offset + j) * sin_dir + (win_offset + i) * cos_dir;
return saturate_cast<unsigned char>(res);
return tex2D(imgTex, pixel_x, pixel_y);
}
float centerX;
float centerY;
float win_offset;
float cos_dir;
float sin_dir;
};
__device__ void calc_dx_dy(float s_dx_bin[25], float s_dy_bin[25],
const float* featureX, const float* featureY, const float* featureSize, const float* featureDir)
{
@@ -732,7 +730,13 @@ namespace cv { namespace gpu { namespace surf
const int xIndex = xBlock * 5 + threadIdx.x;
const int yIndex = yBlock * 5 + threadIdx.y;
s_PATCH[threadIdx.y][threadIdx.x] = calcPATCH(yIndex, xIndex, centerX, centerY, win_offset, cos_dir, sin_dir, win_size);
const float icoo = ((float)yIndex / (PATCH_SZ + 1)) * win_size;
const float jcoo = ((float)xIndex / (PATCH_SZ + 1)) * win_size;
LinearFilter<WinReader> filter(WinReader(centerX, centerY, win_offset, cos_dir, sin_dir));
s_PATCH[threadIdx.y][threadIdx.x] = filter(icoo, jcoo);
__syncthreads();
if (threadIdx.x < 5 && threadIdx.y < 5)
......
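Net effect of the SURF kernel hunks above: the hand-coded bilinear blend in calcPATCH is replaced by the generic LinearFilter over a WinReader source, so the descriptor now shares the interpolation code path introduced for remap. Condensed, inside the kernel where centerX, centerY, win_offset, cos_dir, sin_dir, icoo and jcoo are in scope:

// WinReader maps patch coordinates to rotated texture coordinates;
// LinearFilter supplies the bilinear weighting calcPATCH used to hand-code.
LinearFilter<WinReader> filter(WinReader(centerX, centerY, win_offset, cos_dir, sin_dir));
uchar sample = filter(icoo, jcoo); // blends four tex2D fetches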
@@ -885,7 +885,7 @@ void cv::gpu::GpuMat::release()
if( refcount && CV_XADD(refcount, -1) == 1 )
{
fastFree(refcount);
cudaSafeCall( cudaFree(datastart) );
cudaFree(datastart);
}
data = datastart = dataend = 0;
step = rows = cols = 0;
......
This diff is collapsed.
@@ -310,7 +310,6 @@ namespace cv { namespace gpu { namespace device
U vec1Vals[MAX_LEN / THREAD_DIM];
};
///////////////////////////////////////////////////////////////////////////////
// Solve linear system
@@ -364,6 +363,60 @@ namespace cv { namespace gpu { namespace device
return false;
}
///////////////////////////////////////////////////////////////////////////////
// Filters
template <typename Ptr2D> struct PointFilter
{
typedef typename Ptr2D::elem_type elem_type;
typedef float index_type;
explicit __host__ __device__ __forceinline__ PointFilter(const Ptr2D& src_) : src(src_) {}
__device__ __forceinline__ elem_type operator ()(float y, float x) const
{
return src(__float2int_rn(y), __float2int_rn(x));
}
const Ptr2D src;
};
template <typename Ptr2D> struct LinearFilter
{
typedef typename Ptr2D::elem_type elem_type;
typedef float index_type;
explicit __host__ __device__ __forceinline__ LinearFilter(const Ptr2D& src_) : src(src_) {}
__device__ __forceinline__ elem_type operator ()(float y, float x) const
{
typedef typename TypeVec<float, VecTraits<elem_type>::cn>::vec_type work_type;
work_type out = VecTraits<work_type>::all(0);
const int x1 = __float2int_rd(x);
const int y1 = __float2int_rd(y);
const int x2 = x1 + 1;
const int y2 = y1 + 1;
elem_type src_reg = src(y1, x1);
out = out + src_reg * ((x2 - x) * (y2 - y));
src_reg = src(y1, x2);
out = out + src_reg * ((x - x1) * (y2 - y));
src_reg = src(y2, x1);
out = out + src_reg * ((x2 - x) * (y - y1));
src_reg = src(y2, x2);
out = out + src_reg * ((x - x1) * (y - y1));
return saturate_cast<elem_type>(out);
}
const Ptr2D src;
};
}}}
#endif // __OPENCV_GPU_UTILITY_HPP__
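LinearFilter above applies the standard bilinear weights, which sum to one because (x2 - x) + (x - x1) = 1 and likewise in y. A host-side sketch of the same arithmetic for one float channel (hypothetical helper, assuming all four taps are in-bounds as the callers' border wrappers guarantee):

#include <cmath>

float bilinearSample(const float* img, int width, float y, float x)
{
    const int x1 = (int)std::floor(x), y1 = (int)std::floor(y);
    const int x2 = x1 + 1, y2 = y1 + 1;
    return img[y1 * width + x1] * (x2 - x) * (y2 - y)   // top-left
         + img[y1 * width + x2] * (x - x1) * (y2 - y)   // top-right
         + img[y2 * width + x1] * (x2 - x) * (y - y1)   // bottom-left
         + img[y2 * width + x2] * (x - x1) * (y - y1);  // bottom-right
}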
@@ -166,6 +166,7 @@ namespace cv { namespace gpu { namespace device
enum {cn=1}; \
static __device__ __host__ __forceinline__ type all(type v) {return v;} \
static __device__ __host__ __forceinline__ type make(type x) {return x;} \
static __device__ __host__ __forceinline__ type make(const type* v) {return *v;} \
}; \
template<> struct VecTraits<type ## 1> \
{ \
@@ -173,6 +174,7 @@ namespace cv { namespace gpu { namespace device
enum {cn=1}; \
static __device__ __host__ __forceinline__ type ## 1 all(type v) {return make_ ## type ## 1(v);} \
static __device__ __host__ __forceinline__ type ## 1 make(type x) {return make_ ## type ## 1(x);} \
static __device__ __host__ __forceinline__ type ## 1 make(const type* v) {return make_ ## type ## 1(*v);} \
}; \
template<> struct VecTraits<type ## 2> \
{ \
@@ -180,6 +182,7 @@ namespace cv { namespace gpu { namespace device
enum {cn=2}; \
static __device__ __host__ __forceinline__ type ## 2 all(type v) {return make_ ## type ## 2(v, v);} \
static __device__ __host__ __forceinline__ type ## 2 make(type x, type y) {return make_ ## type ## 2(x, y);} \
static __device__ __host__ __forceinline__ type ## 2 make(const type* v) {return make_ ## type ## 2(v[0], v[1]);} \
}; \
template<> struct VecTraits<type ## 3> \
{ \
@@ -187,6 +190,7 @@ namespace cv { namespace gpu { namespace device
enum {cn=3}; \
static __device__ __host__ __forceinline__ type ## 3 all(type v) {return make_ ## type ## 3(v, v, v);} \
static __device__ __host__ __forceinline__ type ## 3 make(type x, type y, type z) {return make_ ## type ## 3(x, y, z);} \
static __device__ __host__ __forceinline__ type ## 3 make(const type* v) {return make_ ## type ## 3(v[0], v[1], v[2]);} \
}; \
template<> struct VecTraits<type ## 4> \
{ \
@@ -194,6 +198,7 @@ namespace cv { namespace gpu { namespace device
enum {cn=4}; \
static __device__ __host__ __forceinline__ type ## 4 all(type v) {return make_ ## type ## 4(v, v, v, v);} \
static __device__ __host__ __forceinline__ type ## 4 make(type x, type y, type z, type w) {return make_ ## type ## 4(x, y, z, w);} \
static __device__ __host__ __forceinline__ type ## 4 make(const type* v) {return make_ ## type ## 4(v[0], v[1], v[2], v[3]);} \
}; \
template<> struct VecTraits<type ## 8> \
{ \
@@ -201,10 +206,10 @@ namespace cv { namespace gpu { namespace device
enum {cn=8}; \
static __device__ __host__ __forceinline__ type ## 8 all(type v) {return make_ ## type ## 8(v, v, v, v, v, v, v, v);} \
static __device__ __host__ __forceinline__ type ## 8 make(type a0, type a1, type a2, type a3, type a4, type a5, type a6, type a7) {return make_ ## type ## 8(a0, a1, a2, a3, a4, a5, a6, a7);} \
static __device__ __host__ __forceinline__ type ## 8 make(const type* v) {return make_ ## type ## 8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);} \
};
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(uchar)
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(char)
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(ushort)
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(short)
OPENCV_GPU_IMPLEMENT_VEC_TRAITS(int)
@@ -214,12 +219,61 @@ namespace cv { namespace gpu { namespace device
#undef OPENCV_GPU_IMPLEMENT_VEC_TRAITS
template<> struct VecTraits<char>
{
typedef char elem_type;
enum {cn=1};
static __device__ __host__ __forceinline__ char all(char v) {return v;}
static __device__ __host__ __forceinline__ char make(char x) {return x;}
static __device__ __host__ __forceinline__ char make(const char* x) {return *x;}
};
template<> struct VecTraits<schar>
{
typedef schar elem_type;
enum {cn=1};
static __device__ __host__ __forceinline__ schar all(schar v) {return v;}
static __device__ __host__ __forceinline__ schar make(schar x) {return x;}
static __device__ __host__ __forceinline__ schar make(const schar* x) {return *x;}
};
template<> struct VecTraits<char1>
{
typedef schar elem_type;
enum {cn=1};
static __device__ __host__ __forceinline__ char1 all(schar v) {return make_char1(v);}
static __device__ __host__ __forceinline__ char1 make(schar x) {return make_char1(x);}
static __device__ __host__ __forceinline__ char1 make(const schar* v) {return make_char1(v[0]);}
};
template<> struct VecTraits<char2>
{
typedef schar elem_type;
enum {cn=2};
static __device__ __host__ __forceinline__ char2 all(schar v) {return make_char2(v, v);}
static __device__ __host__ __forceinline__ char2 make(schar x, schar y) {return make_char2(x, y);}
static __device__ __host__ __forceinline__ char2 make(const schar* v) {return make_char2(v[0], v[1]);}
};
template<> struct VecTraits<char3>
{
typedef schar elem_type;
enum {cn=3};
static __device__ __host__ __forceinline__ char3 all(schar v) {return make_char3(v, v, v);}
static __device__ __host__ __forceinline__ char3 make(schar x, schar y, schar z) {return make_char3(x, y, z);}
static __device__ __host__ __forceinline__ char3 make(const schar* v) {return make_char3(v[0], v[1], v[2]);}
};
template<> struct VecTraits<char4>
{
typedef schar elem_type;
enum {cn=4};
static __device__ __host__ __forceinline__ char4 all(schar v) {return make_char4(v, v, v, v);}
static __device__ __host__ __forceinline__ char4 make(schar x, schar y, schar z, schar w) {return make_char4(x, y, z, w);}
static __device__ __host__ __forceinline__ char4 make(const schar* v) {return make_char4(v[0], v[1], v[2], v[3]);}
};
template<> struct VecTraits<char8>
{
typedef schar elem_type;
enum {cn=8};
static __device__ __host__ __forceinline__ char8 all(schar v) {return make_char8(v, v, v, v, v, v, v, v);}
static __device__ __host__ __forceinline__ char8 make(schar a0, schar a1, schar a2, schar a3, schar a4, schar a5, schar a6, schar a7) {return make_char8(a0, a1, a2, a3, a4, a5, a6, a7);}
static __device__ __host__ __forceinline__ char8 make(const schar* v) {return make_char8(v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);}
};
}}}
......
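The make(const type*) overloads added throughout the VecTraits specializations above load a cn-element vector from consecutive memory, presumably so border and filter code can read multi-channel pixels through raw element pointers. A hedged example:

const uchar px[3] = {10, 20, 30};
// Pointer overload added by this commit: reads cn consecutive elements.
uchar3 v = cv::gpu::device::VecTraits<uchar3>::make(px); // {10, 20, 30}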
@@ -181,15 +181,18 @@ INSTANTIATE_TEST_CASE_P(ImgProc, Resize, testing::Combine(
///////////////////////////////////////////////////////////////////////////////////////////////////////
// remap
struct Remap : testing::TestWithParam< std::tr1::tuple<cv::gpu::DeviceInfo, int> >
struct Remap : testing::TestWithParam< std::tr1::tuple<cv::gpu::DeviceInfo, int, int, int> >
{
cv::gpu::DeviceInfo devInfo;
int type;
int interpolation;
int borderType;
cv::Size size;
cv::Mat src;
cv::Mat xmap;
cv::Mat ymap;
cv::Scalar borderValue;
cv::Mat dst_gold;
@@ -197,43 +200,83 @@ struct Remap : testing::TestWithParam< std::tr1::tuple<cv::gpu::DeviceInfo, int>
{
devInfo = std::tr1::get<0>(GetParam());
type = std::tr1::get<1>(GetParam());
interpolation = std::tr1::get<2>(GetParam());
borderType = std::tr1::get<3>(GetParam());
cv::gpu::setDevice(devInfo.deviceID());
cv::RNG& rng = cvtest::TS::ptr()->get_rng();
size = cv::Size(rng.uniform(20, 150), rng.uniform(20, 150));
size = cv::Size(rng.uniform(100, 200), rng.uniform(100, 200));
src = cvtest::randomMat(rng, size, type, 0.0, 127.0, false);
xmap = cvtest::randomMat(rng, size, CV_32FC1, 0.0, src.cols - 1, false);
ymap = cvtest::randomMat(rng, size, CV_32FC1, 0.0, src.rows - 1, false);
src = cvtest::randomMat(rng, size, type, 0.0, 256.0, false);
xmap.create(size, CV_32FC1);
ymap.create(size, CV_32FC1);
for (int y = 0; y < src.rows; ++y)
{
float* xmap_row = xmap.ptr<float>(y);
float* ymap_row = ymap.ptr<float>(y);
for (int x = 0; x < src.cols; ++x)
{
xmap_row[x] = src.cols - 1 - x;
ymap_row[x] = src.rows - 1 - y;
}
}
borderValue[0] = rng.uniform(0.0, 256.0);
borderValue[1] = rng.uniform(0.0, 256.0);
borderValue[2] = rng.uniform(0.0, 256.0);
borderValue[3] = rng.uniform(0.0, 256.0);
cv::remap(src, dst_gold, xmap, ymap, cv::INTER_LINEAR, cv::BORDER_WRAP);
cv::remap(src, dst_gold, xmap, ymap, interpolation, borderType, borderValue);
}
};
TEST_P(Remap, Accuracy)
{
static const char* interpolations_str[] = {"INTER_NEAREST", "INTER_LINEAR", "INTER_CUBIC"};
static const char* borderTypes_str[] = {"BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", "BORDER_REFLECT_101"};
const char* interpolationStr = interpolations_str[interpolation];
const char* borderTypeStr = borderTypes_str[borderType];
PRINT_PARAM(devInfo);
PRINT_TYPE(type);
PRINT_PARAM(interpolationStr);
PRINT_PARAM(borderTypeStr);
PRINT_PARAM(size);
PRINT_PARAM(borderValue);
cv::Mat dst;
ASSERT_NO_THROW(
cv::gpu::GpuMat gpuRes;
cv::gpu::remap(cv::gpu::GpuMat(src), gpuRes, cv::gpu::GpuMat(xmap), cv::gpu::GpuMat(ymap));
cv::gpu::remap(cv::gpu::GpuMat(src), gpuRes, cv::gpu::GpuMat(xmap), cv::gpu::GpuMat(ymap), interpolation, borderType, borderValue);
gpuRes.download(dst);
);
EXPECT_MAT_SIMILAR(dst_gold, dst, 0.5);
EXPECT_MAT_NEAR(dst_gold, dst, 1e-5);
}
INSTANTIATE_TEST_CASE_P(ImgProc, Remap, testing::Combine(
INSTANTIATE_TEST_CASE_P
(
ImgProc, Remap, testing::Combine
(
testing::ValuesIn(devices()),
testing::Values(CV_8UC1, CV_8UC3)));
testing::Values
(
CV_8UC1, CV_8UC3, CV_8UC4,
CV_32FC1, CV_32FC3, CV_32FC4
),
testing::Values(cv::INTER_NEAREST, cv::INTER_LINEAR),
testing::Values(cv::BORDER_REFLECT101, cv::BORDER_REPLICATE, cv::BORDER_CONSTANT)
)
);
///////////////////////////////////////////////////////////////////////////////////////////////////////
// copyMakeBorder
......
@@ -79,9 +79,9 @@ TEST(remap)
Mat src, dst, xmap, ymap;
gpu::GpuMat d_src, d_dst, d_xmap, d_ymap;
for (int size = 1000; size <= 8000; size *= 2)
for (int size = 1000; size <= 4000; size *= 2)
{
SUBTEST << "src " << size << " and 8U, 32F maps";
SUBTEST << "src " << size << ", 8UC1";
gen(src, size, size, CV_8UC1, 0, 256);
@@ -101,7 +101,112 @@ TEST(remap)
dst.create(xmap.size(), src.type());
CPU_ON;
remap(src, dst, xmap, ymap, INTER_LINEAR);
remap(src, dst, xmap, ymap, INTER_LINEAR, BORDER_REPLICATE);
CPU_OFF;
d_src = src;
d_xmap = xmap;
d_ymap = ymap;
d_dst.create(d_xmap.size(), d_src.type());
GPU_ON;
gpu::remap(d_src, d_dst, d_xmap, d_ymap, INTER_LINEAR, BORDER_REPLICATE);
GPU_OFF;
}
for (int size = 1000; size <= 4000; size *= 2)
{
SUBTEST << "src " << size << ", 8UC3";
gen(src, size, size, CV_8UC3, 0, 256);
xmap.create(size, size, CV_32F);
ymap.create(size, size, CV_32F);
for (int i = 0; i < size; ++i)
{
float* xmap_row = xmap.ptr<float>(i);
float* ymap_row = ymap.ptr<float>(i);
for (int j = 0; j < size; ++j)
{
xmap_row[j] = (j - size * 0.5f) * 0.75f + size * 0.5f;
ymap_row[j] = (i - size * 0.5f) * 0.75f + size * 0.5f;
}
}
dst.create(xmap.size(), src.type());
CPU_ON;
remap(src, dst, xmap, ymap, INTER_LINEAR, BORDER_REPLICATE);
CPU_OFF;
d_src = src;
d_xmap = xmap;
d_ymap = ymap;
d_dst.create(d_xmap.size(), d_src.type());
GPU_ON;
gpu::remap(d_src, d_dst, d_xmap, d_ymap, INTER_LINEAR, BORDER_REPLICATE);
GPU_OFF;
}
for (int size = 1000; size <= 4000; size *= 2)
{
SUBTEST << "src " << size << ", 8UC4";
gen(src, size, size, CV_8UC4, 0, 256);
xmap.create(size, size, CV_32F);
ymap.create(size, size, CV_32F);
for (int i = 0; i < size; ++i)
{
float* xmap_row = xmap.ptr<float>(i);
float* ymap_row = ymap.ptr<float>(i);
for (int j = 0; j < size; ++j)
{
xmap_row[j] = (j - size * 0.5f) * 0.75f + size * 0.5f;
ymap_row[j] = (i - size * 0.5f) * 0.75f + size * 0.5f;
}
}
dst.create(xmap.size(), src.type());
CPU_ON;
remap(src, dst, xmap, ymap, INTER_LINEAR, BORDER_REPLICATE);
CPU_OFF;
d_src = src;
d_xmap = xmap;
d_ymap = ymap;
d_dst.create(d_xmap.size(), d_src.type());
GPU_ON;
gpu::remap(d_src, d_dst, d_xmap, d_ymap, INTER_LINEAR, BORDER_REPLICATE);
GPU_OFF;
}
for (int size = 1000; size <= 4000; size *= 2)
{
SUBTEST << "src " << size << ", 16SC3";
gen(src, size, size, CV_16SC3, 0, 256);
xmap.create(size, size, CV_32F);
ymap.create(size, size, CV_32F);
for (int i = 0; i < size; ++i)
{
float* xmap_row = xmap.ptr<float>(i);
float* ymap_row = ymap.ptr<float>(i);
for (int j = 0; j < size; ++j)
{
xmap_row[j] = (j - size * 0.5f) * 0.75f + size * 0.5f;
ymap_row[j] = (i - size * 0.5f) * 0.75f + size * 0.5f;
}
}
dst.create(xmap.size(), src.type());
CPU_ON;
remap(src, dst, xmap, ymap, INTER_LINEAR, BORDER_REPLICATE);
CPU_OFF;
d_src = src;
@@ -110,7 +215,7 @@ TEST(remap)
d_dst.create(d_xmap.size(), d_src.type());
GPU_ON;
gpu::remap(d_src, d_dst, d_xmap, d_ymap);
gpu::remap(d_src, d_dst, d_xmap, d_ymap, INTER_LINEAR, BORDER_REPLICATE);
GPU_OFF;
}
}
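The map loops above all compute xmap(i, j) = 0.75 * (j - size/2) + size/2, and likewise in y, i.e. a 0.75x zoom about the image centre, so every sample stays in-bounds and the subtests differ only in source type. A quick arithmetic check for size = 1000:

// j = 0   maps to 0.75 * (0 - 500)   + 500 = 125
// j = 999 maps to 0.75 * (999 - 500) + 500 = 874.25, inside [0, 999]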
......