Commit 70204a8e authored by Anatoly Baksheev

Removed PtrElemStep, marked DevMem2D as deprecated; PtrStepSz should be used instead

parent 9be63c50
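
The rename is mechanical: DevMem2D_<T> becomes PtrStepSz<T> (pointer, step, and size), PtrStep_<T> becomes PtrStep<T> (pointer and step only), and PtrElemStep_ is removed outright. A minimal before/after sketch of what this means for downstream launcher declarations (the function name is illustrative, not taken from this diff):

    // before this commit
    void myFilter_gpu(cv::gpu::DevMem2Db src, cv::gpu::DevMem2Db dst, cudaStream_t stream);

    // after this commit: same memory layout, new name
    void myFilter_gpu(cv::gpu::PtrStepSzb src, cv::gpu::PtrStepSzb dst, cudaStream_t stream);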
......@@ -46,7 +46,7 @@
#ifdef __cplusplus
#include "opencv2/core/core.hpp"
#include "opencv2/core/devmem2d.hpp"
#include "opencv2/core/cuda_devptrs.hpp"
namespace cv { namespace gpu
{
......@@ -268,10 +268,14 @@ namespace cv { namespace gpu
template<typename _Tp> _Tp* ptr(int y = 0);
template<typename _Tp> const _Tp* ptr(int y = 0) const;
template <typename _Tp> operator DevMem2D_<_Tp>() const;
template <typename _Tp> operator PtrStep_<_Tp>() const;
template <typename _Tp> operator PtrStepSz<_Tp>() const;
template <typename _Tp> operator PtrStep<_Tp>() const;
// Deprecated function
__CV_GPU_DEPR_BEFORE__ template <typename _Tp> operator DevMem2D_<_Tp>() const __CV_GPU_DEPR_AFTER__;
#undef __CV_GPU_DEPR_BEFORE__
#undef __CV_GPU_DEPR_AFTER__
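// For reference, deprecation markers of this kind are typically defined along
// these lines (an assumed sketch based on common practice; the actual
// definitions, presumably in cuda_devptrs.hpp, are not shown in this hunk):
//
// #if defined(__GNUC__)
//     #define __CV_GPU_DEPR_BEFORE__
//     #define __CV_GPU_DEPR_AFTER__ __attribute__ ((deprecated))
// #elif defined(_MSC_VER)
//     #define __CV_GPU_DEPR_BEFORE__ __declspec(deprecated)
//     #define __CV_GPU_DEPR_AFTER__
// #else
//     #define __CV_GPU_DEPR_BEFORE__
//     #define __CV_GPU_DEPR_AFTER__
// #endif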
/*! includes several bit-fields:
- the magic signature
- continuity flag
......@@ -502,19 +506,19 @@ namespace cv { namespace gpu
return *this;
}
template <class T> inline GpuMat::operator DevMem2D_<T>() const
template <class T> inline GpuMat::operator PtrStepSz<T>() const
{
return DevMem2D_<T>(rows, cols, (T*)data, step);
return PtrStepSz<T>(rows, cols, (T*)data, step);
}
template <class T> inline GpuMat::operator PtrStep_<T>() const
template <class T> inline GpuMat::operator PtrStep<T>() const
{
return PtrStep_<T>(static_cast< DevMem2D_<T> >(*this));
return PtrStep<T>((T*)data, step);
}
template <class T> inline GpuMat::operator PtrStep<T>() const
template <class T> inline GpuMat::operator DevMem2D_<T>() const
{
return PtrStep<T>((T*)data, step);
return DevMem2D_<T>(rows, cols, (T*)data, step);
}
inline GpuMat createContinuous(int rows, int cols, int type)
......
......@@ -44,6 +44,18 @@
#include "opencv2/gpu/device/transform.hpp"
#include "opencv2/gpu/device/functional.hpp"
namespace cv { namespace gpu { namespace device
{
void writeScalar(const uchar*);
void writeScalar(const schar*);
void writeScalar(const ushort*);
void writeScalar(const short int*);
void writeScalar(const int*);
void writeScalar(const float*);
void writeScalar(const double*);
void convert_gpu(PtrStepSzb, int, PtrStepSzb, int, double, double, cudaStream_t);
}}}
namespace cv { namespace gpu { namespace device
{
template <typename T> struct shift_and_sizeof;
......@@ -59,17 +71,17 @@ namespace cv { namespace gpu { namespace device
////////////////////////////////// CopyTo /////////////////////////////////
///////////////////////////////////////////////////////////////////////////
template <typename T> void copyToWithMask(DevMem2Db src, DevMem2Db dst, int cn, DevMem2Db mask, bool colorMask, cudaStream_t stream)
template <typename T> void copyToWithMask(PtrStepSzb src, PtrStepSzb dst, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream)
{
if (colorMask)
cv::gpu::device::transform((DevMem2D_<T>)src, (DevMem2D_<T>)dst, identity<T>(), SingleMask(mask), stream);
cv::gpu::device::transform((PtrStepSz<T>)src, (PtrStepSz<T>)dst, identity<T>(), SingleMask(mask), stream);
else
cv::gpu::device::transform((DevMem2D_<T>)src, (DevMem2D_<T>)dst, identity<T>(), SingleMaskChannels(mask, cn), stream);
cv::gpu::device::transform((PtrStepSz<T>)src, (PtrStepSz<T>)dst, identity<T>(), SingleMaskChannels(mask, cn), stream);
}
void copyToWithMask_gpu(DevMem2Db src, DevMem2Db dst, size_t elemSize1, int cn, DevMem2Db mask, bool colorMask, cudaStream_t stream)
void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream)
{
typedef void (*func_t)(DevMem2Db src, DevMem2Db dst, int cn, DevMem2Db mask, bool colorMask, cudaStream_t stream);
typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream);
static func_t tab[] =
{
......@@ -164,7 +176,7 @@ namespace cv { namespace gpu { namespace device
}
}
template <typename T>
void set_to_gpu(DevMem2Db mat, const T* scalar, DevMem2Db mask, int channels, cudaStream_t stream)
void set_to_gpu(PtrStepSzb mat, const T* scalar, PtrStepSzb mask, int channels, cudaStream_t stream)
{
writeScalar(scalar);
......@@ -178,16 +190,16 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall ( cudaDeviceSynchronize() );
}
template void set_to_gpu<uchar >(DevMem2Db mat, const uchar* scalar, DevMem2Db mask, int channels, cudaStream_t stream);
template void set_to_gpu<schar >(DevMem2Db mat, const schar* scalar, DevMem2Db mask, int channels, cudaStream_t stream);
template void set_to_gpu<ushort>(DevMem2Db mat, const ushort* scalar, DevMem2Db mask, int channels, cudaStream_t stream);
template void set_to_gpu<short >(DevMem2Db mat, const short* scalar, DevMem2Db mask, int channels, cudaStream_t stream);
template void set_to_gpu<int >(DevMem2Db mat, const int* scalar, DevMem2Db mask, int channels, cudaStream_t stream);
template void set_to_gpu<float >(DevMem2Db mat, const float* scalar, DevMem2Db mask, int channels, cudaStream_t stream);
template void set_to_gpu<double>(DevMem2Db mat, const double* scalar, DevMem2Db mask, int channels, cudaStream_t stream);
template void set_to_gpu<uchar >(PtrStepSzb mat, const uchar* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
template void set_to_gpu<schar >(PtrStepSzb mat, const schar* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
template void set_to_gpu<ushort>(PtrStepSzb mat, const ushort* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
template void set_to_gpu<short >(PtrStepSzb mat, const short* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
template void set_to_gpu<int >(PtrStepSzb mat, const int* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
template void set_to_gpu<float >(PtrStepSzb mat, const float* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
template void set_to_gpu<double>(PtrStepSzb mat, const double* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
template <typename T>
void set_to_gpu(DevMem2Db mat, const T* scalar, int channels, cudaStream_t stream)
void set_to_gpu(PtrStepSzb mat, const T* scalar, int channels, cudaStream_t stream)
{
writeScalar(scalar);
......@@ -201,13 +213,13 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall ( cudaDeviceSynchronize() );
}
template void set_to_gpu<uchar >(DevMem2Db mat, const uchar* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<schar >(DevMem2Db mat, const schar* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<ushort>(DevMem2Db mat, const ushort* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<short >(DevMem2Db mat, const short* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<int >(DevMem2Db mat, const int* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<float >(DevMem2Db mat, const float* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<double>(DevMem2Db mat, const double* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<uchar >(PtrStepSzb mat, const uchar* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<schar >(PtrStepSzb mat, const schar* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<ushort>(PtrStepSzb mat, const ushort* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<short >(PtrStepSzb mat, const short* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<int >(PtrStepSzb mat, const int* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<float >(PtrStepSzb mat, const float* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<double>(PtrStepSzb mat, const double* scalar, int channels, cudaStream_t stream);
///////////////////////////////////////////////////////////////////////////
//////////////////////////////// ConvertTo ////////////////////////////////
......@@ -274,12 +286,12 @@ namespace cv { namespace gpu { namespace device
};
template<typename T, typename D>
void cvt_(DevMem2Db src, DevMem2Db dst, double alpha, double beta, cudaStream_t stream)
void cvt_(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream)
{
cudaSafeCall( cudaSetDoubleForDevice(&alpha) );
cudaSafeCall( cudaSetDoubleForDevice(&beta) );
Convertor<T, D> op(alpha, beta);
cv::gpu::device::transform((DevMem2D_<T>)src, (DevMem2D_<D>)dst, op, WithOutMask(), stream);
cv::gpu::device::transform((PtrStepSz<T>)src, (PtrStepSz<D>)dst, op, WithOutMask(), stream);
}
#if defined __clang__
......@@ -287,9 +299,9 @@ namespace cv { namespace gpu { namespace device
# pragma clang diagnostic ignored "-Wmissing-declarations"
#endif
void convert_gpu(DevMem2Db src, int sdepth, DevMem2Db dst, int ddepth, double alpha, double beta, cudaStream_t stream)
void convert_gpu(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream)
{
typedef void (*caller_t)(DevMem2Db src, DevMem2Db dst, double alpha, double beta, cudaStream_t stream);
typedef void (*caller_t)(PtrStepSzb src, PtrStepSzb dst, double alpha, double beta, cudaStream_t stream);
static const caller_t tab[8][8] =
{
......
......@@ -761,15 +761,15 @@ namespace
namespace cv { namespace gpu { namespace device
{
void copyToWithMask_gpu(DevMem2Db src, DevMem2Db dst, size_t elemSize1, int cn, DevMem2Db mask, bool colorMask, cudaStream_t stream);
void copyToWithMask_gpu(PtrStepSzb src, PtrStepSzb dst, size_t elemSize1, int cn, PtrStepSzb mask, bool colorMask, cudaStream_t stream);
template <typename T>
void set_to_gpu(DevMem2Db mat, const T* scalar, int channels, cudaStream_t stream);
void set_to_gpu(PtrStepSzb mat, const T* scalar, int channels, cudaStream_t stream);
template <typename T>
void set_to_gpu(DevMem2Db mat, const T* scalar, DevMem2Db mask, int channels, cudaStream_t stream);
void set_to_gpu(PtrStepSzb mat, const T* scalar, PtrStepSzb mask, int channels, cudaStream_t stream);
void convert_gpu(DevMem2Db src, int sdepth, DevMem2Db dst, int ddepth, double alpha, double beta, cudaStream_t stream);
void convert_gpu(PtrStepSzb src, int sdepth, PtrStepSzb dst, int ddepth, double alpha, double beta, cudaStream_t stream);
}}}
namespace
......@@ -787,9 +787,22 @@ namespace
}
}
namespace cv { namespace gpu
{
CV_EXPORTS void copyWithMask(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, const cv::gpu::GpuMat&, CUstream_st*);
CV_EXPORTS void convertTo(const cv::gpu::GpuMat&, cv::gpu::GpuMat&);
CV_EXPORTS void convertTo(const cv::gpu::GpuMat&, cv::gpu::GpuMat&, double, double, CUstream_st*);
CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, CUstream_st*);
CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&, CUstream_st*);
CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar);
CV_EXPORTS void setTo(cv::gpu::GpuMat&, cv::Scalar, const cv::gpu::GpuMat&);
}}
namespace cv { namespace gpu
{
CV_EXPORTS void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0)
void copyWithMask(const GpuMat& src, GpuMat& dst, const GpuMat& mask, cudaStream_t stream = 0)
{
CV_Assert(src.size() == dst.size() && src.type() == dst.type());
CV_Assert(src.size() == mask.size() && mask.depth() == CV_8U && (mask.channels() == 1 || mask.channels() == src.channels()));
......@@ -797,17 +810,17 @@ namespace cv { namespace gpu
cv::gpu::device::copyToWithMask_gpu(src.reshape(1), dst.reshape(1), src.elemSize1(), src.channels(), mask.reshape(1), mask.channels() != 1, stream);
}
CV_EXPORTS void convertTo(const GpuMat& src, GpuMat& dst)
void convertTo(const GpuMat& src, GpuMat& dst)
{
cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), 1.0, 0.0, 0);
}
CV_EXPORTS void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0)
void convertTo(const GpuMat& src, GpuMat& dst, double alpha, double beta, cudaStream_t stream = 0)
{
cv::gpu::device::convert_gpu(src.reshape(1), src.depth(), dst.reshape(1), dst.depth(), alpha, beta, stream);
}
CV_EXPORTS void setTo(GpuMat& src, Scalar s, cudaStream_t stream)
void setTo(GpuMat& src, Scalar s, cudaStream_t stream)
{
typedef void (*caller_t)(GpuMat& src, Scalar s, cudaStream_t stream);
......@@ -820,7 +833,7 @@ namespace cv { namespace gpu
callers[src.depth()](src, s, stream);
}
CV_EXPORTS void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)
void setTo(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream)
{
typedef void (*caller_t)(GpuMat& src, Scalar s, const GpuMat& mask, cudaStream_t stream);
......@@ -833,12 +846,12 @@ namespace cv { namespace gpu
callers[src.depth()](src, s, mask, stream);
}
CV_EXPORTS void setTo(GpuMat& src, Scalar s)
void setTo(GpuMat& src, Scalar s)
{
setTo(src, s, 0);
}
CV_EXPORTS void setTo(GpuMat& src, Scalar s, const GpuMat& mask)
void setTo(GpuMat& src, Scalar s, const GpuMat& mask)
{
setTo(src, s, mask, 0);
}
......
......@@ -5,24 +5,24 @@ Data Structures
gpu::DevMem2D\_
gpu::PtrStepSz
---------------
.. ocv:class:: gpu::DevMem2D\_
.. ocv:class:: gpu::PtrStepSz
Lightweight class encapsulating pitched memory on a GPU and passed to nvcc-compiled code (CUDA kernels). Typically, it is used internally by OpenCV and by users who write device code. You can call its members from both host and device code. ::
template <typename T> struct DevMem2D_
template <typename T> struct PtrStepSz
{
int cols;
int rows;
T* data;
size_t step;
DevMem2D_() : cols(0), rows(0), data(0), step(0){};
DevMem2D_(int rows, int cols, T *data, size_t step);
PtrStepSz() : cols(0), rows(0), data(0), step(0){};
PtrStepSz(int rows, int cols, T *data, size_t step);
template <typename U>
explicit DevMem2D_(const DevMem2D_<U>& d);
explicit PtrStepSz(const PtrStepSz<U>& d);
typedef T elem_type;
enum { elem_size = sizeof(elem_type) };
......@@ -34,25 +34,25 @@ Lightweight class encapsulating pitched memory on a GPU and passed to nvcc-compi
__CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const;
};
typedef DevMem2D_<unsigned char> DevMem2D;
typedef DevMem2D_<float> DevMem2Df;
typedef DevMem2D_<int> DevMem2Di;
typedef PtrStepSz<unsigned char> PtrStepSzb;
typedef PtrStepSz<float> PtrStepSzf;
typedef PtrStepSz<int> PtrStepSzi;
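
For illustration, a minimal kernel sketch using the renamed type (a hedged example relying only on the ``cols``, ``rows``, and ``ptr`` members shown above; it is not part of the library)::

    __global__ void threshold(cv::gpu::PtrStepSzb img, unsigned char thresh)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        // cols/rows travel with the pointer, so the bounds check needs no extra arguments
        if (x < img.cols && y < img.rows)
            img.ptr(y)[x] = img.ptr(y)[x] > thresh ? 255 : 0;
    }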
gpu::PtrStep\_
gpu::PtrStep
--------------
.. ocv:class:: gpu::PtrStep\_
.. ocv:class:: gpu::PtrStep
Structure similar to :ocv:class:`gpu::DevMem2D_` but containing only a pointer and row step. Width and height fields are excluded due to performance reasons. The structure is intended for internal use or for users who write device code. ::
Structure similar to :ocv:class:`gpu::PtrStepSz` but containing only a pointer and row step. Width and height fields are excluded due to performance reasons. The structure is intended for internal use or for users who write device code. ::
template<typename T> struct PtrStep_
template<typename T> struct PtrStep
{
T* data;
size_t step;
PtrStep_();
PtrStep_(const DevMem2D_<T>& mem);
PtrStep();
PtrStep(const PtrStepSz<T>& mem);
typedef T elem_type;
enum { elem_size = sizeof(elem_type) };
......@@ -62,25 +62,9 @@ Structure similar to :ocv:class:`gpu::DevMem2D_` but containing only a pointer a
__CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const;
};
typedef PtrStep_<unsigned char> PtrStep;
typedef PtrStep_<float> PtrStepf;
typedef PtrStep_<int> PtrStepi;
gpu::PtrElemStep\_
------------------
.. ocv:class:: gpu::PtrElemStep\_
Structure similar to :ocv:class:`gpu::DevMem2D_` but containing only a pointer and a row step in elements. Width and height fields are excluded due to performance reasons. This class can only be constructed if ``sizeof(T)`` is a multiple of 256. The structure is intended for internal use or for users who write device code. ::
template<typename T> struct PtrElemStep_ : public PtrStep_<T>
{
PtrElemStep_(const DevMem2D_<T>& mem);
__CV_GPU_HOST_DEVICE__ T* ptr(int y = 0);
__CV_GPU_HOST_DEVICE__ const T* ptr(int y = 0) const;
};
typedef PtrStep<unsigned char> PtrStep;
typedef PtrStep<float> PtrStepf;
typedef PtrStep<int> PtrStepi;
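
A hedged sketch of how the size-less variant is typically used: the caller keeps the bounds and passes only the pointer and step (an illustrative example, not taken from the library)::

    __global__ void add(cv::gpu::PtrStepf dst, cv::gpu::PtrStepf a, cv::gpu::PtrStepf b,
                        int rows, int cols)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        // rows/cols arrive separately because PtrStep carries no size
        if (x < cols && y < rows)
            dst.ptr(y)[x] = a.ptr(y)[x] + b.ptr(y)[x];
    }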
gpu::GpuMat
......@@ -93,7 +77,7 @@ Base storage class for GPU memory with reference counting. Its interface matches
* no functions that return references to their data (because references on GPU are not valid for CPU)
* no expression templates technique support
Beware that the latter limitation may lead to overloaded matrix operators that cause memory allocations. The ``GpuMat`` class is convertible to :ocv:class:`gpu::DevMem2D_` and :ocv:class:`gpu::PtrStep_` so it can be passed directly to the kernel.
Beware that the latter limitation may lead to overloaded matrix operators that cause memory allocations. The ``GpuMat`` class is convertible to :ocv:class:`gpu::PtrStepSz` and :ocv:class:`gpu::PtrStep` so it can be passed directly to the kernel.
.. note:: In contrast with :ocv:class:`Mat`, in most cases ``GpuMat::isContinuous() == false`` . This means that rows are aligned to a size depending on the hardware. Single-row ``GpuMat`` is always a continuous matrix.
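
Since ``GpuMat`` converts implicitly to both wrapper types, kernels like the sketches above can take a ``GpuMat`` directly at the launch site (a hedged example; ``divUp`` is assumed to be the usual grid-rounding helper)::

    cv::gpu::GpuMat img(480, 640, CV_8UC1);

    dim3 block(32, 8);
    dim3 grid(cv::gpu::divUp(img.cols, block.x), cv::gpu::divUp(img.rows, block.y));

    // GpuMat -> PtrStepSzb conversion happens implicitly at the call
    threshold<<<grid, block>>>(img, 128);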
......@@ -113,10 +97,10 @@ Beware that the latter limitation may lead to overloaded matrix operators that c
//! builds GpuMat from Mat. Blocks uploading to device.
explicit GpuMat (const Mat& m);
//! returns lightweight DevMem2D_ structure for passing
//! returns lightweight PtrStepSz structure for passing
//to nvcc-compiled code. Contains size, data ptr and step.
template <class T> operator DevMem2D_<T>() const;
template <class T> operator PtrStep_<T>() const;
template <class T> operator PtrStepSz<T>() const;
template <class T> operator PtrStep<T>() const;
//! blocks uploading data to GpuMat.
void upload(const cv::Mat& m);
......
......@@ -40,4 +40,4 @@
//
//M*/
#include "opencv2/core/devmem2d.hpp"
#include "opencv2/core/cuda_devptrs.hpp"
......@@ -454,8 +454,8 @@ namespace cv { namespace gpu { namespace device
{
namespace mathfunc
{
void cartToPolar_gpu(DevMem2Df x, DevMem2Df y, DevMem2Df mag, bool magSqr, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream);
void polarToCart_gpu(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream);
void cartToPolar_gpu(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, bool magSqr, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream);
void polarToCart_gpu(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream);
}
}}}
......
......@@ -58,7 +58,7 @@ namespace cv { namespace gpu { namespace device {
float decisionThreshold, int maxFeatures, int numInitializationFrames);
template <typename SrcT>
void update_gpu(DevMem2Db frame, PtrStepb fgmask, DevMem2Di colors, PtrStepf weights, PtrStepi nfeatures,
void update_gpu(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures,
int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
}
}}}
......@@ -109,7 +109,7 @@ void cv::gpu::GMG_GPU::operator ()(const cv::gpu::GpuMat& frame, cv::gpu::GpuMat
{
using namespace cv::gpu::device::bgfg_gmg;
typedef void (*func_t)(DevMem2Db frame, PtrStepb fgmask, DevMem2Di colors, PtrStepf weights, PtrStepi nfeatures,
typedef void (*func_t)(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures,
int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
static const func_t funcs[6][4] =
{
......
......@@ -62,14 +62,14 @@ namespace cv { namespace gpu { namespace device
{
namespace mog
{
void mog_gpu(DevMem2Db frame, int cn, DevMem2Db fgmask, DevMem2Df weight, DevMem2Df sortKey, DevMem2Db mean, DevMem2Db var,
void mog_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzf weight, PtrStepSzf sortKey, PtrStepSzb mean, PtrStepSzb var,
int nmixtures, float varThreshold, float learningRate, float backgroundRatio, float noiseSigma,
cudaStream_t stream);
void getBackgroundImage_gpu(int cn, DevMem2Df weight, DevMem2Db mean, DevMem2Db dst, int nmixtures, float backgroundRatio, cudaStream_t stream);
void getBackgroundImage_gpu(int cn, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, int nmixtures, float backgroundRatio, cudaStream_t stream);
void loadConstants(int nmixtures, float Tb, float TB, float Tg, float varInit, float varMin, float varMax, float tau, unsigned char shadowVal);
void mog2_gpu(DevMem2Db frame, int cn, DevMem2Db fgmask, DevMem2Db modesUsed, DevMem2Df weight, DevMem2Df variance, DevMem2Db mean, float alphaT, float prune, bool detectShadows, cudaStream_t stream);
void getBackgroundImage2_gpu(int cn, DevMem2Db modesUsed, DevMem2Df weight, DevMem2Db mean, DevMem2Db dst, cudaStream_t stream);
void mog2_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzf variance, PtrStepSzb mean, float alphaT, float prune, bool detectShadows, cudaStream_t stream);
void getBackgroundImage2_gpu(int cn, PtrStepSzb modesUsed, PtrStepSzf weight, PtrStepSzb mean, PtrStepSzb dst, cudaStream_t stream);
}
}}}
......
......@@ -57,9 +57,9 @@ namespace cv { namespace gpu { namespace device
{
void loadConstants(int nbSamples, int reqMatches, int radius, int subsamplingFactor);
void init_gpu(DevMem2Db frame, int cn, DevMem2Db samples, DevMem2D_<unsigned int> randStates, cudaStream_t stream);
void init_gpu(PtrStepSzb frame, int cn, PtrStepSzb samples, PtrStepSz<unsigned int> randStates, cudaStream_t stream);
void update_gpu(DevMem2Db frame, int cn, DevMem2Db fgmask, DevMem2Db samples, DevMem2D_<unsigned int> randStates, cudaStream_t stream);
void update_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzb samples, PtrStepSz<unsigned int> randStates, cudaStream_t stream);
}
}}}
......
......@@ -59,10 +59,10 @@ namespace cv { namespace gpu { namespace device
{
namespace bilateral_filter
{
void load_constants(float* table_color, DevMem2Df table_space, int ndisp, int radius, short edge_disc, short max_disc);
void load_constants(float* table_color, PtrStepSzf table_space, int ndisp, int radius, short edge_disc, short max_disc);
void bilateral_filter_gpu(DevMem2Db disp, DevMem2Db img, int channels, int iters, cudaStream_t stream);
void bilateral_filter_gpu(DevMem2D_<short> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream);
void bilateral_filter_gpu(PtrStepSzb disp, PtrStepSzb img, int channels, int iters, cudaStream_t stream);
void bilateral_filter_gpu(PtrStepSz<short> disp, PtrStepSzb img, int channels, int iters, cudaStream_t stream);
}
}}}
......@@ -120,7 +120,7 @@ namespace
disp.copyTo(dst);
}
bilateral_filter_gpu((DevMem2D_<T>)dst, img, img.channels(), iters, StreamAccessor::getStream(stream));
bilateral_filter_gpu((PtrStepSz<T>)dst, img, img.channels(), iters, StreamAccessor::getStream(stream));
}
typedef void (*bilateral_filter_operator_t)(int ndisp, int radius, int iters, float edge_threshold, float max_disc_threshold,
......
......@@ -60,12 +60,12 @@ namespace cv { namespace gpu { namespace device
{
namespace transform_points
{
void call(const DevMem2D_<float3> src, const float* rot, const float* transl, DevMem2D_<float3> dst, cudaStream_t stream);
void call(const PtrStepSz<float3> src, const float* rot, const float* transl, PtrStepSz<float3> dst, cudaStream_t stream);
}
namespace project_points
{
void call(const DevMem2D_<float3> src, const float* rot, const float* transl, const float* proj, DevMem2D_<float2> dst, cudaStream_t stream);
void call(const PtrStepSz<float3> src, const float* rot, const float* transl, const float* proj, PtrStepSz<float2> dst, cudaStream_t stream);
}
namespace solve_pnp_ransac
......
......@@ -352,18 +352,18 @@ namespace cv { namespace gpu { namespace device
float initalScale,
float factor,
int total,
const DevMem2Db& mstages,
const PtrStepSzb& mstages,
const int nstages,
const DevMem2Di& mnodes,
const DevMem2Df& mleaves,
const DevMem2Di& msubsets,
const DevMem2Db& mfeatures,
const PtrStepSzi& mnodes,
const PtrStepSzf& mleaves,
const PtrStepSzi& msubsets,
const PtrStepSzb& mfeatures,
const int subsetSize,
DevMem2D_<int4> objects,
PtrStepSz<int4> objects,
unsigned int* classified,
DevMem2Di integral);
PtrStepSzi integral);
void connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects,int groupThreshold, float grouping_eps, unsigned int* nclasses);
void connectedConmonents(PtrStepSz<int4> candidates, int ncandidates, PtrStepSz<int4> objects,int groupThreshold, float grouping_eps, unsigned int* nclasses);
}
}}}
......
......@@ -59,9 +59,9 @@ namespace cv { namespace gpu {
namespace device
{
template <int cn>
void Bayer2BGR_8u_gpu(DevMem2Db src, DevMem2Db dst, bool blue_last, bool start_with_green, cudaStream_t stream);
void Bayer2BGR_8u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template <int cn>
void Bayer2BGR_16u_gpu(DevMem2Db src, DevMem2Db dst, bool blue_last, bool start_with_green, cudaStream_t stream);
void Bayer2BGR_16u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
}
}}
......@@ -69,7 +69,7 @@ using namespace ::cv::gpu::device;
namespace
{
typedef void (*gpu_func_t)(const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);
typedef void (*gpu_func_t)(const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
void bgr_to_rgb(const GpuMat& src, GpuMat& dst, int, Stream& stream)
{
......@@ -1336,7 +1336,7 @@ namespace
void bayer_to_bgr(const GpuMat& src, GpuMat& dst, int dcn, bool blue_last, bool start_with_green, Stream& stream)
{
typedef void (*func_t)(DevMem2Db src, DevMem2Db dst, bool blue_last, bool start_with_green, cudaStream_t stream);
typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
static const func_t funcs[3][4] =
{
{0,0,Bayer2BGR_8u_gpu<3>, Bayer2BGR_8u_gpu<4>},
......
......@@ -191,7 +191,7 @@ namespace cv { namespace gpu { namespace device {
dstImage[y * dstImagePitch + x + 1 ] = RGBAPACK_10bit(red[1], green[1], blue[1], constAlpha);
}
void NV12ToARGB_gpu(const PtrStepb decodedFrame, DevMem2D_<uint> interopFrame, cudaStream_t stream)
void NV12ToARGB_gpu(const PtrStepb decodedFrame, PtrStepSz<uint> interopFrame, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(interopFrame.cols, 2 * block.x), divUp(interopFrame.rows, block.y));
......
......@@ -168,7 +168,7 @@ namespace cv { namespace gpu { namespace device {
template <typename T> struct Quantization : detail::Quantization<VecTraits<T>::cn> {};
template <typename SrcT>
__global__ void update(const PtrStep_<SrcT> frame, PtrStepb fgmask, PtrStepi colors_, PtrStepf weights_, PtrStepi nfeatures_,
__global__ void update(const PtrStep<SrcT> frame, PtrStepb fgmask, PtrStepi colors_, PtrStepf weights_, PtrStepi nfeatures_,
const int frameNum, const float learningRate, const bool updateBackgroundModel)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
......@@ -222,7 +222,7 @@ namespace cv { namespace gpu { namespace device {
}
template <typename SrcT>
void update_gpu(DevMem2Db frame, PtrStepb fgmask, DevMem2Di colors, PtrStepf weights, PtrStepi nfeatures,
void update_gpu(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures,
int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream)
{
const dim3 block(32, 8);
......@@ -230,7 +230,7 @@ namespace cv { namespace gpu { namespace device {
cudaSafeCall( cudaFuncSetCacheConfig(update<SrcT>, cudaFuncCachePreferL1) );
update<SrcT><<<grid, block, 0, stream>>>((DevMem2D_<SrcT>) frame, fgmask, colors, weights, nfeatures, frameNum, learningRate, updateBackgroundModel);
update<SrcT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, fgmask, colors, weights, nfeatures, frameNum, learningRate, updateBackgroundModel);
cudaSafeCall( cudaGetLastError() );
......@@ -238,16 +238,16 @@ namespace cv { namespace gpu { namespace device {
cudaSafeCall( cudaDeviceSynchronize() );
}
template void update_gpu<uchar >(DevMem2Db frame, PtrStepb fgmask, DevMem2Di colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
template void update_gpu<uchar3 >(DevMem2Db frame, PtrStepb fgmask, DevMem2Di colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
template void update_gpu<uchar4 >(DevMem2Db frame, PtrStepb fgmask, DevMem2Di colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
template void update_gpu<uchar >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
template void update_gpu<uchar3 >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
template void update_gpu<uchar4 >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
template void update_gpu<ushort >(DevMem2Db frame, PtrStepb fgmask, DevMem2Di colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
template void update_gpu<ushort3>(DevMem2Db frame, PtrStepb fgmask, DevMem2Di colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
template void update_gpu<ushort4>(DevMem2Db frame, PtrStepb fgmask, DevMem2Di colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
template void update_gpu<ushort >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
template void update_gpu<ushort3>(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
template void update_gpu<ushort4>(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
template void update_gpu<float >(DevMem2Db frame, PtrStepb fgmask, DevMem2Di colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
template void update_gpu<float3 >(DevMem2Db frame, PtrStepb fgmask, DevMem2Di colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
template void update_gpu<float4 >(DevMem2Db frame, PtrStepb fgmask, DevMem2Di colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
template void update_gpu<float >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
template void update_gpu<float3 >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
template void update_gpu<float4 >(PtrStepSzb frame, PtrStepb fgmask, PtrStepSzi colors, PtrStepf weights, PtrStepi nfeatures, int frameNum, float learningRate, bool updateBackgroundModel, cudaStream_t stream);
}
}}}
......@@ -90,7 +90,7 @@ namespace cv { namespace gpu { namespace device
}
template <typename SrcT, typename SampleT>
__global__ void init(const DevMem2D_<SrcT> frame, PtrStep_<SampleT> samples, PtrStep_<uint> randStates)
__global__ void init(const PtrStepSz<SrcT> frame, PtrStep<SampleT> samples, PtrStep<uint> randStates)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
......@@ -116,23 +116,23 @@ namespace cv { namespace gpu { namespace device
}
template <typename SrcT, typename SampleT>
void init_caller(DevMem2Db frame, DevMem2Db samples, DevMem2D_<uint> randStates, cudaStream_t stream)
void init_caller(PtrStepSzb frame, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(frame.cols, block.x), divUp(frame.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(init<SrcT, SampleT>, cudaFuncCachePreferL1) );
init<SrcT, SampleT><<<grid, block, 0, stream>>>((DevMem2D_<SrcT>) frame, (DevMem2D_<SampleT>) samples, randStates);
init<SrcT, SampleT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, (PtrStepSz<SampleT>) samples, randStates);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void init_gpu(DevMem2Db frame, int cn, DevMem2Db samples, DevMem2D_<uint> randStates, cudaStream_t stream)
void init_gpu(PtrStepSzb frame, int cn, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream)
{
typedef void (*func_t)(DevMem2Db frame, DevMem2Db samples, DevMem2D_<uint> randStates, cudaStream_t stream);
typedef void (*func_t)(PtrStepSzb frame, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream);
static const func_t funcs[] =
{
0, init_caller<uchar, uchar>, 0, init_caller<uchar3, uchar4>, init_caller<uchar4, uchar4>
......@@ -155,7 +155,7 @@ namespace cv { namespace gpu { namespace device
}
template <typename SrcT, typename SampleT>
__global__ void update(const DevMem2D_<SrcT> frame, PtrStepb fgmask, PtrStep_<SampleT> samples, PtrStep_<uint> randStates)
__global__ void update(const PtrStepSz<SrcT> frame, PtrStepb fgmask, PtrStep<SampleT> samples, PtrStep<uint> randStates)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
......@@ -225,23 +225,23 @@ namespace cv { namespace gpu { namespace device
}
template <typename SrcT, typename SampleT>
void update_caller(DevMem2Db frame, DevMem2Db fgmask, DevMem2Db samples, DevMem2D_<uint> randStates, cudaStream_t stream)
void update_caller(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(frame.cols, block.x), divUp(frame.rows, block.y));
cudaSafeCall( cudaFuncSetCacheConfig(update<SrcT, SampleT>, cudaFuncCachePreferL1) );
update<SrcT, SampleT><<<grid, block, 0, stream>>>((DevMem2D_<SrcT>) frame, fgmask, (DevMem2D_<SampleT>) samples, randStates);
update<SrcT, SampleT><<<grid, block, 0, stream>>>((PtrStepSz<SrcT>) frame, fgmask, (PtrStepSz<SampleT>) samples, randStates);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
void update_gpu(DevMem2Db frame, int cn, DevMem2Db fgmask, DevMem2Db samples, DevMem2D_<uint> randStates, cudaStream_t stream)
void update_gpu(PtrStepSzb frame, int cn, PtrStepSzb fgmask, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream)
{
typedef void (*func_t)(DevMem2Db frame, DevMem2Db fgmask, DevMem2Db samples, DevMem2D_<uint> randStates, cudaStream_t stream);
typedef void (*func_t)(PtrStepSzb frame, PtrStepSzb fgmask, PtrStepSzb samples, PtrStepSz<uint> randStates, cudaStream_t stream);
static const func_t funcs[] =
{
0, update_caller<uchar, uchar>, 0, update_caller<uchar3, uchar4>, update_caller<uchar4, uchar4>
......
......@@ -57,7 +57,7 @@ namespace cv { namespace gpu { namespace device
__constant__ short cedge_disc;
__constant__ short cmax_disc;
void load_constants(float* table_color, DevMem2Df table_space, int ndisp, int radius, short edge_disc, short max_disc)
void load_constants(float* table_color, PtrStepSzf table_space, int ndisp, int radius, short edge_disc, short max_disc)
{
cudaSafeCall( cudaMemcpyToSymbol(ctable_color, &table_color, sizeof(table_color)) );
cudaSafeCall( cudaMemcpyToSymbol(ctable_space, &table_space.data, sizeof(table_space.data)) );
......@@ -176,7 +176,7 @@ namespace cv { namespace gpu { namespace device
}
template <typename T>
void bilateral_filter_caller(DevMem2D_<T> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)
void bilateral_filter_caller(PtrStepSz<T> disp, PtrStepSzb img, int channels, int iters, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
......@@ -213,12 +213,12 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall( cudaDeviceSynchronize() );
}
void bilateral_filter_gpu(DevMem2Db disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)
void bilateral_filter_gpu(PtrStepSzb disp, PtrStepSzb img, int channels, int iters, cudaStream_t stream)
{
bilateral_filter_caller(disp, img, channels, iters, stream);
}
void bilateral_filter_gpu(DevMem2D_<short> disp, DevMem2Db img, int channels, int iters, cudaStream_t stream)
void bilateral_filter_gpu(PtrStepSz<short> disp, PtrStepSzb img, int channels, int iters, cudaStream_t stream)
{
bilateral_filter_caller(disp, img, channels, iters, stream);
}
......
......@@ -66,8 +66,8 @@ namespace cv { namespace gpu { namespace device
}
};
void call(const DevMem2D_<float3> src, const float* rot,
const float* transl, DevMem2D_<float3> dst,
void call(const PtrStepSz<float3> src, const float* rot,
const float* transl, PtrStepSz<float3> dst,
cudaStream_t stream)
{
cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3));
......@@ -103,8 +103,8 @@ namespace cv { namespace gpu { namespace device
}
};
void call(const DevMem2D_<float3> src, const float* rot,
const float* transl, const float* proj, DevMem2D_<float2> dst,
void call(const PtrStepSz<float3> src, const float* rot,
const float* transl, const float* proj, PtrStepSz<float2> dst,
cudaStream_t stream)
{
cudaSafeCall(cudaMemcpyToSymbol(crot0, rot, sizeof(float) * 3));
......
......@@ -176,7 +176,7 @@ namespace cv { namespace gpu { namespace device
template<typename T, typename F>
__global__ void computeConnectivity(const DevMem2D_<T> image, DevMem2D components, F connected)
__global__ void computeConnectivity(const PtrStepSz<T> image, PtrStepSzb components, F connected)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
......@@ -202,7 +202,7 @@ namespace cv { namespace gpu { namespace device
}
template< typename T>
void computeEdges(const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream)
void computeEdges(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream)
{
dim3 block(CTA_SIZE_X, CTA_SIZE_Y);
dim3 grid(divUp(image.cols, block.x), divUp(image.rows, block.y));
......@@ -210,23 +210,23 @@ namespace cv { namespace gpu { namespace device
typedef InInterval<typename IntervalsTraits<T>::dist_type, IntervalsTraits<T>::ch> Int_t;
Int_t inInt(lo, hi);
computeConnectivity<T, Int_t><<<grid, block, 0, stream>>>(static_cast<const DevMem2D_<T> >(image), edges, inInt);
computeConnectivity<T, Int_t><<<grid, block, 0, stream>>>(static_cast<const PtrStepSz<T> >(image), edges, inInt);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void computeEdges<uchar> (const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream);
template void computeEdges<uchar3> (const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream);
template void computeEdges<uchar4> (const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream);
template void computeEdges<ushort> (const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream);
template void computeEdges<ushort3>(const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream);
template void computeEdges<ushort4>(const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream);
template void computeEdges<int> (const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream);
template void computeEdges<float> (const DevMem2D& image, DevMem2D edges, const float4& lo, const float4& hi, cudaStream_t stream);
template void computeEdges<uchar> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
template void computeEdges<uchar3> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
template void computeEdges<uchar4> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
template void computeEdges<ushort> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
template void computeEdges<ushort3>(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
template void computeEdges<ushort4>(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
template void computeEdges<int> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
template void computeEdges<float> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
__global__ void lableTiles(const DevMem2D edges, DevMem2Di comps)
__global__ void lableTiles(const PtrStepSzb edges, PtrStepSzi comps)
{
int x = threadIdx.x + blockIdx.x * TILE_COLS;
int y = threadIdx.y + blockIdx.y * TILE_ROWS;
......@@ -360,7 +360,7 @@ namespace cv { namespace gpu { namespace device
}
}
__device__ __forceinline__ int root(const DevMem2Di& comps, int label)
__device__ __forceinline__ int root(const PtrStepSzi& comps, int label)
{
while(1)
{
......@@ -376,7 +376,7 @@ namespace cv { namespace gpu { namespace device
return label;
}
__device__ __forceinline__ void isConnected(DevMem2Di& comps, int l1, int l2, bool& changed)
__device__ __forceinline__ void isConnected(PtrStepSzi& comps, int l1, int l2, bool& changed)
{
int r1 = root(comps, l1);
int r2 = root(comps, l2);
......@@ -394,7 +394,7 @@ namespace cv { namespace gpu { namespace device
}
__global__ void crossMerge(const int tilesNumY, const int tilesNumX, int tileSizeY, int tileSizeX,
const DevMem2D edges, DevMem2Di comps, const int yIncomplete, int xIncomplete)
const PtrStepSzb edges, PtrStepSzi comps, const int yIncomplete, int xIncomplete)
{
int tid = threadIdx.y * blockDim.x + threadIdx.x;
int stride = blockDim.y * blockDim.x;
......@@ -482,7 +482,7 @@ namespace cv { namespace gpu { namespace device
} while (Emulation::syncthreadsOr(changed));
}
__global__ void flatten(const DevMem2D edges, DevMem2Di comps)
__global__ void flatten(const PtrStepSzb edges, PtrStepSzi comps)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
......@@ -493,7 +493,7 @@ namespace cv { namespace gpu { namespace device
enum {CC_NO_COMPACT = 0, CC_COMPACT_LABELS = 1};
void labelComponents(const DevMem2D& edges, DevMem2Di comps, int flags, cudaStream_t stream)
void labelComponents(const PtrStepSzb& edges, PtrStepSzi comps, int flags, cudaStream_t stream)
{
dim3 block(CTA_SIZE_X, CTA_SIZE_Y);
dim3 grid(divUp(edges.cols, TILE_COLS), divUp(edges.rows, TILE_ROWS));
......
......@@ -222,12 +222,12 @@ namespace cv { namespace gpu { namespace device
};
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR(name, traits) \
void name(const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream) \
void name(const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream) \
{ \
traits::functor_type functor = traits::create_functor(); \
typedef typename traits::functor_type::argument_type src_t; \
typedef typename traits::functor_type::result_type dst_t; \
cv::gpu::device::transform((DevMem2D_<src_t>)src, (DevMem2D_<dst_t>)dst, functor, WithOutMask(), stream); \
cv::gpu::device::transform((PtrStepSz<src_t>)src, (PtrStepSz<dst_t>)dst, functor, WithOutMask(), stream); \
}
#define OPENCV_GPU_IMPLEMENT_CVTCOLOR_ONE(name) \
......
......@@ -62,7 +62,7 @@ namespace cv { namespace gpu { namespace device
}
template <int KSIZE, typename T, typename D, typename B>
__global__ void linearColumnFilter(const DevMem2D_<T> src, PtrStep<D> dst, const int anchor, const B brd)
__global__ void linearColumnFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
const int BLOCK_DIM_X = 16;
......@@ -125,7 +125,7 @@ namespace cv { namespace gpu { namespace device
}
template <int KSIZE, typename T, typename D, template<typename> class B>
void linearColumnFilter_caller(DevMem2D_<T> src, DevMem2D_<D> dst, int anchor, int cc, cudaStream_t stream)
void linearColumnFilter_caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream)
{
int BLOCK_DIM_X;
int BLOCK_DIM_Y;
......@@ -158,9 +158,9 @@ namespace cv { namespace gpu { namespace device
}
template <typename T, typename D>
void linearColumnFilter_gpu(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
void linearColumnFilter_gpu(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
{
typedef void (*caller_t)(DevMem2D_<T> src, DevMem2D_<D> dst, int anchor, int cc, cudaStream_t stream);
typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);
static const caller_t callers[5][33] =
{
......@@ -343,13 +343,13 @@ namespace cv { namespace gpu { namespace device
loadKernel(kernel, ksize);
callers[brd_type][ksize]((DevMem2D_<T>)src, (DevMem2D_<D>)dst, anchor, cc, stream);
callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
}
template void linearColumnFilter_gpu<float , uchar >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearColumnFilter_gpu<float4, uchar4>(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearColumnFilter_gpu<float3, short3>(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearColumnFilter_gpu<float , int >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearColumnFilter_gpu<float , float >(DevMem2Db src, DevMem2Db dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearColumnFilter_gpu<float , uchar >(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearColumnFilter_gpu<float4, uchar4>(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearColumnFilter_gpu<float3, short3>(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearColumnFilter_gpu<float , int >(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
template void linearColumnFilter_gpu<float , float >(PtrStepSzb src, PtrStepSzb dst, const float kernel[], int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);
} // namespace column_filter
}}} // namespace cv { namespace gpu { namespace device
......@@ -49,7 +49,7 @@ namespace cv { namespace gpu {
namespace device
{
template <typename D>
__global__ void Bayer2BGR_8u(const PtrStepb src, DevMem2D_<D> dst, const bool blue_last, const bool start_with_green)
__global__ void Bayer2BGR_8u(const PtrStepb src, PtrStepSz<D> dst, const bool blue_last, const bool start_with_green)
{
const int s_x = blockIdx.x * blockDim.x + threadIdx.x;
int s_y = blockIdx.y * blockDim.y + threadIdx.y;
......@@ -193,7 +193,7 @@ namespace cv { namespace gpu {
}
template <typename D>
__global__ void Bayer2BGR_16u(const PtrStepb src, DevMem2D_<D> dst, const bool blue_last, const bool start_with_green)
__global__ void Bayer2BGR_16u(const PtrStepb src, PtrStepSz<D> dst, const bool blue_last, const bool start_with_green)
{
const int s_x = blockIdx.x * blockDim.x + threadIdx.x;
int s_y = blockIdx.y * blockDim.y + threadIdx.y;
......@@ -287,7 +287,7 @@ namespace cv { namespace gpu {
}
template <int cn>
void Bayer2BGR_8u_gpu(DevMem2Db src, DevMem2Db dst, bool blue_last, bool start_with_green, cudaStream_t stream)
void Bayer2BGR_8u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream)
{
typedef typename TypeVec<uchar, cn>::vec_type dst_t;
......@@ -296,14 +296,14 @@ namespace cv { namespace gpu {
cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_8u<dst_t>, cudaFuncCachePreferL1) );
Bayer2BGR_8u<dst_t><<<grid, block, 0, stream>>>(src, (DevMem2D_<dst_t>)dst, blue_last, start_with_green);
Bayer2BGR_8u<dst_t><<<grid, block, 0, stream>>>(src, (PtrStepSz<dst_t>)dst, blue_last, start_with_green);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int cn>
void Bayer2BGR_16u_gpu(DevMem2Db src, DevMem2Db dst, bool blue_last, bool start_with_green, cudaStream_t stream)
void Bayer2BGR_16u_gpu(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream)
{
typedef typename TypeVec<ushort, cn>::vec_type dst_t;
......@@ -312,16 +312,16 @@ namespace cv { namespace gpu {
cudaSafeCall( cudaFuncSetCacheConfig(Bayer2BGR_16u<dst_t>, cudaFuncCachePreferL1) );
Bayer2BGR_16u<dst_t><<<grid, block, 0, stream>>>(src, (DevMem2D_<dst_t>)dst, blue_last, start_with_green);
Bayer2BGR_16u<dst_t><<<grid, block, 0, stream>>>(src, (PtrStepSz<dst_t>)dst, blue_last, start_with_green);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template void Bayer2BGR_8u_gpu<3>(DevMem2Db src, DevMem2Db dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_8u_gpu<4>(DevMem2Db src, DevMem2Db dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_16u_gpu<3>(DevMem2Db src, DevMem2Db dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_16u_gpu<4>(DevMem2Db src, DevMem2Db dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_8u_gpu<3>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_8u_gpu<4>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_16u_gpu<3>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
template void Bayer2BGR_16u_gpu<4>(PtrStepSzb src, PtrStepSzb dst, bool blue_last, bool start_with_green, cudaStream_t stream);
}
}}
......@@ -221,7 +221,7 @@ namespace cv { namespace gpu { namespace device
}
template <bool calcScore, class Mask>
__global__ void calcKeypoints(const DevMem2Db img, const Mask mask, short2* kpLoc, const unsigned int maxKeypoints, PtrStepi score, const int threshold)
__global__ void calcKeypoints(const PtrStepSzb img, const Mask mask, short2* kpLoc, const unsigned int maxKeypoints, PtrStepi score, const int threshold)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
......@@ -282,7 +282,7 @@ namespace cv { namespace gpu { namespace device
#endif
}
int calcKeypoints_gpu(DevMem2Db img, DevMem2Db mask, short2* kpLoc, int maxKeypoints, DevMem2Di score, int threshold)
int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
......@@ -323,7 +323,7 @@ namespace cv { namespace gpu { namespace device
///////////////////////////////////////////////////////////////////////////
// nonmaxSupression
__global__ void nonmaxSupression(const short2* kpLoc, int count, const DevMem2Di scoreMat, short2* locFinal, float* responseFinal)
__global__ void nonmaxSupression(const short2* kpLoc, int count, const PtrStepSzi scoreMat, short2* locFinal, float* responseFinal)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
......@@ -359,7 +359,7 @@ namespace cv { namespace gpu { namespace device
#endif
}
int nonmaxSupression_gpu(const short2* kpLoc, int count, DevMem2Di score, short2* loc, float* response)
int nonmaxSupression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
......
#ifndef __FGD_BGFG_COMMON_HPP__
#define __FGD_BGFG_COMMON_HPP__
#include "opencv2/core/devmem2d.hpp"
#include "opencv2/core/cuda_devptrs.hpp"
namespace bgfg
{
......@@ -122,24 +122,24 @@ namespace bgfg
const int HISTOGRAM_BIN_COUNT = 256;
template <typename PT, typename CT>
void calcDiffHistogram_gpu(cv::gpu::DevMem2Db prevFrame, cv::gpu::DevMem2Db curFrame,
void calcDiffHistogram_gpu(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame,
unsigned int* hist0, unsigned int* hist1, unsigned int* hist2,
unsigned int* partialBuf0, unsigned int* partialBuf1, unsigned int* partialBuf2,
int cc, cudaStream_t stream);
template <typename PT, typename CT>
void calcDiffThreshMask_gpu(cv::gpu::DevMem2Db prevFrame, cv::gpu::DevMem2Db curFrame, uchar3 bestThres, cv::gpu::DevMem2Db changeMask, cudaStream_t stream);
void calcDiffThreshMask_gpu(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame, uchar3 bestThres, cv::gpu::PtrStepSzb changeMask, cudaStream_t stream);
void setBGPixelStat(const BGPixelStat& stat);
template <typename PT, typename CT, typename OT>
void bgfgClassification_gpu(cv::gpu::DevMem2Db prevFrame, cv::gpu::DevMem2Db curFrame,
cv::gpu::DevMem2Db Ftd, cv::gpu::DevMem2Db Fbd, cv::gpu::DevMem2Db foreground,
void bgfgClassification_gpu(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame,
cv::gpu::PtrStepSzb Ftd, cv::gpu::PtrStepSzb Fbd, cv::gpu::PtrStepSzb foreground,
int deltaC, int deltaCC, float alpha2, int N1c, int N1cc, cudaStream_t stream);
template <typename PT, typename CT, typename OT>
void updateBackgroundModel_gpu(cv::gpu::DevMem2Db prevFrame, cv::gpu::DevMem2Db curFrame,
cv::gpu::DevMem2Db Ftd, cv::gpu::DevMem2Db Fbd, cv::gpu::DevMem2Db foreground, cv::gpu::DevMem2Db background,
void updateBackgroundModel_gpu(cv::gpu::PtrStepSzb prevFrame, cv::gpu::PtrStepSzb curFrame,
cv::gpu::PtrStepSzb Ftd, cv::gpu::PtrStepSzb Fbd, cv::gpu::PtrStepSzb foreground, cv::gpu::PtrStepSzb background,
int deltaC, int deltaCC, float alpha1, float alpha2, float alpha3, int N1c, int N1cc, int N2c, int N2cc, float T,
cudaStream_t stream);
}
......
......@@ -97,7 +97,7 @@ namespace cv { namespace gpu { namespace device
#endif // __CUDA_ARCH__ >= 110
}
int findCorners_gpu(DevMem2Df eig, float threshold, DevMem2Db mask, float2* corners, int max_count)
int findCorners_gpu(PtrStepSzf eig, float threshold, PtrStepSzb mask, float2* corners, int max_count)
{
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
......@@ -134,7 +134,7 @@ namespace cv { namespace gpu { namespace device
};
void sortCorners_gpu(DevMem2Df eig, float2* corners, int count)
void sortCorners_gpu(PtrStepSzf eig, float2* corners, int count)
{
bindTexture(&eigTex, eig);
......
......@@ -68,7 +68,7 @@ int compactPoints(int N, float *points0, float *points1, const uchar *mask)
__global__ void calcWobbleSuppressionMapsKernel(
const int left, const int idx, const int right, const int width, const int height,
PtrElemStepf mapx, PtrElemStepf mapy)
PtrStepf mapx, PtrStepf mapy)
{
const int x = blockDim.x * blockIdx.x + threadIdx.x;
const int y = blockDim.y * blockIdx.y + threadIdx.y;
......@@ -97,7 +97,7 @@ __global__ void calcWobbleSuppressionMapsKernel(
void calcWobbleSuppressionMaps(
int left, int idx, int right, int width, int height,
const float *ml, const float *mr, DevMem2Df mapx, DevMem2Df mapy)
const float *ml, const float *mr, PtrStepSzf mapx, PtrStepSzf mapy)
{
cudaSafeCall(cudaMemcpyToSymbol(cml, ml, 9*sizeof(float)));
cudaSafeCall(cudaMemcpyToSymbol(cmr, mr, 9*sizeof(float)));
......
......@@ -169,10 +169,10 @@ namespace cv { namespace gpu { namespace device
d_Histogram[blockIdx.x] = saturate_cast<int>(data[0]);
}
void histogram256_gpu(DevMem2Db src, int* hist, uint* buf, cudaStream_t stream)
void histogram256_gpu(PtrStepSzb src, int* hist, uint* buf, cudaStream_t stream)
{
histogram256<<<PARTIAL_HISTOGRAM256_COUNT, HISTOGRAM256_THREADBLOCK_SIZE, 0, stream>>>(
DevMem2D_<uint>(src),
PtrStepSz<uint>(src),
buf,
static_cast<uint>(src.rows * src.step / sizeof(uint)),
src.cols);
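Reinterpreting the byte image as PtrStepSz<uint> lets each thread pull four pixels in one coalesced 32-bit load; src.rows * src.step / sizeof(uint) is the total word count passed above. Inside the kernel each word is unpacked with shifts, roughly (a sketch, with s_Hist standing in for the per-block shared histogram):

    const uint data = d_Data[pos];                  // four packed 8-bit pixels
    atomicAdd(s_Hist + ((data >>  0) & 0xFFU), 1);
    atomicAdd(s_Hist + ((data >>  8) & 0xFFU), 1);
    atomicAdd(s_Hist + ((data >> 16) & 0xFFU), 1);
    atomicAdd(s_Hist + ((data >> 24) & 0xFFU), 1);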
......@@ -189,7 +189,7 @@ namespace cv { namespace gpu { namespace device
__constant__ int c_lut[256];
__global__ void equalizeHist(const DevMem2Db src, PtrStepb dst)
__global__ void equalizeHist(const PtrStepSzb src, PtrStepb dst)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
......@@ -202,7 +202,7 @@ namespace cv { namespace gpu { namespace device
}
}
void equalizeHist_gpu(DevMem2Db src, DevMem2Db dst, const int* lut, cudaStream_t stream)
void equalizeHist_gpu(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream)
{
dim3 block(16, 16);
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
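divUp is the usual integer ceiling division, sizing the grid so a partial tile at the image border still gets a block:

    static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; }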
......
......@@ -110,8 +110,8 @@ namespace cv { namespace gpu { namespace device
template <int nblocks> // Number of histogram blocks processed by single GPU thread block
__global__ void compute_hists_kernel_many_blocks(const int img_block_width, const PtrElemStepf grad,
const PtrElemStep qangle, float scale, float* block_hists)
__global__ void compute_hists_kernel_many_blocks(const int img_block_width, const PtrStepf grad,
const PtrStepb qangle, float scale, float* block_hists)
{
const int block_x = threadIdx.z;
const int cell_x = threadIdx.x / 16;
......@@ -149,8 +149,8 @@ namespace cv { namespace gpu { namespace device
float2 vote = *(const float2*)grad_ptr;
uchar2 bin = *(const uchar2*)qangle_ptr;
grad_ptr += grad.step;
qangle_ptr += qangle.step;
grad_ptr += grad.step/grad.elemSize();
qangle_ptr += qangle.step/qangle.elemSize();
int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);
int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);
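This hunk is the one behavioral change in the rename: PtrElemStep counted its step in elements, whereas PtrStep keeps it in bytes (matching GpuMat::step), so typed pointer arithmetic must now divide by the element size. Equivalently, assuming the wrapper's data/step members:

    float* row0 = grad.data;                         // first row
    float* row1 = row0 + grad.step / sizeof(float);  // byte step converted to elements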
......@@ -188,8 +188,8 @@ namespace cv { namespace gpu { namespace device
void compute_hists(int nbins, int block_stride_x, int block_stride_y,
int height, int width, const DevMem2Df& grad,
const DevMem2Db& qangle, float sigma, float* block_hists)
int height, int width, const PtrStepSzf& grad,
const PtrStepSzb& qangle, float sigma, float* block_hists)
{
const int nblocks = 1;
......@@ -512,7 +512,7 @@ namespace cv { namespace gpu { namespace device
template <int nthreads>
__global__ void extract_descrs_by_rows_kernel(const int img_block_width, const int win_block_stride_x, const int win_block_stride_y,
const float* block_hists, PtrElemStepf descriptors)
const float* block_hists, PtrStepf descriptors)
{
// Get left top corner of the window in src
const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +
......@@ -532,7 +532,7 @@ namespace cv { namespace gpu { namespace device
void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x,
int height, int width, float* block_hists, DevMem2Df descriptors)
int height, int width, float* block_hists, PtrStepSzf descriptors)
{
const int nthreads = 256;
......@@ -555,7 +555,7 @@ namespace cv { namespace gpu { namespace device
template <int nthreads>
__global__ void extract_descrs_by_cols_kernel(const int img_block_width, const int win_block_stride_x,
const int win_block_stride_y, const float* block_hists,
PtrElemStepf descriptors)
PtrStepf descriptors)
{
// Get left top corner of the window in src
const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width +
......@@ -581,7 +581,7 @@ namespace cv { namespace gpu { namespace device
void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x,
int win_stride_y, int win_stride_x, int height, int width, float* block_hists,
DevMem2Df descriptors)
PtrStepSzf descriptors)
{
const int nthreads = 256;
......@@ -605,8 +605,8 @@ namespace cv { namespace gpu { namespace device
template <int nthreads, int correct_gamma>
__global__ void compute_gradients_8UC4_kernel(int height, int width, const PtrElemStep img,
float angle_scale, PtrElemStepf grad, PtrElemStep qangle)
__global__ void compute_gradients_8UC4_kernel(int height, int width, const PtrStepb img,
float angle_scale, PtrStepf grad, PtrStepb qangle)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
......@@ -707,8 +707,8 @@ namespace cv { namespace gpu { namespace device
}
void compute_gradients_8UC4(int nbins, int height, int width, const DevMem2Db& img,
float angle_scale, DevMem2Df grad, DevMem2Db qangle, bool correct_gamma)
void compute_gradients_8UC4(int nbins, int height, int width, const PtrStepSzb& img,
float angle_scale, PtrStepSzf grad, PtrStepSzb qangle, bool correct_gamma)
{
(void)nbins;
const int nthreads = 256;
......@@ -727,8 +727,8 @@ namespace cv { namespace gpu { namespace device
}
template <int nthreads, int correct_gamma>
__global__ void compute_gradients_8UC1_kernel(int height, int width, const PtrElemStep img,
float angle_scale, PtrElemStepf grad, PtrElemStep qangle)
__global__ void compute_gradients_8UC1_kernel(int height, int width, const PtrStepb img,
float angle_scale, PtrStepf grad, PtrStepb qangle)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
......@@ -780,8 +780,8 @@ namespace cv { namespace gpu { namespace device
}
void compute_gradients_8UC1(int nbins, int height, int width, const DevMem2Db& img,
float angle_scale, DevMem2Df grad, DevMem2Db qangle, bool correct_gamma)
void compute_gradients_8UC1(int nbins, int height, int width, const PtrStepSzb& img,
float angle_scale, PtrStepSzf grad, PtrStepSzb qangle, bool correct_gamma)
{
(void)nbins;
const int nthreads = 256;
......@@ -807,7 +807,7 @@ namespace cv { namespace gpu { namespace device
texture<uchar4, 2, cudaReadModeNormalizedFloat> resize8UC4_tex;
texture<uchar, 2, cudaReadModeNormalizedFloat> resize8UC1_tex;
__global__ void resize_for_hog_kernel(float sx, float sy, DevMem2D_<uchar> dst, int colOfs)
__global__ void resize_for_hog_kernel(float sx, float sy, PtrStepSz<uchar> dst, int colOfs)
{
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
......@@ -816,7 +816,7 @@ namespace cv { namespace gpu { namespace device
dst.ptr(y)[x] = tex2D(resize8UC1_tex, x * sx + colOfs, y * sy) * 255;
}
__global__ void resize_for_hog_kernel(float sx, float sy, DevMem2D_<uchar4> dst, int colOfs)
__global__ void resize_for_hog_kernel(float sx, float sy, PtrStepSz<uchar4> dst, int colOfs)
{
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
......@@ -829,7 +829,7 @@ namespace cv { namespace gpu { namespace device
}
template<class T, class TEX>
static void resize_for_hog(const DevMem2Db& src, DevMem2Db dst, TEX& tex)
static void resize_for_hog(const PtrStepSzb& src, PtrStepSzb dst, TEX& tex)
{
tex.filterMode = cudaFilterModeLinear;
......@@ -852,7 +852,7 @@ namespace cv { namespace gpu { namespace device
float sx = static_cast<float>(src.cols) / dst.cols;
float sy = static_cast<float>(src.rows) / dst.rows;
resize_for_hog_kernel<<<grid, threads>>>(sx, sy, (DevMem2D_<T>)dst, colOfs);
resize_for_hog_kernel<<<grid, threads>>>(sx, sy, (PtrStepSz<T>)dst, colOfs);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
......@@ -860,7 +860,7 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall( cudaUnbindTexture(tex) );
}
void resize_8UC1(const DevMem2Db& src, DevMem2Db dst) { resize_for_hog<uchar> (src, dst, resize8UC1_tex); }
void resize_8UC4(const DevMem2Db& src, DevMem2Db dst) { resize_for_hog<uchar4>(src, dst, resize8UC4_tex); }
void resize_8UC1(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog<uchar> (src, dst, resize8UC1_tex); }
void resize_8UC4(const PtrStepSzb& src, PtrStepSzb dst) { resize_for_hog<uchar4>(src, dst, resize8UC4_tex); }
} // namespace hog
}}} // namespace cv { namespace gpu { namespace device
......@@ -55,7 +55,7 @@ namespace cv { namespace gpu { namespace device
const int PIXELS_PER_THREAD = 16;
__global__ void buildPointList(const DevMem2Db src, unsigned int* list)
__global__ void buildPointList(const PtrStepSzb src, unsigned int* list)
{
__shared__ unsigned int s_queues[4][32 * PIXELS_PER_THREAD];
__shared__ int s_qsize[4];
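buildPointList compacts the non-zero pixels of src into a flat list. The shared arrays above form a per-row staging queue: threads append locally with cheap shared-memory atomics, and only one global atomic per block reserves the output range. A hedged sketch of the idiom (g_counter and the coordinate packing are illustrative):

    const int qidx = atomicAdd(&s_qsize[threadIdx.y], 1);   // shared-memory atomic
    s_queues[threadIdx.y][qidx] = (y << 16) | x;            // pack the coordinate

    __syncthreads();

    // one thread then reserves a contiguous range in the global list:
    // if (threadIdx.x == 0 && threadIdx.y == 0)
    //     s_globStart = atomicAdd(&g_counter, s_totalSize);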
......@@ -112,7 +112,7 @@ namespace cv { namespace gpu { namespace device
list[gidx] = s_queues[threadIdx.y][i];
}
int buildPointList_gpu(DevMem2Db src, unsigned int* list)
int buildPointList_gpu(PtrStepSzb src, unsigned int* list)
{
void* counterPtr;
cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
......@@ -206,7 +206,7 @@ namespace cv { namespace gpu { namespace device
accumRow[i] = smem[i];
}
void linesAccum_gpu(const unsigned int* list, int count, DevMem2Di accum, float rho, float theta, size_t sharedMemPerBlock, bool has20)
void linesAccum_gpu(const unsigned int* list, int count, PtrStepSzi accum, float rho, float theta, size_t sharedMemPerBlock, bool has20)
{
const dim3 block(has20 ? 1024 : 512);
const dim3 grid(accum.rows - 2);
......@@ -226,7 +226,7 @@ namespace cv { namespace gpu { namespace device
////////////////////////////////////////////////////////////////////////
// linesGetResult
__global__ void linesGetResult(const DevMem2Di accum, float2* out, int* votes, const int maxSize, const float rho, const float theta, const int threshold, const int numrho)
__global__ void linesGetResult(const PtrStepSzi accum, float2* out, int* votes, const int maxSize, const float rho, const float theta, const int threshold, const int numrho)
{
const int r = blockIdx.x * blockDim.x + threadIdx.x;
const int n = blockIdx.y * blockDim.y + threadIdx.y;
......@@ -254,7 +254,7 @@ namespace cv { namespace gpu { namespace device
}
}
int linesGetResult_gpu(DevMem2Di accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort)
int linesGetResult_gpu(PtrStepSzi accum, float2* out, int* votes, int maxSize, float rho, float theta, int threshold, bool doSort)
{
void* counterPtr;
cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
......@@ -341,7 +341,7 @@ namespace cv { namespace gpu { namespace device
}
}
void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, DevMem2Di accum, int minRadius, int maxRadius, float idp)
void circlesAccumCenters_gpu(const unsigned int* list, int count, PtrStepi dx, PtrStepi dy, PtrStepSzi accum, int minRadius, int maxRadius, float idp)
{
const dim3 block(256);
const dim3 grid(divUp(count, block.x));
......@@ -357,7 +357,7 @@ namespace cv { namespace gpu { namespace device
////////////////////////////////////////////////////////////////////////
// buildCentersList
__global__ void buildCentersList(const DevMem2Di accum, unsigned int* centers, const int threshold)
__global__ void buildCentersList(const PtrStepSzi accum, unsigned int* centers, const int threshold)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
......@@ -381,7 +381,7 @@ namespace cv { namespace gpu { namespace device
}
}
int buildCentersList_gpu(DevMem2Di accum, unsigned int* centers, int threshold)
int buildCentersList_gpu(PtrStepSzi accum, unsigned int* centers, int threshold)
{
void* counterPtr;
cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
......@@ -467,7 +467,7 @@ namespace cv { namespace gpu { namespace device
const dim3 block(has20 ? 1024 : 512);
const dim3 grid(centersCount);
const int histSize = ::ceil(maxRadius - minRadius + 1);
const int histSize = maxRadius - minRadius + 1;
size_t smemSize = (histSize + 2) * sizeof(int);
circlesAccumRadius<<<grid, block, smemSize>>>(centers, list, count, circles, maxCircles, dp, minRadius, maxRadius, histSize, threshold);
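The third launch-configuration argument requests smemSize bytes of dynamic shared memory per block, sized at run time from the radius range; the kernel maps it with an extern declaration, along these lines (a sketch):

    extern __shared__ int smem[];   // histSize + 2 ints, per the launch above
    int* hist = smem;               // one vote counter per candidate radius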
......
......@@ -57,7 +57,7 @@ namespace cv { namespace gpu { namespace device
return bytes;
}
__global__ void shfl_integral_horizontal(const PtrStep_<uint4> img, PtrStep_<uint4> integral)
__global__ void shfl_integral_horizontal(const PtrStep<uint4> img, PtrStep<uint4> integral)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
__shared__ int sums[128];
......@@ -297,7 +297,7 @@ namespace cv { namespace gpu { namespace device
// The final set of sums from the block is then propagated, with the block
// computing "down" the image and adding the running sum to the local
// block sums.
__global__ void shfl_integral_vertical(DevMem2D_<unsigned int> integral)
__global__ void shfl_integral_vertical(PtrStepSz<unsigned int> integral)
{
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 300)
__shared__ unsigned int sums[32][9];
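The shfl_ kernels replace shared-memory scans with register shuffles, which is why they are gated on __CUDA_ARCH__ >= 300. A minimal warp-level inclusive scan of the kind they build on (a sketch; __shfl_up is the pre-__shfl_up_sync intrinsic of CUDA toolkits of this era):

    __device__ unsigned int warpInclusiveScan(unsigned int val)
    {
        const unsigned int lane = threadIdx.x & 31;

        #pragma unroll
        for (int offset = 1; offset < 32; offset *= 2)
        {
            const int n = __shfl_up((int)val, offset);
            if (lane >= offset)
                val += (unsigned int)n;
        }
        return val;   // lane i now holds the sum of lanes 0..i
    }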
......@@ -355,7 +355,7 @@ namespace cv { namespace gpu { namespace device
#endif
}
void shfl_integral_gpu(DevMem2Db img, DevMem2D_<unsigned int> integral, cudaStream_t stream)
void shfl_integral_gpu(PtrStepSzb img, PtrStepSz<unsigned int> integral, cudaStream_t stream)
{
{
// each thread handles 16 values, use 1 block/row
......@@ -366,7 +366,7 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall( cudaFuncSetCacheConfig(shfl_integral_horizontal, cudaFuncCachePreferL1) );
shfl_integral_horizontal<<<grid, block, 0, stream>>>((DevMem2D_<uint4>) img, (DevMem2D_<uint4>) integral);
shfl_integral_horizontal<<<grid, block, 0, stream>>>((PtrStepSz<uint4>) img, (PtrStepSz<uint4>) integral);
cudaSafeCall( cudaGetLastError() );
}
......
......@@ -181,7 +181,7 @@ namespace cv { namespace gpu { namespace device
}
}
void connectedConmonents(DevMem2D_<int4> candidates, int ncandidates, DevMem2D_<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
void connectedConmonents(PtrStepSz<int4> candidates, int ncandidates, PtrStepSz<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
{
int block = ncandidates;
int smem = block * ( sizeof(int) + sizeof(int4) );
......@@ -240,7 +240,7 @@ namespace cv { namespace gpu { namespace device
// stepShift, scale, width_k, sum_prev => y = sum_prev + tid_k / width_k, x = tid_k - (tid_k / width_k) * width_k
__global__ void lbp_cascade(const Cascade cascade, int frameW, int frameH, int windowW, int windowH, float scale, const float factor,
const int total, int* integral, const int pitch, DevMem2D_<int4> objects, unsigned int* classified)
const int total, int* integral, const int pitch, PtrStepSz<int4> objects, unsigned int* classified)
{
int ftid = blockIdx.x * blockDim.x + threadIdx.x;
if (ftid >= total) return;
......@@ -285,8 +285,8 @@ namespace cv { namespace gpu { namespace device
}
void classifyPyramid(int frameW, int frameH, int windowW, int windowH, float initialScale, float factor, int workAmount,
const DevMem2Db& mstages, const int nstages, const DevMem2Di& mnodes, const DevMem2Df& mleaves, const DevMem2Di& msubsets, const DevMem2Db& mfeatures,
const int subsetSize, DevMem2D_<int4> objects, unsigned int* classified, DevMem2Di integral)
const PtrStepSzb& mstages, const int nstages, const PtrStepSzi& mnodes, const PtrStepSzf& mleaves, const PtrStepSzi& msubsets, const PtrStepSzb& mfeatures,
const int subsetSize, PtrStepSz<int4> objects, unsigned int* classified, PtrStepSzi integral)
{
const int block = 128;
int grid = divUp(workAmount, block);
......
......@@ -130,7 +130,7 @@ namespace cv { namespace gpu { namespace device
}
template <typename Mag, typename Angle>
void cartToPolar_caller(DevMem2Df x, DevMem2Df y, DevMem2Df mag, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream)
void cartToPolar_caller(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
......@@ -149,9 +149,9 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall( cudaDeviceSynchronize() );
}
void cartToPolar_gpu(DevMem2Df x, DevMem2Df y, DevMem2Df mag, bool magSqr, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream)
void cartToPolar_gpu(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, bool magSqr, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream)
{
typedef void (*caller_t)(DevMem2Df x, DevMem2Df y, DevMem2Df mag, DevMem2Df angle, bool angleInDegrees, cudaStream_t stream);
typedef void (*caller_t)(PtrStepSzf x, PtrStepSzf y, PtrStepSzf mag, PtrStepSzf angle, bool angleInDegrees, cudaStream_t stream);
static const caller_t callers[2][2][2] =
{
{
......@@ -180,7 +180,7 @@ namespace cv { namespace gpu { namespace device
}
template <typename Mag>
void polarToCart_caller(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream)
void polarToCart_caller(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream)
{
dim3 threads(32, 8, 1);
dim3 grid(1, 1, 1);
......@@ -198,9 +198,9 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall( cudaDeviceSynchronize() );
}
void polarToCart_gpu(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream)
void polarToCart_gpu(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream)
{
typedef void (*caller_t)(DevMem2Df mag, DevMem2Df angle, DevMem2Df x, DevMem2Df y, bool angleInDegrees, cudaStream_t stream);
typedef void (*caller_t)(PtrStepSzf mag, PtrStepSzf angle, PtrStepSzf x, PtrStepSzf y, bool angleInDegrees, cudaStream_t stream);
static const caller_t callers[2] =
{
polarToCart_caller<NonEmptyMag>,
......
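cartToPolar_gpu and polarToCart_gpu pick a pre-instantiated caller out of a small table indexed by the boolean options instead of branching per pixel; the shape of the pattern, reduced to one flag (illustrative, not code from this commit):

    typedef void (*caller_t)(PtrStepSzf src, PtrStepSzf dst, cudaStream_t stream);

    template <bool angleInDegrees>
    static void caller(PtrStepSzf src, PtrStepSzf dst, cudaStream_t stream)
    {
        // launch the kernel compiled for this option set ...
    }

    static const caller_t callers[2] = { caller<false>, caller<true> };
    // callers[angleInDegrees ? 1 : 0](src, dst, stream);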
......@@ -49,7 +49,7 @@ namespace cv { namespace gpu { namespace device
#define NEEDLE_MAP_SCALE 16
#define NUM_VERTS_PER_ARROW 6
__global__ void NeedleMapAverageKernel(const DevMem2Df u, const PtrStepf v, PtrStepf u_avg, PtrStepf v_avg)
__global__ void NeedleMapAverageKernel(const PtrStepSzf u, const PtrStepf v, PtrStepf u_avg, PtrStepf v_avg)
{
__shared__ float smem[2 * NEEDLE_MAP_SCALE];
......@@ -111,7 +111,7 @@ namespace cv { namespace gpu { namespace device
}
}
void NeedleMapAverage_gpu(DevMem2Df u, DevMem2Df v, DevMem2Df u_avg, DevMem2Df v_avg)
void NeedleMapAverage_gpu(PtrStepSzf u, PtrStepSzf v, PtrStepSzf u_avg, PtrStepSzf v_avg)
{
const dim3 block(NEEDLE_MAP_SCALE);
const dim3 grid(u_avg.cols, u_avg.rows);
......@@ -122,7 +122,7 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall( cudaDeviceSynchronize() );
}
__global__ void NeedleMapVertexKernel(const DevMem2Df u_avg, const PtrStepf v_avg, float* vertex_data, float* color_data, float max_flow, float xscale, float yscale)
__global__ void NeedleMapVertexKernel(const PtrStepSzf u_avg, const PtrStepf v_avg, float* vertex_data, float* color_data, float max_flow, float xscale, float yscale)
{
// test - just draw a triangle at each pixel
const int x = blockIdx.x * blockDim.x + threadIdx.x;
......@@ -202,7 +202,7 @@ namespace cv { namespace gpu { namespace device
}
}
void CreateOpticalFlowNeedleMap_gpu(DevMem2Df u_avg, DevMem2Df v_avg, float* vertex_buffer, float* color_data, float max_flow, float xscale, float yscale)
void CreateOpticalFlowNeedleMap_gpu(PtrStepSzf u_avg, PtrStepSzf v_avg, float* vertex_buffer, float* color_data, float max_flow, float xscale, float yscale)
{
const dim3 block(16);
const dim3 grid(divUp(u_avg.cols, block.x), divUp(u_avg.rows, block.y));
......
......@@ -135,7 +135,7 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
}
void polynomialExpansionGpu(const DevMem2Df &src, int polyN, DevMem2Df dst, cudaStream_t stream)
void polynomialExpansionGpu(const PtrStepSzf &src, int polyN, PtrStepSzf dst, cudaStream_t stream)
{
dim3 block(256);
dim3 grid(divUp(src.cols, block.x - 2*polyN), src.rows);
......@@ -251,8 +251,8 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
void updateMatricesGpu(
const DevMem2Df flowx, const DevMem2Df flowy, const DevMem2Df R0, const DevMem2Df R1,
DevMem2Df M, cudaStream_t stream)
const PtrStepSzf flowx, const PtrStepSzf flowy, const PtrStepSzf R0, const PtrStepSzf R1,
PtrStepSzf M, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(flowx.cols, block.x), divUp(flowx.rows, block.y));
......@@ -288,7 +288,7 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
}
void updateFlowGpu(const DevMem2Df M, DevMem2Df flowx, DevMem2Df flowy, cudaStream_t stream)
void updateFlowGpu(const PtrStepSzf M, PtrStepSzf flowx, PtrStepSzf flowy, cudaStream_t stream)
{
dim3 block(32, 8);
dim3 grid(divUp(flowx.cols, block.x), divUp(flowx.rows, block.y));
......@@ -340,7 +340,7 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
}
void boxFilterGpu(const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream)
void boxFilterGpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
{
dim3 block(256);
dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
......@@ -414,7 +414,7 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
}
void boxFilter5Gpu(const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream)
void boxFilter5Gpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
{
int height = src.rows / 5;
int width = src.cols;
......@@ -433,7 +433,7 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
}
void boxFilter5Gpu_CC11(const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream)
void boxFilter5Gpu_CC11(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
{
int height = src.rows / 5;
int width = src.cols;
......@@ -501,7 +501,7 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
template <typename Border>
void gaussianBlurCaller(const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream)
void gaussianBlurCaller(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
{
int height = src.rows;
int width = src.cols;
......@@ -521,9 +521,9 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
void gaussianBlurGpu(
const DevMem2Df src, int ksizeHalf, DevMem2Df dst, int borderMode, cudaStream_t stream)
const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderMode, cudaStream_t stream)
{
typedef void (*caller_t)(const DevMem2Df, int, DevMem2Df, cudaStream_t);
typedef void (*caller_t)(const PtrStepSzf, int, PtrStepSzf, cudaStream_t);
static const caller_t callers[] =
{
......@@ -596,7 +596,7 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
template <typename Border, int blockDimX>
void gaussianBlur5Caller(
const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream)
const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
{
int height = src.rows / 5;
int width = src.cols;
......@@ -616,9 +616,9 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
void gaussianBlur5Gpu(
const DevMem2Df src, int ksizeHalf, DevMem2Df dst, int borderMode, cudaStream_t stream)
const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderMode, cudaStream_t stream)
{
typedef void (*caller_t)(const DevMem2Df, int, DevMem2Df, cudaStream_t);
typedef void (*caller_t)(const PtrStepSzf, int, PtrStepSzf, cudaStream_t);
static const caller_t callers[] =
{
......@@ -630,9 +630,9 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
}
void gaussianBlur5Gpu_CC11(
const DevMem2Df src, int ksizeHalf, DevMem2Df dst, int borderMode, cudaStream_t stream)
const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderMode, cudaStream_t stream)
{
typedef void (*caller_t)(const DevMem2Df, int, DevMem2Df, cudaStream_t);
typedef void (*caller_t)(const PtrStepSzf, int, PtrStepSzf, cudaStream_t);
static const caller_t callers[] =
{
......
......@@ -122,7 +122,7 @@ namespace cv { namespace gpu { namespace device
}
}
void HarrisResponses_gpu(DevMem2Db img, const short2* loc, float* response, const int npoints, int blockSize, float harris_k, cudaStream_t stream)
void HarrisResponses_gpu(PtrStepSzb img, const short2* loc, float* response, const int npoints, int blockSize, float harris_k, cudaStream_t stream)
{
dim3 block(32, 8);
......@@ -201,7 +201,7 @@ namespace cv { namespace gpu { namespace device
}
}
void IC_Angle_gpu(DevMem2Db image, const short2* loc, float* angle, int npoints, int half_k, cudaStream_t stream)
void IC_Angle_gpu(PtrStepSzb image, const short2* loc, float* angle, int npoints, int half_k, cudaStream_t stream)
{
dim3 block(32, 8);
......
......@@ -124,7 +124,7 @@ namespace cv { namespace gpu { namespace device
}
}
template <typename T, template <typename> class B> void pyrDown_caller(DevMem2D_<T> src, DevMem2D_<T> dst, cudaStream_t stream)
template <typename T, template <typename> class B> void pyrDown_caller(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream)
{
const dim3 block(256);
const dim3 grid(divUp(src.cols, block.x), dst.rows);
......@@ -138,39 +138,39 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T> void pyrDown_gpu(DevMem2Db src, DevMem2Db dst, cudaStream_t stream)
template <typename T> void pyrDown_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
pyrDown_caller<T, BrdReflect101>(static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<T> >(dst), stream);
pyrDown_caller<T, BrdReflect101>(static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(dst), stream);
}
template void pyrDown_gpu<uchar>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
//template void pyrDown_gpu<uchar2>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
template void pyrDown_gpu<uchar3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
template void pyrDown_gpu<uchar4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
//template void pyrDown_gpu<schar>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
//template void pyrDown_gpu<char2>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
//template void pyrDown_gpu<char3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
//template void pyrDown_gpu<char4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
template void pyrDown_gpu<ushort>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
//template void pyrDown_gpu<ushort2>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
template void pyrDown_gpu<ushort3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
template void pyrDown_gpu<ushort4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
template void pyrDown_gpu<short>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
//template void pyrDown_gpu<short2>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
template void pyrDown_gpu<short3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
template void pyrDown_gpu<short4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
//template void pyrDown_gpu<int>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
//template void pyrDown_gpu<int2>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
//template void pyrDown_gpu<int3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
//template void pyrDown_gpu<int4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
template void pyrDown_gpu<float>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
//template void pyrDown_gpu<float2>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
template void pyrDown_gpu<float3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
template void pyrDown_gpu<float4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
template void pyrDown_gpu<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<uchar2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<uchar3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<uchar4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<char2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<char3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<char4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<ushort2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<ushort3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<ushort4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<short2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<short3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<short4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<int2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<int3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<int4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<float2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<float3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<float4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device
......@@ -50,7 +50,7 @@ namespace cv { namespace gpu { namespace device
{
namespace imgproc
{
template <typename T> __global__ void pyrUp(const DevMem2D_<T> src, DevMem2D_<T> dst)
template <typename T> __global__ void pyrUp(const PtrStepSz<T> src, PtrStepSz<T> dst)
{
typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t;
......@@ -142,7 +142,7 @@ namespace cv { namespace gpu { namespace device
dst(y, x) = saturate_cast<T>(4.0f * sum);
}
template <typename T> void pyrUp_caller(DevMem2D_<T> src, DevMem2D_<T> dst, cudaStream_t stream)
template <typename T> void pyrUp_caller(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream)
{
const dim3 block(16, 16);
const dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));
......@@ -154,39 +154,39 @@ namespace cv { namespace gpu { namespace device
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename T> void pyrUp_gpu(DevMem2Db src, DevMem2Db dst, cudaStream_t stream)
template <typename T> void pyrUp_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
{
pyrUp_caller<T>(static_cast< DevMem2D_<T> >(src), static_cast< DevMem2D_<T> >(dst), stream);
pyrUp_caller<T>(static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(dst), stream);
}
template void pyrUp_gpu<uchar>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
//template void pyrUp_gpu<uchar2>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
template void pyrUp_gpu<uchar3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
template void pyrUp_gpu<uchar4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
//template void pyrUp_gpu<schar>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
//template void pyrUp_gpu<char2>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
//template void pyrUp_gpu<char3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
//template void pyrUp_gpu<char4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
template void pyrUp_gpu<ushort>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
//template void pyrUp_gpu<ushort2>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
template void pyrUp_gpu<ushort3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
template void pyrUp_gpu<ushort4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
template void pyrUp_gpu<short>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
//template void pyrUp_gpu<short2>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
template void pyrUp_gpu<short3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
template void pyrUp_gpu<short4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
//template void pyrUp_gpu<int>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
//template void pyrUp_gpu<int2>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
//template void pyrUp_gpu<int3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
//template void pyrUp_gpu<int4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
template void pyrUp_gpu<float>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
//template void pyrUp_gpu<float2>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
template void pyrUp_gpu<float3>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
template void pyrUp_gpu<float4>(DevMem2Db src, DevMem2Db dst, cudaStream_t stream);
template void pyrUp_gpu<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<uchar2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<uchar3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<uchar4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<char2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<char3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<char4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<ushort2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<ushort3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<ushort4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<short2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<short3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<short4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<int2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<int3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<int4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrUp_gpu<float2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<float3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrUp_gpu<float4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
} // namespace imgproc
}}} // namespace cv { namespace gpu { namespace device
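pyrDown_gpu/pyrUp_gpu are compiled once per element type here and exposed through the type-erased PtrStepSzb signature, so the host side can select an instantiation from a table indexed by depth and channel count, for example (a sketch of the conventional dispatch, not code from this commit):

    typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);

    static const func_t funcs[2][4] =
    {
        { pyrUp_gpu<uchar>, 0 /*uchar2: disabled*/, pyrUp_gpu<uchar3>, pyrUp_gpu<uchar4> },
        { pyrUp_gpu<float>, 0 /*float2: disabled*/, pyrUp_gpu<float3>, pyrUp_gpu<float4> }
    };
    // funcs[depthIdx][channels - 1](src, dst, stream);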
......@@ -46,7 +46,7 @@
namespace cv { namespace gpu { namespace device
{
#define OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name) \
void name(const DevMem2Db& src, const DevMem2Db& dst, cudaStream_t stream);
void name(const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
#define OPENCV_GPU_DECLARE_CVTCOLOR_ALL(name) \
OPENCV_GPU_DECLARE_CVTCOLOR_ONE(name ## _8u) \
......
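For a given name (rgb_to_gray here, purely as an illustration) OPENCV_GPU_DECLARE_CVTCOLOR_ONE expands to a single type-erased declaration, and the _ALL variant stamps out one per source depth; the _8u case is visible above, and the remaining suffixes are an assumption about the truncated macro body:

    void rgb_to_gray_8u (const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
    void rgb_to_gray_16u(const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);
    void rgb_to_gray_32f(const PtrStepSzb& src, const PtrStepSzb& dst, cudaStream_t stream);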