Commit 66738d74 authored by Dan Moodie's avatar Dan Moodie

Modified sparse PyrLK optical flow to accept a pre-built image pyramid as input, which allows image pyramids to be cached and reused across successive calls.
Added unsigned char support for 1-, 3-, and 4-channel images.
parent 8d79285d
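For reference, a minimal usage sketch of what this change enables (the helper and function names below are illustrative, not part of the patch): with the new pyramid path, the caller can build the GpuMat pyramids once, pass them to calc() as std::vector<cv::cuda::GpuMat>, and reuse a frame's pyramid on the next call instead of rebuilding it; 8-bit 1/3/4-channel inputs no longer need a CV_32F conversion first.

```cpp
// Minimal sketch (assumes this commit is applied; helper names are illustrative).
#include <vector>
#include "opencv2/core/cuda.hpp"
#include "opencv2/cudaoptflow.hpp"
#include "opencv2/cudawarping.hpp"   // cv::cuda::pyrDown

// Build a (maxLevel + 1)-entry pyramid once so it can be reused on later calls.
static void buildPyramid(const cv::cuda::GpuMat& img, std::vector<cv::cuda::GpuMat>& pyr, int maxLevel)
{
    pyr.resize(maxLevel + 1);
    pyr[0] = img;                                   // level 0: original image (e.g. CV_8UC1/3/4)
    for (int level = 1; level <= maxLevel; ++level)
        cv::cuda::pyrDown(pyr[level - 1], pyr[level]);
}

void trackWithCachedPyramids(const cv::cuda::GpuMat& prevImg, const cv::cuda::GpuMat& nextImg,
                             const cv::cuda::GpuMat& prevPts,   // 1 x N, CV_32FC2
                             cv::cuda::GpuMat& nextPts, cv::cuda::GpuMat& status)
{
    const int maxLevel = 3;
    cv::Ptr<cv::cuda::SparsePyrLKOpticalFlow> lk =
        cv::cuda::SparsePyrLKOpticalFlow::create(cv::Size(21, 21), maxLevel);

    std::vector<cv::cuda::GpuMat> prevPyr, nextPyr;
    buildPyramid(prevImg, prevPyr, maxLevel);
    buildPyramid(nextImg, nextPyr, maxLevel);

    // Vectors of GpuMat take the new pyramid path in calc(); on the next frame the
    // caller can hand nextPyr back in as prevPyr instead of rebuilding it.
    lk->calc(prevPyr, nextPyr, prevPts, nextPts, status);
}
```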
@@ -116,10 +116,10 @@ PERF_TEST_P(ImagePair_Gray_NPts_WinSz_Levels_Iters, PyrLKOpticalFlowSparse,
const int levels = GET_PARAM(4);
const int iters = GET_PARAM(5);
cv::Mat frame0 = readImage(imagePair.first, useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
ASSERT_FALSE(frame0.empty());
cv::Mat frame1 = readImage(imagePair.second, useGray ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
ASSERT_FALSE(frame1.empty());
cv::Mat gray_frame;
@@ -131,6 +131,14 @@ PERF_TEST_P(ImagePair_Gray_NPts_WinSz_Levels_Iters, PyrLKOpticalFlowSparse,
cv::Mat pts;
cv::goodFeaturesToTrack(gray_frame, pts, points, 0.01, 0.0);
frame0.convertTo(frame0, CV_32F);
frame1.convertTo(frame1, CV_32F);
if(!useGray)
{
cv::cvtColor(frame0, frame0, cv::COLOR_BGR2BGRA);
cv::cvtColor(frame1, frame1, cv::COLOR_BGR2BGRA);
}
if (PERF_RUN_CUDA())
{
const cv::cuda::GpuMat d_pts(pts.reshape(2, 1));
@@ -318,4 +326,4 @@ PERF_TEST_P(ImagePair, OpticalFlowDual_TVL1,
CPU_SANITY_CHECK(flow);
}
}
\ No newline at end of file
@@ -48,6 +48,8 @@
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/filters.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"
using namespace cv::cuda;
using namespace cv::cuda::device;
@@ -60,53 +62,240 @@ namespace pyrlk
__constant__ int c_halfWin_y;
__constant__ int c_iters;
texture<uchar, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_I8U(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_I8UC4(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<ushort4, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_I16UC4(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_If(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_If4(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_Ib(false, cudaFilterModePoint, cudaAddressModeClamp);
texture<uchar, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_J8U(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_J8UC4(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<ushort4, cudaTextureType2D, cudaReadModeNormalizedFloat> tex_J16UC4(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<float, cudaTextureType2D, cudaReadModeElementType> tex_Jf(false, cudaFilterModeLinear, cudaAddressModeClamp);
texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_Jf4(false, cudaFilterModeLinear, cudaAddressModeClamp);
template <int cn, typename T> struct Tex_I
{
static __host__ __forceinline__ void bindTexture_(PtrStepSz<typename TypeVec<T, cn>::vec_type> I)
{
(void)I;
}
};
template <> struct Tex_I<1, uchar>
{
static __device__ __forceinline__ float read(float x, float y)
{
return tex2D(tex_I8U, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar>& I)
{
bindTexture(&tex_I8U, I);
}
};
template <> struct Tex_I<1, ushort>
{
static __device__ __forceinline__ float read(float x, float y)
{
return 0.0;
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<ushort>& I)
{
(void)I;
}
};
template <> struct Tex_I<1, int>
{
static __device__ __forceinline__ float read(float x, float y)
{
return 0.0;
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<int>& I)
{
(void)I;
}
};
template <> struct Tex_I<1, float>
{
static __device__ __forceinline__ float read(float x, float y)
{
return tex2D(tex_If, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<float>& I)
{
bindTexture(&tex_If, I);
}
};
// ****************** 3 channel specializations ************************
template <> struct Tex_I<3, uchar>
{
static __device__ __forceinline__ float3 read(float x, float y)
{
return make_float3(0,0,0);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar3> I)
{
(void)I;
}
};
template <> struct Tex_I<3, ushort>
{
static __device__ __forceinline__ float3 read(float x, float y)
{
return make_float3(0, 0, 0);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<ushort3> I)
{
(void)I;
}
};
template <> struct Tex_I<3, int>
{
static __device__ __forceinline__ float3 read(float x, float y)
{
return make_float3(0, 0, 0);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<int3> I)
{
(void)I;
}
};
template <> struct Tex_I<3, float>
{
static __device__ __forceinline__ float3 read(float x, float y)
{
return make_float3(0, 0, 0);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<float3> I)
{
(void)I;
}
};
// ****************** 4 channel specializations ************************
template <> struct Tex_I<4, uchar>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_I8UC4, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar4>& I)
{
bindTexture(&tex_I8UC4, I);
}
};
template <> struct Tex_I<4, ushort>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_I16UC4, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<ushort4>& I)
{
bindTexture(&tex_I16UC4, I);
}
};
template <> struct Tex_I<4, float>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_If4, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<float4>& I)
{
bindTexture(&tex_If4, I);
}
};
// ************* J ***************
template <int cn, typename T> struct Tex_J
{
static __host__ __forceinline__ void bindTexture_(PtrStepSz<typename TypeVec<T,cn>::vec_type>& J)
{
(void)J;
}
};
template <> struct Tex_J<1, uchar>
{
static __device__ __forceinline__ float read(float x, float y)
{
return tex2D(tex_J8U, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar>& J)
{
bindTexture(&tex_J8U, J);
}
};
template <> struct Tex_J<1, float>
{
static __device__ __forceinline__ float read(float x, float y)
{
return tex2D(tex_Jf, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<float>& J)
{
bindTexture(&tex_Jf, J);
}
};
// ************* 4 channel specializations ***************
template <> struct Tex_J<4, uchar>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_J8UC4, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<uchar4>& J)
{
bindTexture(&tex_J8UC4, J);
}
};
template <> struct Tex_J<4, ushort>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_J16UC4, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<ushort4>& J)
{
bindTexture(&tex_J16UC4, J);
}
};
template <> struct Tex_J<4, float>
{
static __device__ __forceinline__ float4 read(float x, float y)
{
return tex2D(tex_Jf4, x, y);
}
static __host__ __forceinline__ void bindTexture_(PtrStepSz<float4>& J)
{
bindTexture(&tex_Jf4, J);
}
};
__device__ __forceinline__ void accum(float& dst, const float& val)
{
dst += val;
}
__device__ __forceinline__ void accum(float& dst, const float2& val)
{
dst += val.x + val.y;
}
__device__ __forceinline__ void accum(float& dst, const float3& val)
{
dst += val.x + val.y + val.z;
}
__device__ __forceinline__ void accum(float& dst, const float4& val)
{
dst += val.x + val.y + val.z + val.w;
}
__device__ __forceinline__ float abs_(float a)
{
@@ -116,8 +305,46 @@ namespace pyrlk
{
return abs(a);
}
__device__ __forceinline__ float2 abs_(const float2& a)
{
return abs(a);
}
__device__ __forceinline__ float3 abs_(const float3& a)
{
return abs(a);
}
template<typename T> __device__ __forceinline__ typename TypeVec<float, 1>::vec_type ToFloat(const typename TypeVec<T, 1>::vec_type& other)
{
return other;
}
template<typename T> __device__ __forceinline__ typename TypeVec<float, 2>::vec_type ToFloat(const typename TypeVec<T, 2>::vec_type& other)
{
typename TypeVec<float, 2>::vec_type ret;
ret.x = other.x;
ret.y = other.y;
return ret;
}
template<typename T> __device__ __forceinline__ typename TypeVec<float, 3>::vec_type ToFloat(const typename TypeVec<T, 3>::vec_type& other)
{
typename TypeVec<float, 3>::vec_type ret;
ret.x = other.x;
ret.y = other.y;
ret.z = other.z;
return ret;
}
template<typename T> __device__ __forceinline__ typename TypeVec<float, 4>::vec_type ToFloat(const typename TypeVec<T, 4>::vec_type& other)
{
typename TypeVec<float, 4>::vec_type ret;
ret.x = other.x;
ret.y = other.y;
ret.z = other.z;
ret.w = other.w;
return ret;
}
template <int cn, int PATCH_X, int PATCH_Y, bool calcErr, typename T>
__global__ void sparseKernel(const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols)
{
#if __CUDA_ARCH__ <= 110
@@ -166,15 +393,15 @@ namespace pyrlk
float x = prevPt.x + xBase + 0.5f;
float y = prevPt.y + yBase + 0.5f;
I_patch[i][j] = Tex_I<cn, T>::read(x, y);
// Sharr Deriv
work_type dIdx = 3.0f * Tex_I<cn,T>::read(x+1, y-1) + 10.0f * Tex_I<cn, T>::read(x+1, y) + 3.0f * Tex_I<cn,T>::read(x+1, y+1) -
(3.0f * Tex_I<cn,T>::read(x-1, y-1) + 10.0f * Tex_I<cn, T>::read(x-1, y) + 3.0f * Tex_I<cn,T>::read(x-1, y+1));
work_type dIdy = 3.0f * Tex_I<cn,T>::read(x-1, y+1) + 10.0f * Tex_I<cn, T>::read(x, y+1) + 3.0f * Tex_I<cn,T>::read(x+1, y+1) -
(3.0f * Tex_I<cn,T>::read(x-1, y-1) + 10.0f * Tex_I<cn, T>::read(x, y-1) + 3.0f * Tex_I<cn,T>::read(x+1, y-1));
dIdx_patch[i][j] = dIdx;
dIdy_patch[i][j] = dIdy;
@@ -243,7 +470,7 @@ namespace pyrlk
for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
{
work_type I_val = I_patch[i][j];
work_type J_val = Tex_J<cn, T>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);
work_type diff = (J_val - I_val) * 32.0f;
@@ -286,7 +513,7 @@ namespace pyrlk
for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
{
work_type I_val = I_patch[i][j];
work_type J_val = Tex_J<cn, T>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);
work_type diff = J_val - I_val;
@@ -309,22 +536,352 @@ namespace pyrlk
}
}
// Kernel, uses non texture fetches
template <int PATCH_X, int PATCH_Y, bool calcErr, int cn, typename T, typename Ptr2D>
__global__ void sparseKernel_(Ptr2D I, Ptr2D J, const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols)
{
#if __CUDA_ARCH__ <= 110
const int BLOCK_SIZE = 128;
#else
const int BLOCK_SIZE = 256;
#endif
__shared__ float smem1[BLOCK_SIZE];
__shared__ float smem2[BLOCK_SIZE];
__shared__ float smem3[BLOCK_SIZE];
const unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
float2 prevPt = prevPts[blockIdx.x];
prevPt.x *= (1.0f / (1 << level));
prevPt.y *= (1.0f / (1 << level));
if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows)
{
if (tid == 0 && level == 0)
status[blockIdx.x] = 0;
return;
}
prevPt.x -= c_halfWin_x;
prevPt.y -= c_halfWin_y;
// extract the patch from the first image, compute covariation matrix of derivatives
float A11 = 0;
float A12 = 0;
float A22 = 0;
typedef typename TypeVec<float, cn>::vec_type work_type;
work_type I_patch[PATCH_Y][PATCH_X];
work_type dIdx_patch[PATCH_Y][PATCH_X];
work_type dIdy_patch[PATCH_Y][PATCH_X];
for (int yBase = threadIdx.y, i = 0; yBase < c_winSize_y; yBase += blockDim.y, ++i)
{
for (int xBase = threadIdx.x, j = 0; xBase < c_winSize_x; xBase += blockDim.x, ++j)
{
float x = prevPt.x + xBase + 0.5f;
float y = prevPt.y + yBase + 0.5f;
I_patch[i][j] = ToFloat<T>(I(y, x));
// Sharr Deriv
work_type dIdx = 3.0f * I(y - 1, x + 1) + 10.0f * I(y, x + 1) + 3.0f * I(y + 1, x + 1) -
(3.0f * I(y - 1, x - 1) + 10.0f * I(y, x - 1) + 3.0f * I(y + 1 , x - 1));
work_type dIdy = 3.0f * I(y + 1, x - 1) + 10.0f * I(y + 1, x) + 3.0f * I(y+1, x + 1) -
(3.0f * I(y - 1, x - 1) + 10.0f * I(y-1, x) + 3.0f * I(y - 1, x + 1));
dIdx_patch[i][j] = dIdx;
dIdy_patch[i][j] = dIdy;
accum(A11, dIdx * dIdx);
accum(A12, dIdx * dIdy);
accum(A22, dIdy * dIdy);
}
}
reduce<BLOCK_SIZE>(smem_tuple(smem1, smem2, smem3), thrust::tie(A11, A12, A22), tid, thrust::make_tuple(plus<float>(), plus<float>(), plus<float>()));
#if __CUDA_ARCH__ >= 300
if (tid == 0)
{
smem1[0] = A11;
smem2[0] = A12;
smem3[0] = A22;
}
#endif
__syncthreads();
A11 = smem1[0];
A12 = smem2[0];
A22 = smem3[0];
float D = A11 * A22 - A12 * A12;
if (D < numeric_limits<float>::epsilon())
{
if (tid == 0 && level == 0)
status[blockIdx.x] = 0;
return;
}
D = 1.f / D;
A11 *= D;
A12 *= D;
A22 *= D;
float2 nextPt = nextPts[blockIdx.x];
nextPt.x *= 2.f;
nextPt.y *= 2.f;
nextPt.x -= c_halfWin_x;
nextPt.y -= c_halfWin_y;
for (int k = 0; k < c_iters; ++k)
{
if (nextPt.x < -c_halfWin_x || nextPt.x >= cols || nextPt.y < -c_halfWin_y || nextPt.y >= rows)
{
if (tid == 0 && level == 0)
status[blockIdx.x] = 0;
return;
}
float b1 = 0;
float b2 = 0;
for (int y = threadIdx.y, i = 0; y < c_winSize_y; y += blockDim.y, ++i)
{
for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
{
work_type I_val = I_patch[i][j];
work_type J_val = ToFloat<T>(J(nextPt.y + y + 0.5f, nextPt.x + x + 0.5f));
work_type diff = (J_val - I_val) * 32.0f;
accum(b1, diff * dIdx_patch[i][j]);
accum(b2, diff * dIdy_patch[i][j]);
}
}
reduce<BLOCK_SIZE>(smem_tuple(smem1, smem2), thrust::tie(b1, b2), tid, thrust::make_tuple(plus<float>(), plus<float>()));
#if __CUDA_ARCH__ >= 300
if (tid == 0)
{
smem1[0] = b1;
smem2[0] = b2;
}
#endif
__syncthreads();
b1 = smem1[0];
b2 = smem2[0];
float2 delta;
delta.x = A12 * b2 - A22 * b1;
delta.y = A12 * b1 - A11 * b2;
nextPt.x += delta.x;
nextPt.y += delta.y;
if (::fabs(delta.x) < 0.01f && ::fabs(delta.y) < 0.01f)
break;
}
float errval = 0;
if (calcErr)
{
for (int y = threadIdx.y, i = 0; y < c_winSize_y; y += blockDim.y, ++i)
{
for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
{
work_type I_val = I_patch[i][j];
work_type J_val = ToFloat<T>(J(nextPt.y + y + 0.5f, nextPt.x + x + 0.5f));
work_type diff = J_val - I_val;
accum(errval, abs_(diff));
}
}
reduce<BLOCK_SIZE>(smem1, errval, tid, plus<float>());
}
if (tid == 0)
{
nextPt.x += c_halfWin_x;
nextPt.y += c_halfWin_y;
nextPts[blockIdx.x] = nextPt;
if (calcErr)
err[blockIdx.x] = static_cast<float>(errval) / (3 * c_winSize_x * c_winSize_y);
}
} // __global__ void sparseKernel_
template <int cn, int PATCH_X, int PATCH_Y, typename T> class sparse_caller
{
public:
static void call(PtrStepSz<typename TypeVec<T, cn>::vec_type> I, PtrStepSz<typename TypeVec<T, cn>::vec_type> J, int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, cudaStream_t stream)
{
dim3 grid(ptcount);
(void)I;
(void)J;
if (level == 0 && err)
sparseKernel<cn, PATCH_X, PATCH_Y, true, T> <<<grid, block, 0, stream >>>(prevPts, nextPts, status, err, level, rows, cols);
else
sparseKernel<cn, PATCH_X, PATCH_Y, false, T> <<<grid, block, 0, stream >>>(prevPts, nextPts, status, err, level, rows, cols);
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
};
// Specialization to use non texture path because for some reason the texture path keeps failing accuracy tests
template<int PATCH_X, int PATCH_Y> class sparse_caller<1, PATCH_X, PATCH_Y, unsigned short>
{
public:
typedef typename TypeVec<unsigned short, 1>::vec_type work_type;
typedef PtrStepSz<work_type> Ptr2D;
typedef BrdConstant<work_type> BrdType;
typedef BorderReader<Ptr2D, BrdType> Reader;
typedef LinearFilter<Reader> Filter;
static void call(Ptr2D I, Ptr2D J, int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, cudaStream_t stream)
{
dim3 grid(ptcount);
if (level == 0 && err)
{
sparseKernel_<PATCH_X, PATCH_Y, true, 1, unsigned short> <<<grid, block, 0, stream >>>(
Filter(Reader(I, BrdType(rows, cols))),
Filter(Reader(J, BrdType(rows, cols))),
prevPts, nextPts, status, err, level, rows, cols);
}
else
{
sparseKernel_<PATCH_X, PATCH_Y, false, 1, unsigned short> <<<grid, block, 0, stream >>>(
Filter(Reader(I, BrdType(rows, cols))),
Filter(Reader(J, BrdType(rows, cols))),
prevPts, nextPts, status, err, level, rows, cols);
}
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
};
// Specialization for int because the texture path keeps failing
template<int PATCH_X, int PATCH_Y> class sparse_caller<1, PATCH_X, PATCH_Y, int>
{
public:
typedef typename TypeVec<int, 1>::vec_type work_type;
typedef PtrStepSz<work_type> Ptr2D;
typedef BrdConstant<work_type> BrdType;
typedef BorderReader<Ptr2D, BrdType> Reader;
typedef LinearFilter<Reader> Filter;
static void call(Ptr2D I, Ptr2D J, int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, cudaStream_t stream)
{
dim3 grid(ptcount);
if (level == 0 && err)
{
sparseKernel_<PATCH_X, PATCH_Y, true, 1, int> <<<grid, block, 0, stream >>>(
Filter(Reader(I, BrdType(rows, cols))),
Filter(Reader(J, BrdType(rows, cols))),
prevPts, nextPts, status, err, level, rows, cols);
}
else
{
sparseKernel_<PATCH_X, PATCH_Y, false, 1, int> <<<grid, block, 0, stream >>>(
Filter(Reader(I, BrdType(rows, cols))),
Filter(Reader(J, BrdType(rows, cols))),
prevPts, nextPts, status, err, level, rows, cols);
}
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
};
template<int PATCH_X, int PATCH_Y> class sparse_caller<4, PATCH_X, PATCH_Y, int>
{
public:
typedef typename TypeVec<int, 4>::vec_type work_type;
typedef PtrStepSz<work_type> Ptr2D;
typedef BrdConstant<work_type> BrdType;
typedef BorderReader<Ptr2D, BrdType> Reader;
typedef LinearFilter<Reader> Filter;
static void call(Ptr2D I, Ptr2D J, int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, cudaStream_t stream)
{
dim3 grid(ptcount);
if (level == 0 && err)
{
sparseKernel_<PATCH_X, PATCH_Y, true, 4, int> <<<grid, block, 0, stream >>>(
Filter(Reader(I, BrdType(rows, cols))),
Filter(Reader(J, BrdType(rows, cols))),
prevPts, nextPts, status, err, level, rows, cols);
}
else
{
sparseKernel_<PATCH_X, PATCH_Y, false, 4, int> <<<grid, block, 0, stream >>>(
Filter(Reader(I, BrdType(rows, cols))),
Filter(Reader(J, BrdType(rows, cols))),
prevPts, nextPts, status, err, level, rows, cols);
}
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
};
using namespace cv::cuda::device;
template <int PATCH_X, int PATCH_Y, typename T> class sparse_caller<3, PATCH_X, PATCH_Y, T>
{
public:
typedef typename TypeVec<T, 3>::vec_type work_type;
typedef PtrStepSz<work_type> Ptr2D;
typedef BrdConstant<work_type> BrdType;
typedef BorderReader<Ptr2D, BrdType> Reader;
typedef LinearFilter<Reader> Filter;
static void call(Ptr2D I, Ptr2D J, int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, cudaStream_t stream)
{
dim3 grid(ptcount);
if (level == 0 && err)
{
sparseKernel_<PATCH_X, PATCH_Y, true, 3, T> <<<grid, block, 0, stream >>>(
Filter(Reader(I, BrdType(rows, cols))),
Filter(Reader(J, BrdType(rows, cols))),
prevPts, nextPts, status, err, level, rows, cols);
}
else
{
sparseKernel_<PATCH_X, PATCH_Y, false, 3, T> <<<grid, block, 0, stream >>>(
Filter(Reader(I, BrdType(rows, cols))),
Filter(Reader(J, BrdType(rows, cols))),
prevPts, nextPts, status, err, level, rows, cols);
}
cudaSafeCall(cudaGetLastError());
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
};
template <bool calcErr>
__global__ void denseKernel(PtrStepf u, PtrStepf v, const PtrStepf prevU, const PtrStepf prevV, PtrStepf err, const int rows, const int cols)
@@ -484,77 +1041,72 @@ namespace pyrlk
cudaSafeCall( cudaMemcpyToSymbolAsync(c_iters, &iters, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );
}
template<typename T, int cn> struct pyrLK_caller
{
static void sparse(PtrStepSz<typename TypeVec<T, cn>::vec_type> I, PtrStepSz<typename TypeVec<T, cn>::vec_type> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream)
{
typedef void(*func_t)(PtrStepSz<typename TypeVec<T, cn>::vec_type> I, PtrStepSz<typename TypeVec<T, cn>::vec_type> J,
int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, cudaStream_t stream);
static const func_t funcs[5][5] =
{
{ sparse_caller<cn, 1, 1,T>::call, sparse_caller<cn, 2, 1,T>::call, sparse_caller<cn, 3, 1,T>::call, sparse_caller<cn, 4, 1,T>::call, sparse_caller<cn, 5, 1,T>::call },
{ sparse_caller<cn, 1, 2,T>::call, sparse_caller<cn, 2, 2,T>::call, sparse_caller<cn, 3, 2,T>::call, sparse_caller<cn, 4, 2,T>::call, sparse_caller<cn, 5, 2,T>::call },
{ sparse_caller<cn, 1, 3,T>::call, sparse_caller<cn, 2, 3,T>::call, sparse_caller<cn, 3, 3,T>::call, sparse_caller<cn, 4, 3,T>::call, sparse_caller<cn, 5, 3,T>::call },
{ sparse_caller<cn, 1, 4,T>::call, sparse_caller<cn, 2, 4,T>::call, sparse_caller<cn, 3, 4,T>::call, sparse_caller<cn, 4, 4,T>::call, sparse_caller<cn, 5, 4,T>::call },
{ sparse_caller<cn, 1, 5,T>::call, sparse_caller<cn, 2, 5,T>::call, sparse_caller<cn, 3, 5,T>::call, sparse_caller<cn, 4, 5,T>::call, sparse_caller<cn, 5, 5,T>::call }
};
Tex_I<cn, T>::bindTexture_(I);
Tex_J<cn, T>::bindTexture_(J);
funcs[patch.y - 1][patch.x - 1](I, J, I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
level, block, stream);
}
static void dense(PtrStepSzb I, PtrStepSz<T> J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV, PtrStepSzf err, int2 winSize, cudaStream_t stream)
{
dim3 block(16, 16);
dim3 grid(divUp(I.cols, block.x), divUp(I.rows, block.y));
Tex_I<1, uchar>::bindTexture_(I);
Tex_J<1, T>::bindTexture_(J);
int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
const int patchWidth = block.x + 2 * halfWin.x;
const int patchHeight = block.y + 2 * halfWin.y;
size_t smem_size = 3 * patchWidth * patchHeight * sizeof(int);
if (err.data)
{
denseKernel<true> << <grid, block, smem_size, stream >> >(u, v, prevU, prevV, err, I.rows, I.cols);
cudaSafeCall(cudaGetLastError());
}
else
{
denseKernel<false> << <grid, block, smem_size, stream >> >(u, v, prevU, prevV, PtrStepf(), I.rows, I.cols);
cudaSafeCall(cudaGetLastError());
}
if (stream == 0)
cudaSafeCall(cudaDeviceSynchronize());
}
};
template class pyrLK_caller<unsigned char,1>;
template class pyrLK_caller<unsigned short,1>;
template class pyrLK_caller<int,1>;
template class pyrLK_caller<float,1>;
template class pyrLK_caller<unsigned char, 3>;
template class pyrLK_caller<unsigned short, 3>;
template class pyrLK_caller<int, 3>;
template class pyrLK_caller<float, 3>;
template class pyrLK_caller<unsigned char, 4>;
template class pyrLK_caller<unsigned short, 4>;
template class pyrLK_caller<int, 4>;
template class pyrLK_caller<float, 4>;
}
#endif /* CUDA_DISABLER */
\ No newline at end of file
@@ -52,7 +52,7 @@
#include "opencv2/video.hpp"
#include "opencv2/core/private.cuda.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/opencv_modules.hpp"
#ifdef HAVE_OPENCV_CUDALEGACY
...
@@ -56,14 +56,20 @@ Ptr<DensePyrLKOpticalFlow> cv::cuda::DensePyrLKOpticalFlow::create(Size, int, in
namespace pyrlk
{
void loadConstants(int2 winSize, int iters, cudaStream_t stream);
template<typename T, int cn> struct pyrLK_caller
{
static void sparse(PtrStepSz<typename device::TypeVec<T, cn>::vec_type> I, PtrStepSz<typename device::TypeVec<T, cn>::vec_type> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream);
static void dense(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV,
PtrStepSzf err, int2 winSize, cudaStream_t stream);
};
template<typename T, int cn> void dispatcher(GpuMat I, GpuMat J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream)
{
pyrLK_caller<T, cn>::sparse(I, J, prevPts, nextPts, status, err, ptcount, level, block, patch, stream);
}
}
namespace
@@ -76,6 +82,9 @@ namespace
void sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts,
GpuMat& status, GpuMat* err, Stream& stream);
void sparse(std::vector<GpuMat>& prevPyr, std::vector<GpuMat>& nextPyr, const GpuMat& prevPts, GpuMat& nextPts,
GpuMat& status, GpuMat* err, Stream& stream);
void dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, Stream& stream);
protected:
@@ -83,8 +92,9 @@ namespace
int maxLevel_;
int iters_;
bool useInitialFlow_;
void buildImagePyramid(const GpuMat& prevImg, std::vector<GpuMat>& prevPyr, const GpuMat& nextImg, std::vector<GpuMat>& nextPyr, Stream stream);
private:
friend class SparsePyrLKOpticalFlowImpl;
std::vector<GpuMat> prevPyr_;
std::vector<GpuMat> nextPyr_;
};
@@ -113,28 +123,34 @@ namespace
block.z = patch.z = 1;
}
void PyrLKOpticalFlowBase::buildImagePyramid(const GpuMat& prevImg, std::vector<GpuMat>& prevPyr, const GpuMat& nextImg, std::vector<GpuMat>& nextPyr, Stream stream)
{
prevPyr.resize(maxLevel_ + 1);
nextPyr.resize(maxLevel_ + 1);
int cn = prevImg.channels();
CV_Assert(cn == 1 || cn == 3 || cn == 4);
prevPyr[0] = prevImg;
nextPyr[0] = nextImg;
for (int level = 1; level <= maxLevel_; ++level)
{
cuda::pyrDown(prevPyr[level - 1], prevPyr[level], stream);
cuda::pyrDown(nextPyr[level - 1], nextPyr[level], stream);
}
}
void PyrLKOpticalFlowBase::sparse(std::vector<GpuMat>& prevPyr, std::vector<GpuMat>& nextPyr, const GpuMat& prevPts, GpuMat& nextPts,
GpuMat& status, GpuMat* err, Stream& stream)
{
CV_Assert(prevPyr.size() && nextPyr.size() && "Pyramid needs to at least contain the original matrix as the first element");
CV_Assert(prevPyr[0].size() == nextPyr[0].size());
CV_Assert(prevPts.rows == 1 && prevPts.type() == CV_32FC2);
CV_Assert(maxLevel_ >= 0);
CV_Assert(winSize_.width > 2 && winSize_.height > 2);
if (useInitialFlow_)
CV_Assert(nextPts.size() == prevPts.size() && nextPts.type() == prevPts.type());
else
ensureSizeIsEnough(1, prevPts.cols, prevPts.type(), nextPts);
@@ -142,66 +158,70 @@ namespace
GpuMat temp2 = nextPts.reshape(1);
cuda::multiply(temp1, Scalar::all(1.0 / (1 << maxLevel_) / 2.0), temp2, 1, -1, stream);
ensureSizeIsEnough(1, prevPts.cols, CV_8UC1, status);
status.setTo(Scalar::all(1), stream);
if (err)
ensureSizeIsEnough(1, prevPts.cols, CV_32FC1, *err);
if (prevPyr.size() != size_t(maxLevel_ + 1) || nextPyr.size() != size_t(maxLevel_ + 1))
{
buildImagePyramid(prevPyr[0], prevPyr, nextPyr[0], nextPyr, stream);
}
dim3 block, patch;
calcPatchSize(winSize_, block, patch);
CV_Assert(patch.x > 0 && patch.x < 6 && patch.y > 0 && patch.y < 6);
pyrlk::loadConstants(make_int2(winSize_.width, winSize_.height), iters_, StreamAccessor::getStream(stream));
const int cn = prevPyr[0].channels();
const int type = prevPyr[0].depth();
typedef void(*func_t)(GpuMat I, GpuMat J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
int level, dim3 block, dim3 patch, cudaStream_t stream);
// Current int datatype is disabled due to pyrDown not implementing it
// while ushort does work, it has significantly worse performance, and thus doesn't pass accuracy tests.
static const func_t funcs[6][4] =
{
{ pyrlk::dispatcher<uchar, 1> , /*pyrlk::dispatcher<uchar, 2>*/ 0, pyrlk::dispatcher<uchar, 3> , pyrlk::dispatcher<uchar, 4> },
{ /*pyrlk::dispatcher<char, 1>*/ 0, /*pyrlk::dispatcher<char, 2>*/ 0, /*pyrlk::dispatcher<char, 3>*/ 0, /*pyrlk::dispatcher<char, 4>*/ 0 },
{ pyrlk::dispatcher<ushort, 1> , /*pyrlk::dispatcher<ushort, 2>*/0, pyrlk::dispatcher<ushort, 3> , pyrlk::dispatcher<ushort, 4> },
{ /*pyrlk::dispatcher<short, 1>*/ 0, /*pyrlk::dispatcher<short, 2>*/ 0, /*pyrlk::dispatcher<short, 3>*/ 0, /*pyrlk::dispatcher<short, 4>*/0 },
{ pyrlk::dispatcher<int, 1> , /*pyrlk::dispatcher<int, 2>*/ 0, pyrlk::dispatcher<int, 3> , pyrlk::dispatcher<int, 4> },
{ pyrlk::dispatcher<float, 1> , /*pyrlk::dispatcher<float, 2>*/ 0, pyrlk::dispatcher<float, 3> , pyrlk::dispatcher<float, 4> }
};
func_t func = funcs[type][cn-1];
CV_Assert(func != NULL && "Datatype not implemented");
for (int level = maxLevel_; level >= 0; level--)
{
func(prevPyr[level], nextPyr[level],
prevPts.ptr<float2>(), nextPts.ptr<float2>(),
status.ptr(), level == 0 && err ? err->ptr<float>() : 0,
prevPts.cols, level, block, patch,
StreamAccessor::getStream(stream));
}
}
void PyrLKOpticalFlowBase::sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts, GpuMat& status, GpuMat* err, Stream& stream)
{
if (prevPts.empty())
{
nextPts.release();
status.release();
if (err) err->release();
return;
}
CV_Assert( prevImg.channels() == 1 || prevImg.channels() == 3 || prevImg.channels() == 4 );
CV_Assert( prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type() );
// build the image pyramids.
buildImagePyramid(prevImg, prevPyr_, nextImg, nextPyr_, stream);
sparse(prevPyr_, nextPyr_, prevPts, nextPts, status, err, stream);
}
void PyrLKOpticalFlowBase::dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, Stream& stream)
@@ -250,7 +270,7 @@ namespace
{
int idx2 = (idx + 1) & 1;
pyrlk::pyrLK_caller<float,1>::dense(prevPyr_[level], nextPyr_[level],
uPyr[idx], vPyr[idx], uPyr[idx2], vPyr[idx2],
PtrStepSzf(), winSize2i,
StreamAccessor::getStream(stream));
@@ -289,14 +309,23 @@ namespace
OutputArray _err,
Stream& stream)
{
const GpuMat prevPts = _prevPts.getGpuMat();
GpuMat& nextPts = _nextPts.getGpuMatRef();
GpuMat& status = _status.getGpuMatRef();
GpuMat* err = _err.needed() ? &(_err.getGpuMatRef()) : NULL;
if (_prevImg.kind() == _InputArray::STD_VECTOR_CUDA_GPU_MAT && _nextImg.kind() == _InputArray::STD_VECTOR_CUDA_GPU_MAT)
{
std::vector<GpuMat> prevPyr, nextPyr;
_prevImg.getGpuMatVector(prevPyr);
_nextImg.getGpuMatVector(nextPyr);
sparse(prevPyr, nextPyr, prevPts, nextPts, status, err, stream);
}
else
{
const GpuMat prevImg = _prevImg.getGpuMat();
const GpuMat nextImg = _nextImg.getGpuMat();
sparse(prevImg, nextImg, prevPts, nextPts, status, err, stream);
}
}
};
@@ -347,4 +376,4 @@ Ptr<DensePyrLKOpticalFlow> cv::cuda::DensePyrLKOpticalFlow::create(Size winSize,
return makePtr<DensePyrLKOpticalFlowImpl>(winSize, maxLevel, iters, useInitialFlow);
}
#endif /* !defined (HAVE_CUDA) */
\ No newline at end of file
@@ -167,33 +167,34 @@ INSTANTIATE_TEST_CASE_P(CUDA_OptFlow, BroxOpticalFlow, ALL_DEVICES);
namespace
{
IMPLEMENT_PARAM_CLASS(Chan, int)
IMPLEMENT_PARAM_CLASS(DataType, int)
}
PARAM_TEST_CASE(PyrLKOpticalFlow, cv::cuda::DeviceInfo, Chan, DataType)
{
cv::cuda::DeviceInfo devInfo;
int channels;
int dataType;
virtual void SetUp()
{
devInfo = GET_PARAM(0);
channels = GET_PARAM(1);
dataType = GET_PARAM(2);
cv::cuda::setDevice(devInfo.deviceID());
}
};
CUDA_TEST_P(PyrLKOpticalFlow, Sparse)
{
cv::Mat frame0 = readImage("opticalflow/frame0.png", channels == 1 ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
ASSERT_FALSE(frame0.empty());
cv::Mat frame1 = readImage("opticalflow/frame1.png", channels == 1 ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR);
ASSERT_FALSE(frame1.empty());
cv::Mat gray_frame;
if (channels == 1)
gray_frame = frame0;
else
cv::cvtColor(frame0, gray_frame, cv::COLOR_BGR2GRAY);
@@ -208,22 +209,32 @@ CUDA_TEST_P(PyrLKOpticalFlow, Sparse)
cv::Ptr<cv::cuda::SparsePyrLKOpticalFlow> pyrLK =
cv::cuda::SparsePyrLKOpticalFlow::create();
std::vector<cv::Point2f> nextPts_gold;
std::vector<unsigned char> status_gold;
cv::calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts_gold, status_gold, cv::noArray());
cv::cuda::GpuMat d_nextPts;
cv::cuda::GpuMat d_status;
cv::Mat converted0, converted1;
if(channels == 4)
{
cv::cvtColor(frame0, frame0, cv::COLOR_BGR2BGRA);
cv::cvtColor(frame1, frame1, cv::COLOR_BGR2BGRA);
}
frame0.convertTo(converted0, dataType);
frame1.convertTo(converted1, dataType);
pyrLK->calc(loadMat(converted0), loadMat(converted1), d_pts, d_nextPts, d_status);
std::vector<cv::Point2f> nextPts(d_nextPts.cols);
cv::Mat nextPts_mat(1, d_nextPts.cols, CV_32FC2, (void*)&nextPts[0]);
d_nextPts.download(nextPts_mat);
std::vector<unsigned char> status(d_status.cols);
cv::Mat status_mat(1, d_status.cols, CV_8UC1, (void*)&status[0]);
d_status.download(status_mat);
ASSERT_EQ(nextPts_gold.size(), nextPts.size());
ASSERT_EQ(status_gold.size(), status.size());
@@ -251,11 +262,16 @@ CUDA_TEST_P(PyrLKOpticalFlow, Sparse)
double bad_ratio = static_cast<double>(mistmatch) / nextPts.size();
ASSERT_LE(bad_ratio, 0.01);
}
INSTANTIATE_TEST_CASE_P(CUDA_OptFlow, PyrLKOpticalFlow, testing::Combine(
ALL_DEVICES,
testing::Values(Chan(1), Chan(3), Chan(4)),
testing::Values(DataType(CV_8U), DataType(CV_16U), DataType(CV_32S), DataType(CV_32F))));
//////////////////////////////////////////////////////
// FarnebackOpticalFlow
@@ -385,4 +401,4 @@ INSTANTIATE_TEST_CASE_P(CUDA_OptFlow, OpticalFlowDual_TVL1, testing::Combine(
ALL_DEVICES,
testing::Values(Gamma(0.0), Gamma(1.0))));
#endif // HAVE_CUDA
\ No newline at end of file
@@ -212,10 +212,10 @@ namespace cv { namespace cuda { namespace device
template void pyrDown_gpu<short3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<short4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<int2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<int3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<int4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
template void pyrDown_gpu<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
//template void pyrDown_gpu<float2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
@@ -225,4 +225,4 @@ namespace cv { namespace cuda { namespace device
}}} // namespace cv { namespace cuda { namespace cudev
#endif /* CUDA_DISABLER */
\ No newline at end of file
@@ -74,7 +74,7 @@ void cv::cuda::pyrDown(InputArray _src, OutputArray _dst, Stream& stream)
{0 /*pyrDown_gpu<schar>*/, 0 /*pyrDown_gpu<schar2>*/ , 0 /*pyrDown_gpu<schar3>*/, 0 /*pyrDown_gpu<schar4>*/},
{pyrDown_gpu<ushort> , 0 /*pyrDown_gpu<ushort2>*/, pyrDown_gpu<ushort3> , pyrDown_gpu<ushort4> },
{pyrDown_gpu<short> , 0 /*pyrDown_gpu<short2>*/ , pyrDown_gpu<short3> , pyrDown_gpu<short4> },
{pyrDown_gpu<int> , 0 /*pyrDown_gpu<int2>*/ , pyrDown_gpu<int3> , pyrDown_gpu<int4> },
{pyrDown_gpu<float> , 0 /*pyrDown_gpu<float2>*/ , pyrDown_gpu<float3> , pyrDown_gpu<float4> }
};
@@ -131,4 +131,4 @@ void cv::cuda::pyrUp(InputArray _src, OutputArray _dst, Stream& stream)
func(src, dst, StreamAccessor::getStream(stream));
}
#endif
\ No newline at end of file