Commit 1f1e24be authored Nov 12, 2012 by Vladislav Vinogradov
PyrLKOpticalFlow
parent 1b571bde
Showing 2 changed files with 116 additions and 239 deletions

    modules/gpu/src/cuda/pyrlk.cu   +106  -222
    modules/gpu/src/pyrlk.cpp        +10   -17
modules/gpu/src/cuda/pyrlk.cu
@@ -52,167 +52,19 @@
 #include "opencv2/gpu/device/functional.hpp"
 #include "opencv2/gpu/device/limits.hpp"
 #include "opencv2/gpu/device/vec_math.hpp"
+#include "opencv2/gpu/device/reduce.hpp"
 
-namespace cv { namespace gpu { namespace device
+using namespace cv::gpu;
+using namespace cv::gpu::device;
+
+namespace
 {
-    namespace pyrlk
-    {
     __constant__ int c_winSize_x;
     __constant__ int c_winSize_y;
     __constant__ int c_halfWin_x;
     __constant__ int c_halfWin_y;
     __constant__ int c_iters;
 
-    void loadConstants(int2 winSize, int iters)
-    {
-        cudaSafeCall( cudaMemcpyToSymbol(c_winSize_x, &winSize.x, sizeof(int)) );
-        cudaSafeCall( cudaMemcpyToSymbol(c_winSize_y, &winSize.y, sizeof(int)) );
-
-        int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
-        cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_x, &halfWin.x, sizeof(int)) );
-        cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_y, &halfWin.y, sizeof(int)) );
-
-        cudaSafeCall( cudaMemcpyToSymbol(c_iters, &iters, sizeof(int)) );
-    }
-
-    __device__ void reduce(float& val1, float& val2, float& val3, float* smem1, float* smem2, float* smem3, int tid)
-    {
-        smem1[tid] = val1;
-        smem2[tid] = val2;
-        smem3[tid] = val3;
-        __syncthreads();
-
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 110)
-        if (tid < 128)
-        {
-            smem1[tid] = val1 += smem1[tid + 128];
-            smem2[tid] = val2 += smem2[tid + 128];
-            smem3[tid] = val3 += smem3[tid + 128];
-        }
-        __syncthreads();
-#endif
-
-        if (tid < 64)
-        {
-            smem1[tid] = val1 += smem1[tid + 64];
-            smem2[tid] = val2 += smem2[tid + 64];
-            smem3[tid] = val3 += smem3[tid + 64];
-        }
-        __syncthreads();
-
-        if (tid < 32)
-        {
-            volatile float* vmem1 = smem1;
-            volatile float* vmem2 = smem2;
-            volatile float* vmem3 = smem3;
-
-            vmem1[tid] = val1 += vmem1[tid + 32];
-            vmem2[tid] = val2 += vmem2[tid + 32];
-            vmem3[tid] = val3 += vmem3[tid + 32];
-
-            vmem1[tid] = val1 += vmem1[tid + 16];
-            vmem2[tid] = val2 += vmem2[tid + 16];
-            vmem3[tid] = val3 += vmem3[tid + 16];
-
-            vmem1[tid] = val1 += vmem1[tid + 8];
-            vmem2[tid] = val2 += vmem2[tid + 8];
-            vmem3[tid] = val3 += vmem3[tid + 8];
-
-            vmem1[tid] = val1 += vmem1[tid + 4];
-            vmem2[tid] = val2 += vmem2[tid + 4];
-            vmem3[tid] = val3 += vmem3[tid + 4];
-
-            vmem1[tid] = val1 += vmem1[tid + 2];
-            vmem2[tid] = val2 += vmem2[tid + 2];
-            vmem3[tid] = val3 += vmem3[tid + 2];
-
-            vmem1[tid] = val1 += vmem1[tid + 1];
-            vmem2[tid] = val2 += vmem2[tid + 1];
-            vmem3[tid] = val3 += vmem3[tid + 1];
-        }
-    }
-
-    __device__ void reduce(float& val1, float& val2, float* smem1, float* smem2, int tid)
-    {
-        smem1[tid] = val1;
-        smem2[tid] = val2;
-        __syncthreads();
-
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 110)
-        if (tid < 128)
-        {
-            smem1[tid] = val1 += smem1[tid + 128];
-            smem2[tid] = val2 += smem2[tid + 128];
-        }
-        __syncthreads();
-#endif
-
-        if (tid < 64)
-        {
-            smem1[tid] = val1 += smem1[tid + 64];
-            smem2[tid] = val2 += smem2[tid + 64];
-        }
-        __syncthreads();
-
-        if (tid < 32)
-        {
-            volatile float* vmem1 = smem1;
-            volatile float* vmem2 = smem2;
-
-            vmem1[tid] = val1 += vmem1[tid + 32];
-            vmem2[tid] = val2 += vmem2[tid + 32];
-
-            vmem1[tid] = val1 += vmem1[tid + 16];
-            vmem2[tid] = val2 += vmem2[tid + 16];
-
-            vmem1[tid] = val1 += vmem1[tid + 8];
-            vmem2[tid] = val2 += vmem2[tid + 8];
-
-            vmem1[tid] = val1 += vmem1[tid + 4];
-            vmem2[tid] = val2 += vmem2[tid + 4];
-
-            vmem1[tid] = val1 += vmem1[tid + 2];
-            vmem2[tid] = val2 += vmem2[tid + 2];
-
-            vmem1[tid] = val1 += vmem1[tid + 1];
-            vmem2[tid] = val2 += vmem2[tid + 1];
-        }
-    }
-
-    __device__ void reduce(float& val1, float* smem1, int tid)
-    {
-        smem1[tid] = val1;
-        __syncthreads();
-
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ > 110)
-        if (tid < 128)
-        {
-            smem1[tid] = val1 += smem1[tid + 128];
-        }
-        __syncthreads();
-#endif
-
-        if (tid < 64)
-        {
-            smem1[tid] = val1 += smem1[tid + 64];
-        }
-        __syncthreads();
-
-        if (tid < 32)
-        {
-            volatile float* vmem1 = smem1;
-
-            vmem1[tid] = val1 += vmem1[tid + 32];
-            vmem1[tid] = val1 += vmem1[tid + 16];
-            vmem1[tid] = val1 += vmem1[tid + 8];
-            vmem1[tid] = val1 += vmem1[tid + 4];
-            vmem1[tid] = val1 += vmem1[tid + 2];
-            vmem1[tid] = val1 += vmem1[tid + 1];
-        }
-    }
-
     texture<float, cudaTextureType2D, cudaReadModeElementType> tex_If(false, cudaFilterModeLinear, cudaAddressModeClamp);
     texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_If4(false, cudaFilterModeLinear, cudaAddressModeClamp);
     texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_Ib(false, cudaFilterModePoint, cudaAddressModeClamp);
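
The three hand-rolled tree reductions deleted above are superseded by the generic block reduce from the newly included opencv2/gpu/device/reduce.hpp; loadConstants is not removed but moves into the pyrlk namespace near the end of this file. For reference, the __constant__ upload pattern that loadConstants wraps looks like this in isolation (a minimal sketch with demo-only names, not code from this commit):

#include <cstdio>
#include <cuda_runtime.h>

// Demo-only symbols mirroring the c_winSize_x / c_iters pattern above.
__constant__ int c_demo_win;
__constant__ int c_demo_iters;

__global__ void show()
{
    // Every thread reads the same cached constant-memory values.
    printf("win=%d iters=%d\n", c_demo_win, c_demo_iters);
}

int main()
{
    int win = 21, iters = 30;
    // Host-side upload; the commit wraps these calls in cudaSafeCall().
    cudaMemcpyToSymbol(c_demo_win, &win, sizeof(int));
    cudaMemcpyToSymbol(c_demo_iters, &iters, sizeof(int));
    show<<<1, 1>>>();
    cudaDeviceSynchronize();
    return 0;
}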
@@ -263,7 +115,7 @@ namespace cv { namespace gpu { namespace device
     __device__ __forceinline__ float abs_(float a)
     {
-        return ::fabs(a);
+        return ::fabsf(a);
     }
 
     __device__ __forceinline__ float4 abs_(const float4& a)
     {
@@ -271,19 +123,19 @@ namespace cv { namespace gpu { namespace device
     }
 
     template <int cn, int PATCH_X, int PATCH_Y, bool calcErr>
-    __global__ void lkSparse(const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols)
+    __global__ void sparse(const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols)
     {
-    #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ <= 110)
-        __shared__ float smem1[128];
-        __shared__ float smem2[128];
-        __shared__ float smem3[128];
-    #else
-        __shared__ float smem1[256];
-        __shared__ float smem2[256];
-        __shared__ float smem3[256];
-    #endif
+    #if __CUDA_ARCH__ <= 110
+        const int BLOCK_SIZE = 128;
+    #else
+        const int BLOCK_SIZE = 256;
+    #endif
+
+        __shared__ float smem1[BLOCK_SIZE];
+        __shared__ float smem2[BLOCK_SIZE];
+        __shared__ float smem3[BLOCK_SIZE];
 
-        const int tid = threadIdx.y * blockDim.x + threadIdx.x;
+        const unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
 
         float2 prevPt = prevPts[blockIdx.x];
         prevPt.x *= (1.0f / (1 << level));
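
The last two context lines map a tracked point into the current pyramid level: a coordinate at level `level` is the full-resolution coordinate divided by 2^level, computed as a multiply by 1.0f / (1 << level) rather than a divide. A host-side sketch of the same arithmetic, with illustrative values:

#include <cstdio>

int main()
{
    float x = 640.0f;  // full-resolution coordinate
    for (int level = 0; level <= 3; ++level)
        std::printf("level %d: x = %g\n", level, x * (1.0f / (1 << level)));
    return 0;
}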
@@ -338,7 +190,17 @@ namespace cv { namespace gpu { namespace device
             }
         }
 
-        reduce(A11, A12, A22, smem1, smem2, smem3, tid);
+        reduce<BLOCK_SIZE>(smem_tuple(smem1, smem2, smem3), thrust::tie(A11, A12, A22), tid, thrust::make_tuple(plus<float>(), plus<float>(), plus<float>()));
+
+    #if __CUDA_ARCH__ >= 300
+        if (tid == 0)
+        {
+            smem1[0] = A11;
+            smem2[0] = A12;
+            smem3[0] = A22;
+        }
+    #endif
+
         __syncthreads();
 
         A11 = smem1[0];
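
The new #if __CUDA_ARCH__ >= 300 write-back exists because the reduce utility from reduce.hpp can take a warp-shuffle path on sm_30+, and shuffles leave the final sums in thread 0's registers rather than in shared memory; thread 0 therefore stores them before the __syncthreads() broadcast that follows. A minimal sketch of why a shuffle reduce ends in a register (one warp, using the pre-CUDA-9 __shfl_down intrinsic current for this 2012 code base; demo-only names, not code from this commit):

#include <cuda_runtime.h>

// Each step pulls a partial sum from a lane `offset` positions down the
// warp; after the loop only lane 0 holds the complete sum, so it must be
// stored explicitly before other threads can see it.
__global__ void warpSumDemo(const float* in, float* out)
{
    float val = in[threadIdx.x];
    for (int offset = 16; offset > 0; offset /= 2)
        val += __shfl_down(val, offset);
    if (threadIdx.x == 0)
        *out = val;  // only lane 0 has the reduced value
}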
@@ -395,7 +257,16 @@ namespace cv { namespace gpu { namespace device
             }
         }
 
-        reduce(b1, b2, smem1, smem2, tid);
+        reduce<BLOCK_SIZE>(smem_tuple(smem1, smem2), thrust::tie(b1, b2), tid, thrust::make_tuple(plus<float>(), plus<float>()));
+
+    #if __CUDA_ARCH__ >= 300
+        if (tid == 0)
+        {
+            smem1[0] = b1;
+            smem2[0] = b2;
+        }
+    #endif
+
         __syncthreads();
 
         b1 = smem1[0];
@@ -428,7 +299,7 @@ namespace cv { namespace gpu { namespace device
             }
         }
 
-        reduce(errval, smem1, tid);
+        reduce<BLOCK_SIZE>(smem1, errval, tid, plus<float>());
     }
 
     if (tid == 0)
@@ -444,15 +315,15 @@ namespace cv { namespace gpu { namespace device
     }
 
     template <int cn, int PATCH_X, int PATCH_Y>
-    void lkSparse_caller(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
-                         int level, dim3 block, cudaStream_t stream)
+    void sparse_caller(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
+                       int level, dim3 block, cudaStream_t stream)
     {
         dim3 grid(ptcount);
 
         if (level == 0 && err)
-            lkSparse<cn, PATCH_X, PATCH_Y, true><<<grid, block>>>(prevPts, nextPts, status, err, level, rows, cols);
+            sparse<cn, PATCH_X, PATCH_Y, true><<<grid, block>>>(prevPts, nextPts, status, err, level, rows, cols);
         else
-            lkSparse<cn, PATCH_X, PATCH_Y, false><<<grid, block>>>(prevPts, nextPts, status, err, level, rows, cols);
+            sparse<cn, PATCH_X, PATCH_Y, false><<<grid, block>>>(prevPts, nextPts, status, err, level, rows, cols);
 
         cudaSafeCall( cudaGetLastError() );
@@ -460,52 +331,8 @@ namespace cv { namespace gpu { namespace device
             cudaSafeCall( cudaDeviceSynchronize() );
     }
 
-    void lkSparse1_gpu(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
-                       int level, dim3 block, dim3 patch, cudaStream_t stream)
-    {
-        typedef void (*func_t)(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
-                               int level, dim3 block, cudaStream_t stream);
-
-        static const func_t funcs[5][5] =
-        {
-            {lkSparse_caller<1, 1, 1>, lkSparse_caller<1, 2, 1>, lkSparse_caller<1, 3, 1>, lkSparse_caller<1, 4, 1>, lkSparse_caller<1, 5, 1>},
-            {lkSparse_caller<1, 1, 2>, lkSparse_caller<1, 2, 2>, lkSparse_caller<1, 3, 2>, lkSparse_caller<1, 4, 2>, lkSparse_caller<1, 5, 2>},
-            {lkSparse_caller<1, 1, 3>, lkSparse_caller<1, 2, 3>, lkSparse_caller<1, 3, 3>, lkSparse_caller<1, 4, 3>, lkSparse_caller<1, 5, 3>},
-            {lkSparse_caller<1, 1, 4>, lkSparse_caller<1, 2, 4>, lkSparse_caller<1, 3, 4>, lkSparse_caller<1, 4, 4>, lkSparse_caller<1, 5, 4>},
-            {lkSparse_caller<1, 1, 5>, lkSparse_caller<1, 2, 5>, lkSparse_caller<1, 3, 5>, lkSparse_caller<1, 4, 5>, lkSparse_caller<1, 5, 5>}
-        };
-
-        bindTexture(&tex_If, I);
-        bindTexture(&tex_Jf, J);
-
-        funcs[patch.y - 1][patch.x - 1](I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
-                                        level, block, stream);
-    }
-
-    void lkSparse4_gpu(PtrStepSz<float4> I, PtrStepSz<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
-                       int level, dim3 block, dim3 patch, cudaStream_t stream)
-    {
-        typedef void (*func_t)(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
-                               int level, dim3 block, cudaStream_t stream);
-
-        static const func_t funcs[5][5] =
-        {
-            {lkSparse_caller<4, 1, 1>, lkSparse_caller<4, 2, 1>, lkSparse_caller<4, 3, 1>, lkSparse_caller<4, 4, 1>, lkSparse_caller<4, 5, 1>},
-            {lkSparse_caller<4, 1, 2>, lkSparse_caller<4, 2, 2>, lkSparse_caller<4, 3, 2>, lkSparse_caller<4, 4, 2>, lkSparse_caller<4, 5, 2>},
-            {lkSparse_caller<4, 1, 3>, lkSparse_caller<4, 2, 3>, lkSparse_caller<4, 3, 3>, lkSparse_caller<4, 4, 3>, lkSparse_caller<4, 5, 3>},
-            {lkSparse_caller<4, 1, 4>, lkSparse_caller<4, 2, 4>, lkSparse_caller<4, 3, 4>, lkSparse_caller<4, 4, 4>, lkSparse_caller<4, 5, 4>},
-            {lkSparse_caller<4, 1, 5>, lkSparse_caller<4, 2, 5>, lkSparse_caller<4, 3, 5>, lkSparse_caller<4, 4, 5>, lkSparse_caller<4, 5, 5>}
-        };
-
-        bindTexture(&tex_If4, I);
-        bindTexture(&tex_Jf4, J);
-
-        funcs[patch.y - 1][patch.x - 1](I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
-                                        level, block, stream);
-    }
-
     template <bool calcErr>
-    __global__ void lkDense(PtrStepf u, PtrStepf v, const PtrStepf prevU, const PtrStepf prevV, PtrStepf err, const int rows, const int cols)
+    __global__ void dense(PtrStepf u, PtrStepf v, const PtrStepf prevU, const PtrStepf prevV, PtrStepf err, const int rows, const int cols)
     {
         extern __shared__ int smem[];
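
The two wrappers deleted here reappear below as pyrlk::sparse1 and pyrlk::sparse4; their funcs[5][5] table maps the runtime patch size onto one of 25 compile-time instantiations of the caller template. A condensed, compilable sketch of that dispatch pattern (illustrative names, 2x2 instead of 5x5, not code from this commit):

#include <cstdio>

template <int PATCH_X, int PATCH_Y>
void process() { std::printf("patch %dx%d\n", PATCH_X, PATCH_Y); }

typedef void (*func_t)();

void dispatch(int px, int py)
{
    // Runtime indices select a compile-time instantiation, with the same
    // funcs[patch.y - 1][patch.x - 1] indexing used in this file.
    static const func_t funcs[2][2] =
    {
        { process<1, 1>, process<2, 1> },
        { process<1, 2>, process<2, 2> }
    };
    funcs[py - 1][px - 1]();
}

int main()
{
    dispatch(2, 1);  // calls process<2, 1>
    return 0;
}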
@@ -649,9 +476,67 @@ namespace cv { namespace gpu { namespace device
                 err(y, x) = static_cast<float>(errval) / (c_winSize_x * c_winSize_y);
             }
         }
     }
+}
 
-    void lkDense_gpu(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV,
-                     PtrStepSzf err, int2 winSize, cudaStream_t stream)
+namespace pyrlk
+{
+    void loadConstants(int2 winSize, int iters)
+    {
+        cudaSafeCall( cudaMemcpyToSymbol(c_winSize_x, &winSize.x, sizeof(int)) );
+        cudaSafeCall( cudaMemcpyToSymbol(c_winSize_y, &winSize.y, sizeof(int)) );
+
+        int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
+        cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_x, &halfWin.x, sizeof(int)) );
+        cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_y, &halfWin.y, sizeof(int)) );
+
+        cudaSafeCall( cudaMemcpyToSymbol(c_iters, &iters, sizeof(int)) );
+    }
+
+    void sparse1(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
+                 int level, dim3 block, dim3 patch, cudaStream_t stream)
+    {
+        typedef void (*func_t)(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
+                               int level, dim3 block, cudaStream_t stream);
+
+        static const func_t funcs[5][5] =
+        {
+            {::sparse_caller<1, 1, 1>, ::sparse_caller<1, 2, 1>, ::sparse_caller<1, 3, 1>, ::sparse_caller<1, 4, 1>, ::sparse_caller<1, 5, 1>},
+            {::sparse_caller<1, 1, 2>, ::sparse_caller<1, 2, 2>, ::sparse_caller<1, 3, 2>, ::sparse_caller<1, 4, 2>, ::sparse_caller<1, 5, 2>},
+            {::sparse_caller<1, 1, 3>, ::sparse_caller<1, 2, 3>, ::sparse_caller<1, 3, 3>, ::sparse_caller<1, 4, 3>, ::sparse_caller<1, 5, 3>},
+            {::sparse_caller<1, 1, 4>, ::sparse_caller<1, 2, 4>, ::sparse_caller<1, 3, 4>, ::sparse_caller<1, 4, 4>, ::sparse_caller<1, 5, 4>},
+            {::sparse_caller<1, 1, 5>, ::sparse_caller<1, 2, 5>, ::sparse_caller<1, 3, 5>, ::sparse_caller<1, 4, 5>, ::sparse_caller<1, 5, 5>}
+        };
+
+        bindTexture(&tex_If, I);
+        bindTexture(&tex_Jf, J);
+
+        funcs[patch.y - 1][patch.x - 1](I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
+                                        level, block, stream);
+    }
+
+    void sparse4(PtrStepSz<float4> I, PtrStepSz<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
+                 int level, dim3 block, dim3 patch, cudaStream_t stream)
+    {
+        typedef void (*func_t)(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
+                               int level, dim3 block, cudaStream_t stream);
+
+        static const func_t funcs[5][5] =
+        {
+            {::sparse_caller<4, 1, 1>, ::sparse_caller<4, 2, 1>, ::sparse_caller<4, 3, 1>, ::sparse_caller<4, 4, 1>, ::sparse_caller<4, 5, 1>},
+            {::sparse_caller<4, 1, 2>, ::sparse_caller<4, 2, 2>, ::sparse_caller<4, 3, 2>, ::sparse_caller<4, 4, 2>, ::sparse_caller<4, 5, 2>},
+            {::sparse_caller<4, 1, 3>, ::sparse_caller<4, 2, 3>, ::sparse_caller<4, 3, 3>, ::sparse_caller<4, 4, 3>, ::sparse_caller<4, 5, 3>},
+            {::sparse_caller<4, 1, 4>, ::sparse_caller<4, 2, 4>, ::sparse_caller<4, 3, 4>, ::sparse_caller<4, 4, 4>, ::sparse_caller<4, 5, 4>},
+            {::sparse_caller<4, 1, 5>, ::sparse_caller<4, 2, 5>, ::sparse_caller<4, 3, 5>, ::sparse_caller<4, 4, 5>, ::sparse_caller<4, 5, 5>}
+        };
+
+        bindTexture(&tex_If4, I);
+        bindTexture(&tex_Jf4, J);
+
+        funcs[patch.y - 1][patch.x - 1](I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
+                                        level, block, stream);
+    }
+
+    void dense(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV, PtrStepSzf err, int2 winSize, cudaStream_t stream)
     {
         dim3 block(16, 16);
         dim3 grid(divUp(I.cols, block.x), divUp(I.rows, block.y));
@@ -666,19 +551,18 @@ namespace cv { namespace gpu { namespace device
         if (err.data)
         {
-            lkDense<true><<<grid, block, smem_size, stream>>>(u, v, prevU, prevV, err, I.rows, I.cols);
+            ::dense<true><<<grid, block, smem_size, stream>>>(u, v, prevU, prevV, err, I.rows, I.cols);
             cudaSafeCall( cudaGetLastError() );
         }
         else
         {
-            lkDense<false><<<grid, block, smem_size, stream>>>(u, v, prevU, prevV, PtrStepf(), I.rows, I.cols);
+            ::dense<false><<<grid, block, smem_size, stream>>>(u, v, prevU, prevV, PtrStepf(), I.rows, I.cols);
             cudaSafeCall( cudaGetLastError() );
         }
 
         if (stream == 0)
             cudaSafeCall( cudaDeviceSynchronize() );
     }
-    }
-}}}
+}
 
 #endif /* CUDA_DISABLER */
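
Taken together, the pyrlk.cu changes flatten the old cv::gpu::device::pyrlk nesting: kernels and their launch helpers become file-local in an anonymous namespace, and only the thin pyrlk:: wrappers keep external linkage for pyrlk.cpp to call. A plain-C++ miniature of that layout (illustrative bodies only, not code from this commit):

#include <cstdio>

namespace  // internal linkage: invisible outside this translation unit
{
    void sparse_impl() { std::printf("file-local kernel launcher\n"); }
}

namespace pyrlk  // the only symbols the host file links against
{
    // Public wrapper; the :: qualification mirrors ::sparse_caller above.
    void sparse1() { ::sparse_impl(); }
}

int main()
{
    pyrlk::sparse1();
    return 0;
}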
modules/gpu/src/pyrlk.cpp
@@ -55,21 +55,18 @@ void cv::gpu::PyrLKOpticalFlow::releaseMemory() {}
 
 #else /* !defined (HAVE_CUDA) */
 
-namespace cv { namespace gpu { namespace device
+namespace pyrlk
 {
-    namespace pyrlk
-    {
     void loadConstants(int2 winSize, int iters);
 
-    void lkSparse1_gpu(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
-                       int level, dim3 block, dim3 patch, cudaStream_t stream = 0);
+    void sparse1(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
+                 int level, dim3 block, dim3 patch, cudaStream_t stream = 0);
 
-    void lkSparse4_gpu(PtrStepSz<float4> I, PtrStepSz<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
-                       int level, dim3 block, dim3 patch, cudaStream_t stream = 0);
+    void sparse4(PtrStepSz<float4> I, PtrStepSz<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
+                 int level, dim3 block, dim3 patch, cudaStream_t stream = 0);
 
-    void lkDense_gpu(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV,
-                     PtrStepSzf err, int2 winSize, cudaStream_t stream = 0);
-    }
-}}}
+    void dense(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV,
+               PtrStepSzf err, int2 winSize, cudaStream_t stream = 0);
+}
 
 cv::gpu::PyrLKOpticalFlow::PyrLKOpticalFlow()
 {
@@ -104,8 +101,6 @@ namespace
 
 void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts, GpuMat& status, GpuMat* err)
 {
-    using namespace cv::gpu::device::pyrlk;
-
     if (prevPts.empty())
     {
         nextPts.release();
@@ -166,19 +161,19 @@ void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& next
         pyrDown(nextPyr_[level - 1], nextPyr_[level]);
     }
 
-    loadConstants(make_int2(winSize.width, winSize.height), iters);
+    pyrlk::loadConstants(make_int2(winSize.width, winSize.height), iters);
 
     for (int level = maxLevel; level >= 0; level--)
     {
         if (cn == 1)
         {
-            lkSparse1_gpu(prevPyr_[level], nextPyr_[level],
+            pyrlk::sparse1(prevPyr_[level], nextPyr_[level],
                 prevPts.ptr<float2>(), nextPts.ptr<float2>(), status.ptr(),
                 level == 0 && err ? err->ptr<float>() : 0, prevPts.cols,
                 level, block, patch);
         }
         else
         {
-            lkSparse4_gpu(prevPyr_[level], nextPyr_[level],
+            pyrlk::sparse4(prevPyr_[level], nextPyr_[level],
                 prevPts.ptr<float2>(), nextPts.ptr<float2>(), status.ptr(),
                 level == 0 && err ? err->ptr<float>() : 0, prevPts.cols,
                 level, block, patch);
         }
@@ -187,8 +182,6 @@ void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& next
 
 void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat& prevImg, const GpuMat& nextImg, GpuMat& u, GpuMat& v, GpuMat* err)
 {
-    using namespace cv::gpu::device::pyrlk;
-
     CV_Assert(prevImg.type() == CV_8UC1);
    CV_Assert(prevImg.size() == nextImg.size() && prevImg.type() == nextImg.type());
     CV_Assert(maxLevel >= 0);
@@ -219,7 +212,7 @@ void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat& prevImg, const GpuMat& nextI
     vPyr_[1].setTo(Scalar::all(0));
 
     int2 winSize2i = make_int2(winSize.width, winSize.height);
-    loadConstants(winSize2i, iters);
+    pyrlk::loadConstants(winSize2i, iters);
 
     PtrStepSzf derr = err ? *err : PtrStepSzf();
@@ -229,7 +222,7 @@ void cv::gpu::PyrLKOpticalFlow::dense(const GpuMat& prevImg, const GpuMat& nextI
     {
         int idx2 = (idx + 1) & 1;
 
-        lkDense_gpu(prevPyr_[level], nextPyr_[level], uPyr_[idx], vPyr_[idx], uPyr_[idx2], vPyr_[idx2],
+        pyrlk::dense(prevPyr_[level], nextPyr_[level], uPyr_[idx], vPyr_[idx], uPyr_[idx2], vPyr_[idx2],
             level == 0 ? derr : PtrStepSzf(), winSize2i);
 
         if (level > 0)
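
The idx/idx2 pair in this hunk ping-pongs between the two uPyr_/vPyr_ buffers: (idx + 1) & 1 flips between 0 and 1, so each pyramid level writes its flow into one buffer while reading the previous level's flow from the other. The swap itself sits inside the if (level > 0) branch that this view truncates, so its placement below is an assumption. A host-side sketch of the indexing:

#include <cstdio>

int main()
{
    int idx = 0;
    for (int level = 3; level >= 0; --level)
    {
        int idx2 = (idx + 1) & 1;          // the other buffer
        std::printf("level %d: write %d, read %d\n", level, idx, idx2);
        if (level > 0)
            idx = idx2;                    // assumed swap for the next level
    }
    return 0;
}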