Merge pull request #3635 from jet47:cuda-optflow-refactoring

3f1fb281 · Vadim Pisarevsky · 9c81338c · 71061703 · 3f1fb281 · 3f1fb281
Commit 3f1fb281 authored Jan 22, 2015 by Vadim Pisarevsky
27 changed files
--- a/modules/cudalegacy/include/opencv2/cudalegacy.hpp
+++ b/modules/cudalegacy/include/opencv2/cudalegacy.hpp
@@ -71,8 +71,9 @@ public:

 CV_EXPORTS Ptr<ImagePyramid> createImagePyramid(InputArray img, int nLayers = -1, Stream& stream = Stream::Null());

-////////////////////////////////////////////////////
+//
 // GMG
+//

 /** @brief Background/Foreground Segmentation Algorithm.

@@ -125,8 +126,9 @@ public:
 CV_EXPORTS Ptr<cuda::BackgroundSubtractorGMG>
    createBackgroundSubtractorGMG(int initializationFrames = 120, double decisionThreshold = 0.8);

-////////////////////////////////////////////////////
+//
 // FGD
+//

 /** @brief The class discriminates between foreground and background pixels by building and maintaining a model
 of the background.
@@ -180,6 +182,51 @@ struct CV_EXPORTS FGDParams
 CV_EXPORTS Ptr<cuda::BackgroundSubtractorFGD>
    createBackgroundSubtractorFGD(const FGDParams& params = FGDParams());

+//
+// Optical flow
+//
+
+//! Calculates optical flow for 2 images using the block matching algorithm.
+CV_EXPORTS void calcOpticalFlowBM(const GpuMat& prev, const GpuMat& curr,
+                                  Size block_size, Size shift_size, Size max_range, bool use_previous,
+                                  GpuMat& velx, GpuMat& vely, GpuMat& buf,
+                                  Stream& stream = Stream::Null());
+
+class CV_EXPORTS FastOpticalFlowBM
+{
+public:
+    void operator ()(const GpuMat& I0, const GpuMat& I1, GpuMat& flowx, GpuMat& flowy, int search_window = 21, int block_window = 7, Stream& s = Stream::Null());
+
+private:
+    GpuMat buffer;
+    GpuMat extended_I0;
+    GpuMat extended_I1;
+};
+
+/** @brief Interpolates frames (images) using provided optical flow (displacement field).
+
+@param frame0 First frame (32-bit floating point images, single channel).
+@param frame1 Second frame. Must have the same type and size as frame0 .
+@param fu Forward horizontal displacement.
+@param fv Forward vertical displacement.
+@param bu Backward horizontal displacement.
+@param bv Backward vertical displacement.
+@param pos New frame position.
+@param newFrame Output image.
+@param buf Temporary buffer, will have width x 6\*height size, CV_32FC1 type and contain 6
+GpuMat: occlusion masks for first frame, occlusion masks for second, interpolated forward
+horizontal flow, interpolated forward vertical flow, interpolated backward horizontal flow,
+interpolated backward vertical flow.
+@param stream Stream for the asynchronous version.
+ */
+CV_EXPORTS void interpolateFrames(const GpuMat& frame0, const GpuMat& frame1,
+                                  const GpuMat& fu, const GpuMat& fv,
+                                  const GpuMat& bu, const GpuMat& bv,
+                                  float pos, GpuMat& newFrame, GpuMat& buf,
+                                  Stream& stream = Stream::Null());
+
+CV_EXPORTS void createOpticalFlowNeedleMap(const GpuMat& u, const GpuMat& v, GpuMat& vertex, GpuMat& colors);
+
 //! @}

 }}

--- a/modules/cudaoptflow/src/bm.cpp
+++ b/modules/cudaoptflow/src/bm.cpp
--- a/modules/cudaoptflow/src/bm_fast.cpp
+++ b/modules/cudaoptflow/src/bm_fast.cpp
--- a/modules/cudaoptflow/src/cuda/bm.cu
+++ b/modules/cudaoptflow/src/cuda/bm.cu
--- a/modules/cudaoptflow/src/cuda/bm_fast.cu
+++ b/modules/cudaoptflow/src/cuda/bm_fast.cu
--- a/modules/cudaoptflow/src/cuda/needle_map.cu
+++ b/modules/cudaoptflow/src/cuda/needle_map.cu
--- a/modules/cudaoptflow/src/interpolate_frames.cpp
+++ b/modules/cudaoptflow/src/interpolate_frames.cpp
--- a/modules/cudaoptflow/src/needle_map.cpp
+++ b/modules/cudaoptflow/src/needle_map.cpp
--- a/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp
+++ b/modules/cudaoptflow/include/opencv2/cudaoptflow.hpp
--- a/modules/cudaoptflow/perf/perf_optflow.cpp
+++ b/modules/cudaoptflow/perf/perf_optflow.cpp
--- a/modules/cudaoptflow/perf/perf_precomp.hpp
+++ b/modules/cudaoptflow/perf/perf_precomp.hpp
@@ -55,6 +55,7 @@
 #include "opencv2/ts/cuda_perf.hpp"

 #include "opencv2/cudaoptflow.hpp"
+#include "opencv2/cudaarithm.hpp"
 #include "opencv2/video.hpp"

 #ifdef GTEST_CREATE_SHARED_LIBRARY

--- a/modules/cudaoptflow/src/brox.cpp
+++ b/modules/cudaoptflow/src/brox.cpp
@@ -47,84 +47,148 @@ using namespace cv::cuda;

 #if !defined (HAVE_CUDA) || !defined (HAVE_OPENCV_CUDALEGACY) || defined (CUDA_DISABLER)

-void cv::cuda::BroxOpticalFlow::operator ()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+Ptr<BroxOpticalFlow> cv::cuda::BroxOpticalFlow::create(double, double, double, int, int, int) { throw_no_cuda(); return Ptr<BroxOpticalFlow>(); }

 #else

-namespace
-{
-    size_t getBufSize(const NCVBroxOpticalFlowDescriptor& desc, const NCVMatrix<Ncv32f>& frame0, const NCVMatrix<Ncv32f>& frame1,
-                      NCVMatrix<Ncv32f>& u, NCVMatrix<Ncv32f>& v, const cudaDeviceProp& devProp)
+namespace {
+
+    class BroxOpticalFlowImpl : public BroxOpticalFlow  // file-local implementation handed out by BroxOpticalFlow::create()
    {
-        NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(devProp.textureAlignment));
+    public:
+        BroxOpticalFlowImpl(double alpha, double gamma, double scale_factor,
+                            int inner_iterations, int outer_iterations, int solver_iterations) :
+            alpha_(static_cast<float>(alpha)), gamma_(static_cast<float>(gamma)), scale_factor_(static_cast<float>(scale_factor)),  // members are float: narrow explicitly, exactly as the setters do
+            inner_iterations_(inner_iterations), outer_iterations_(outer_iterations),
+            solver_iterations_(solver_iterations)
+        {
+        }
+
+        virtual void calc(InputArray I0, InputArray I1, InputOutputArray flow, Stream& stream);  // defined out of line further down in this file
+
+        virtual double getFlowSmoothness() const { return alpha_; }  // stored as float, widened to double on return
+        virtual void setFlowSmoothness(double alpha) { alpha_ = static_cast<float>(alpha); }
+
+        virtual double getGradientConstancyImportance() const { return gamma_; }
+        virtual void setGradientConstancyImportance(double gamma) { gamma_ = static_cast<float>(gamma); }
+
+        virtual double getPyramidScaleFactor() const { return scale_factor_; }
+        virtual void setPyramidScaleFactor(double scale_factor) { scale_factor_ = static_cast<float>(scale_factor); }
+
+        //! number of lagged non-linearity iterations (inner loop)
+        virtual int getInnerIterations() const { return inner_iterations_; }
+        virtual void setInnerIterations(int inner_iterations) { inner_iterations_ = inner_iterations; }
+
+        //! number of warping iterations (number of pyramid levels)
+        virtual int getOuterIterations() const { return outer_iterations_; }
+        virtual void setOuterIterations(int outer_iterations) { outer_iterations_ = outer_iterations; }
+
+        //! number of linear system solver iterations
+        virtual int getSolverIterations() const { return solver_iterations_; }
+        virtual void setSolverIterations(int solver_iterations) { solver_iterations_ = solver_iterations; }
+
+    private:
+        //! flow smoothness
+        float alpha_;
+
+        //! gradient constancy importance
+        float gamma_;
+
+        //! pyramid scale factor
+        float scale_factor_;
+
+        //! number of lagged non-linearity iterations (inner loop)
+        int inner_iterations_;
+
+        //! number of warping iterations (number of pyramid levels)
+        int outer_iterations_;
+
+        //! number of linear system solver iterations
+        int solver_iterations_;
+    };
+
+    static size_t getBufSize(const NCVBroxOpticalFlowDescriptor& desc,  // runs NCVBroxOpticalFlow once on a local allocator and returns that allocator's maxSize()
+                             const NCVMatrix<Ncv32f>& frame0, const NCVMatrix<Ncv32f>& frame1,
+                             NCVMatrix<Ncv32f>& u, NCVMatrix<Ncv32f>& v,
+                             size_t textureAlignment)
+    {
+        NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(textureAlignment));  // NOTE(review): built from the alignment only - presumably a size-counting mode; confirm in NCV

        ncvSafeCall( NCVBroxOpticalFlow(desc, gpuCounter, frame0, frame1, u, v, 0) );

        return gpuCounter.maxSize();
    }
-}

-namespace
-{
-    static void outputHandler(const String &msg) { CV_Error(cv::Error::GpuApiCallError, msg.c_str()); }
-}
+    static void outputHandler(const String &msg)
+    {
+        CV_Error(cv::Error::GpuApiCallError, msg.c_str());
+    }

-void cv::cuda::BroxOpticalFlow::operator ()(const GpuMat& frame0, const GpuMat& frame1, GpuMat& u, GpuMat& v, Stream& s)
-{
-    ncvSetDebugOutputHandler(outputHandler);
+    void BroxOpticalFlowImpl::calc(InputArray _I0, InputArray _I1, InputOutputArray _flow, Stream& stream)  // computes u and v with NCVBroxOpticalFlow and merges them into _flow
+    {
+        const GpuMat frame0 = _I0.getGpuMat();
+        const GpuMat frame1 = _I1.getGpuMat();

-    CV_Assert(frame0.type() == CV_32FC1);
-    CV_Assert(frame1.size() == frame0.size() && frame1.type() == frame0.type());
+        CV_Assert( frame0.type() == CV_32FC1 );
+        CV_Assert( frame1.size() == frame0.size() && frame1.type() == frame0.type() );

-    u.create(frame0.size(), CV_32FC1);
-    v.create(frame0.size(), CV_32FC1);
+        ncvSetDebugOutputHandler(outputHandler);  // outputHandler (above) turns NCV debug messages into CV_Error

-    cudaDeviceProp devProp;
-    cudaSafeCall( cudaGetDeviceProperties(&devProp, getDevice()) );
+        BufferPool pool(stream);  // the temporaries u, v and buf below all come from this pool
+        GpuMat u = pool.getBuffer(frame0.size(), CV_32FC1);
+        GpuMat v = pool.getBuffer(frame0.size(), CV_32FC1);

-    NCVBroxOpticalFlowDescriptor desc;
+        NCVBroxOpticalFlowDescriptor desc;  // copy the current parameter members into the NCV descriptor
+        desc.alpha = alpha_;
+        desc.gamma = gamma_;
+        desc.scale_factor = scale_factor_;
+        desc.number_of_inner_iterations = inner_iterations_;
+        desc.number_of_outer_iterations = outer_iterations_;
+        desc.number_of_solver_iterations = solver_iterations_;

-    desc.alpha = alpha;
-    desc.gamma = gamma;
-    desc.scale_factor = scale_factor;
-    desc.number_of_inner_iterations = inner_iterations;
-    desc.number_of_outer_iterations = outer_iterations;
-    desc.number_of_solver_iterations = solver_iterations;
+        NCVMemSegment frame0MemSeg;  // the four segments below describe existing GpuMat storage (step * rows bytes) for NCV
+        frame0MemSeg.begin.memtype = NCVMemoryTypeDevice;
+        frame0MemSeg.begin.ptr = const_cast<uchar*>(frame0.data);
+        frame0MemSeg.size = frame0.step * frame0.rows;

-    NCVMemSegment frame0MemSeg;
-    frame0MemSeg.begin.memtype = NCVMemoryTypeDevice;
-    frame0MemSeg.begin.ptr = const_cast<uchar*>(frame0.data);
-    frame0MemSeg.size = frame0.step * frame0.rows;
+        NCVMemSegment frame1MemSeg;
+        frame1MemSeg.begin.memtype = NCVMemoryTypeDevice;
+        frame1MemSeg.begin.ptr = const_cast<uchar*>(frame1.data);
+        frame1MemSeg.size = frame1.step * frame1.rows;

-    NCVMemSegment frame1MemSeg;
-    frame1MemSeg.begin.memtype = NCVMemoryTypeDevice;
-    frame1MemSeg.begin.ptr = const_cast<uchar*>(frame1.data);
-    frame1MemSeg.size = frame1.step * frame1.rows;
+        NCVMemSegment uMemSeg;
+        uMemSeg.begin.memtype = NCVMemoryTypeDevice;
+        uMemSeg.begin.ptr = u.ptr();
+        uMemSeg.size = u.step * u.rows;

-    NCVMemSegment uMemSeg;
-    uMemSeg.begin.memtype = NCVMemoryTypeDevice;
-    uMemSeg.begin.ptr = u.ptr();
-    uMemSeg.size = u.step * u.rows;
+        NCVMemSegment vMemSeg;
+        vMemSeg.begin.memtype = NCVMemoryTypeDevice;
+        vMemSeg.begin.ptr = v.ptr();
+        vMemSeg.size = v.step * v.rows;

-    NCVMemSegment vMemSeg;
-    vMemSeg.begin.memtype = NCVMemoryTypeDevice;
-    vMemSeg.begin.ptr = v.ptr();
-    vMemSeg.size = v.step * v.rows;
+        DeviceInfo devInfo;  // NOTE(review): default-constructed - presumably describes the current device; confirm
+        size_t textureAlignment = devInfo.textureAlignment();

-    NCVMatrixReuse<Ncv32f> frame0Mat(frame0MemSeg, static_cast<Ncv32u>(devProp.textureAlignment), frame0.cols, frame0.rows, static_cast<Ncv32u>(frame0.step));
-    NCVMatrixReuse<Ncv32f> frame1Mat(frame1MemSeg, static_cast<Ncv32u>(devProp.textureAlignment), frame1.cols, frame1.rows, static_cast<Ncv32u>(frame1.step));
-    NCVMatrixReuse<Ncv32f> uMat(uMemSeg, static_cast<Ncv32u>(devProp.textureAlignment), u.cols, u.rows, static_cast<Ncv32u>(u.step));
-    NCVMatrixReuse<Ncv32f> vMat(vMemSeg, static_cast<Ncv32u>(devProp.textureAlignment), v.cols, v.rows, static_cast<Ncv32u>(v.step));
+        NCVMatrixReuse<Ncv32f> frame0Mat(frame0MemSeg, static_cast<Ncv32u>(textureAlignment), frame0.cols, frame0.rows, static_cast<Ncv32u>(frame0.step));
+        NCVMatrixReuse<Ncv32f> frame1Mat(frame1MemSeg, static_cast<Ncv32u>(textureAlignment), frame1.cols, frame1.rows, static_cast<Ncv32u>(frame1.step));
+        NCVMatrixReuse<Ncv32f> uMat(uMemSeg, static_cast<Ncv32u>(textureAlignment), u.cols, u.rows, static_cast<Ncv32u>(u.step));
+        NCVMatrixReuse<Ncv32f> vMat(vMemSeg, static_cast<Ncv32u>(textureAlignment), v.cols, v.rows, static_cast<Ncv32u>(v.step));

-    cudaStream_t stream = StreamAccessor::getStream(s);
+        size_t bufSize = getBufSize(desc, frame0Mat, frame1Mat, uMat, vMat, textureAlignment);  // first NCVBroxOpticalFlow pass (inside getBufSize) only yields the scratch size
+        GpuMat buf = pool.getBuffer(1, static_cast<int>(bufSize), CV_8UC1);  // NOTE(review): bufSize is narrowed to int here - confirm it cannot exceed INT_MAX

-    size_t bufSize = getBufSize(desc, frame0Mat, frame1Mat, uMat, vMat, devProp);
+        NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, static_cast<Ncv32u>(textureAlignment), buf.ptr());  // allocator backed by buf

-    ensureSizeIsEnough(1, static_cast<int>(bufSize), CV_8UC1, buf);
+        ncvSafeCall( NCVBroxOpticalFlow(desc, gpuAllocator, frame0Mat, frame1Mat, uMat, vMat, StreamAccessor::getStream(stream)) );  // second pass: the real computation, issued with the caller's stream

-    NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, static_cast<Ncv32u>(devProp.textureAlignment), buf.ptr());
+        GpuMat flows[] = {u, v};
+        cuda::merge(flows, 2, _flow, stream);  // u and v are combined, in that order, into the single output array
+    }
+}

-    ncvSafeCall( NCVBroxOpticalFlow(desc, gpuAllocator, frame0Mat, frame1Mat, uMat, vMat, stream) );
+Ptr<BroxOpticalFlow> cv::cuda::BroxOpticalFlow::create(double alpha, double gamma, double scale_factor, int inner_iterations, int outer_iterations, int solver_iterations)
+{
+    return makePtr<BroxOpticalFlowImpl>(alpha, gamma, scale_factor, inner_iterations, outer_iterations, solver_iterations);
 }

 #endif /* HAVE_CUDA */
--- a/modules/cudaoptflow/src/cuda/pyrlk.cu
+++ b/modules/cudaoptflow/src/cuda/pyrlk.cu
@@ -472,16 +472,16 @@ namespace pyrlk
        }
    }

-    void loadConstants(int2 winSize, int iters)
+    void loadConstants(int2 winSize, int iters, cudaStream_t stream)  // copies window size, half-window and iteration count to device symbols on `stream`
    {
-        cudaSafeCall( cudaMemcpyToSymbol(c_winSize_x, &winSize.x, sizeof(int)) );
-        cudaSafeCall( cudaMemcpyToSymbol(c_winSize_y, &winSize.y, sizeof(int)) );
+        cudaSafeCall( cudaMemcpyToSymbolAsync(c_winSize_x, &winSize.x, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );  // arguments: symbol, source, byte count, offset 0, direction, stream
+        cudaSafeCall( cudaMemcpyToSymbolAsync(c_winSize_y, &winSize.y, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );

        int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
-        cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_x, &halfWin.x, sizeof(int)) );
-        cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_y, &halfWin.y, sizeof(int)) );
+        cudaSafeCall( cudaMemcpyToSymbolAsync(c_halfWin_x, &halfWin.x, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );
+        cudaSafeCall( cudaMemcpyToSymbolAsync(c_halfWin_y, &halfWin.y, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );

-        cudaSafeCall( cudaMemcpyToSymbol(c_iters, &iters, sizeof(int)) );
+        cudaSafeCall( cudaMemcpyToSymbolAsync(c_iters, &iters, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );  // NOTE(review): every source here is a by-value parameter or local - confirm the async copy has consumed it before this function returns
    }

    void sparse1(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,

--- a/modules/cudaoptflow/src/cuda/tvl1flow.cu
+++ b/modules/cudaoptflow/src/cuda/tvl1flow.cu
@@ -66,15 +66,16 @@ namespace tvl1flow
        dy(y, x) = 0.5f * (src(::min(y + 1, src.rows - 1), x) - src(::max(y - 1, 0), x));
    }

-    void centeredGradient(PtrStepSzf src, PtrStepSzf dx, PtrStepSzf dy)
+    void centeredGradient(PtrStepSzf src, PtrStepSzf dx, PtrStepSzf dy, cudaStream_t stream)  // host launcher for centeredGradientKernel, issued on `stream`
    {
        const dim3 block(32, 8);
        const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));

-        centeredGradientKernel<<<grid, block>>>(src, dx, dy);
+        centeredGradientKernel<<<grid, block, 0, stream>>>(src, dx, dy);  // third launch argument 0: no dynamic shared memory requested
        cudaSafeCall( cudaGetLastError() );

-        cudaSafeCall( cudaDeviceSynchronize() );
+        if (!stream)  // only a null stream keeps the old blocking behaviour; with a non-null stream this function returns right after the launch check
+            cudaSafeCall( cudaDeviceSynchronize() );
    }
 }

@@ -164,7 +165,10 @@ namespace tvl1flow
        rho(y, x) = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val;
    }

-    void warpBackward(PtrStepSzf I0, PtrStepSzf I1, PtrStepSzf I1x, PtrStepSzf I1y, PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf I1w, PtrStepSzf I1wx, PtrStepSzf I1wy, PtrStepSzf grad, PtrStepSzf rho)
+    void warpBackward(PtrStepSzf I0, PtrStepSzf I1, PtrStepSzf I1x, PtrStepSzf I1y,
+                      PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf I1w, PtrStepSzf I1wx,
+                      PtrStepSzf I1wy, PtrStepSzf grad, PtrStepSzf rho,
+                      cudaStream_t stream)
    {
        const dim3 block(32, 8);
        const dim3 grid(divUp(I0.cols, block.x), divUp(I0.rows, block.y));
@@ -173,10 +177,11 @@ namespace tvl1flow
        bindTexture(&tex_I1x, I1x);
        bindTexture(&tex_I1y, I1y);

-        warpBackwardKernel<<<grid, block>>>(I0, u1, u2, I1w, I1wx, I1wy, grad, rho);
+        warpBackwardKernel<<<grid, block, 0, stream>>>(I0, u1, u2, I1w, I1wx, I1wy, grad, rho);
        cudaSafeCall( cudaGetLastError() );

-        cudaSafeCall( cudaDeviceSynchronize() );
+        if (!stream)
+            cudaSafeCall( cudaDeviceSynchronize() );
    }
 }

@@ -292,15 +297,17 @@ namespace tvl1flow
                   PtrStepSzf grad, PtrStepSzf rho_c,
                   PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p31, PtrStepSzf p32,
                   PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf u3, PtrStepSzf error,
-                   float l_t, float theta, float gamma, bool calcError)
+                   float l_t, float theta, float gamma, bool calcError,
+                   cudaStream_t stream)
    {
        const dim3 block(32, 8);
        const dim3 grid(divUp(I1wx.cols, block.x), divUp(I1wx.rows, block.y));

-        estimateUKernel<<<grid, block>>>(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, p31, p32, u1, u2, u3, error, l_t, theta, gamma, calcError);
+        estimateUKernel<<<grid, block, 0, stream>>>(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, p31, p32, u1, u2, u3, error, l_t, theta, gamma, calcError);
        cudaSafeCall( cudaGetLastError() );

-        cudaSafeCall( cudaDeviceSynchronize() );
+        if (!stream)
+            cudaSafeCall( cudaDeviceSynchronize() );
    }
 }

@@ -346,15 +353,19 @@ namespace tvl1flow
        }
    }

-    void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf u3, PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p31, PtrStepSzf p32, float taut, float gamma)
+    void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf u3,  // host launcher for estimateDualVariablesKernel, issued on `stream`
+                               PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p31, PtrStepSzf p32,
+                               float taut, float gamma,
+                               cudaStream_t stream)
    {
        const dim3 block(32, 8);
        const dim3 grid(divUp(u1.cols, block.x), divUp(u1.rows, block.y));

-        estimateDualVariablesKernel<<<grid, block>>>(u1, u2, u3, p11, p12, p21, p22, p31, p32, taut, gamma);
+        estimateDualVariablesKernel<<<grid, block, 0, stream>>>(u1, u2, u3, p11, p12, p21, p22, p31, p32, taut, gamma);  // grid is sized from u1 only; third launch argument 0: no dynamic shared memory
        cudaSafeCall( cudaGetLastError() );

-        cudaSafeCall( cudaDeviceSynchronize() );
+        if (!stream)  // only a null stream keeps the old blocking behaviour; with a non-null stream this function returns right after the launch check
+            cudaSafeCall( cudaDeviceSynchronize() );
    }
 }


--- a/modules/cudaoptflow/src/farneback.cpp
+++ b/modules/cudaoptflow/src/farneback.cpp
--- a/modules/cudaoptflow/src/pyrlk.cpp
+++ b/modules/cudaoptflow/src/pyrlk.cpp
--- a/modules/cudaoptflow/src/tvl1flow.cpp
+++ b/modules/cudaoptflow/src/tvl1flow.cpp
--- a/modules/cudaoptflow/test/test_optflow.cpp
+++ b/modules/cudaoptflow/test/test_optflow.cpp
--- a/modules/cudaoptflow/test/test_precomp.hpp
+++ b/modules/cudaoptflow/test/test_precomp.hpp
@@ -57,6 +57,7 @@
 #include "opencv2/ts/cuda_test.hpp"

 #include "opencv2/cudaoptflow.hpp"
+#include "opencv2/cudaarithm.hpp"
 #include "opencv2/video.hpp"

 #include "cvconfig.h"

--- a/modules/superres/src/optical_flow.cpp
+++ b/modules/superres/src/optical_flow.cpp
@@ -341,7 +341,7 @@ namespace
        int iterations_;
        bool useInitialFlow_;

-        Ptr<DenseOpticalFlow> alg_;
+        Ptr<cv::DenseOpticalFlow> alg_;
    };

    CV_INIT_ALGORITHM(DualTVL1, "DenseOpticalFlowExt.DualTVL1",
@@ -514,7 +514,7 @@ namespace
        int outerIterations_;
        int solverIterations_;

-        BroxOpticalFlow alg_;
+        Ptr<cuda::BroxOpticalFlow> alg_;
    };

    CV_INIT_ALGORITHM(Brox_CUDA, "DenseOpticalFlowExt.Brox_CUDA",
@@ -525,31 +525,40 @@ namespace
                      obj.info()->addParam(obj, "outerIterations", obj.outerIterations_, false, 0, 0, "Number of warping iterations (number of pyramid levels)");
                      obj.info()->addParam(obj, "solverIterations", obj.solverIterations_, false, 0, 0, "Number of linear system solver iterations"))

-    Brox_CUDA::Brox_CUDA() : GpuOpticalFlow(CV_32FC1), alg_(0.197f, 50.0f, 0.8f, 10, 77, 10)
+    Brox_CUDA::Brox_CUDA() : GpuOpticalFlow(CV_32FC1)
    {
-        alpha_ = alg_.alpha;
-        gamma_ = alg_.gamma;
-        scaleFactor_ = alg_.scale_factor;
-        innerIterations_ = alg_.inner_iterations;
-        outerIterations_ = alg_.outer_iterations;
-        solverIterations_ = alg_.solver_iterations;
+        alg_ = cuda::BroxOpticalFlow::create(0.197f, 50.0f, 0.8f, 10, 77, 10);  // same six values the by-value member used to be constructed with
+
+        alpha_ = alg_->getFlowSmoothness();  // read the algorithm's settings back into this wrapper's own fields
+        gamma_ = alg_->getGradientConstancyImportance();
+        scaleFactor_ = alg_->getPyramidScaleFactor();
+        innerIterations_ = alg_->getInnerIterations();
+        outerIterations_ = alg_->getOuterIterations();
+        solverIterations_ = alg_->getSolverIterations();
    }

    void Brox_CUDA::impl(const GpuMat& input0, const GpuMat& input1, GpuMat& dst1, GpuMat& dst2)
    {
-        alg_.alpha = static_cast<float>(alpha_);
-        alg_.gamma = static_cast<float>(gamma_);
-        alg_.scale_factor = static_cast<float>(scaleFactor_);
-        alg_.inner_iterations = innerIterations_;
-        alg_.outer_iterations = outerIterations_;
-        alg_.solver_iterations = solverIterations_;
+        alg_->setFlowSmoothness(alpha_);  // push the wrapper's fields into the algorithm before every run
+        alg_->setGradientConstancyImportance(gamma_);
+        alg_->setPyramidScaleFactor(scaleFactor_);
+        alg_->setInnerIterations(innerIterations_);
+        alg_->setOuterIterations(outerIterations_);
+        alg_->setSolverIterations(solverIterations_);
+
+        GpuMat flow;  // calc() now produces one flow array instead of two separate outputs
+        alg_->calc(input0, input1, flow);
+
+        GpuMat flows[2];
+        cuda::split(flow, flows);  // separate it again so dst1/dst2 keep their old meaning

-        alg_(input0, input1, dst1, dst2);
+        dst1 = flows[0];  // dst1/dst2 are assigned the split results rather than being written through
+        dst2 = flows[1];
    }

    void Brox_CUDA::collectGarbage()
    {
-        alg_.buf.release();
+        alg_ = cuda::BroxOpticalFlow::create(alpha_, gamma_, scaleFactor_, innerIterations_, outerIterations_, solverIterations_);  // replaces the algorithm object with a new one built from the current fields
        GpuOpticalFlow::collectGarbage();
    }
 }
@@ -581,7 +590,7 @@ namespace
        int maxLevel_;
        int iterations_;

-        PyrLKOpticalFlow alg_;
+        Ptr<cuda::DensePyrLKOpticalFlow> alg_;
    };

    CV_INIT_ALGORITHM(PyrLK_CUDA, "DenseOpticalFlowExt.PyrLK_CUDA",
@@ -591,24 +600,32 @@ namespace

    PyrLK_CUDA::PyrLK_CUDA() : GpuOpticalFlow(CV_8UC1)
    {
-        winSize_ = alg_.winSize.width;
-        maxLevel_ = alg_.maxLevel;
-        iterations_ = alg_.iters;
+        alg_ = cuda::DensePyrLKOpticalFlow::create();  // created with no arguments, i.e. whatever defaults create() declares
+
+        winSize_ = alg_->getWinSize().width;  // only the width is kept; impl() rebuilds a square Size from it
+        maxLevel_ = alg_->getMaxLevel();
+        iterations_ = alg_->getNumIters();
    }

    void PyrLK_CUDA::impl(const GpuMat& input0, const GpuMat& input1, GpuMat& dst1, GpuMat& dst2)
    {
-        alg_.winSize.width = winSize_;
-        alg_.winSize.height = winSize_;
-        alg_.maxLevel = maxLevel_;
-        alg_.iters = iterations_;
+        alg_->setWinSize(Size(winSize_, winSize_));  // square window: the same value is used for width and height
+        alg_->setMaxLevel(maxLevel_);
+        alg_->setNumIters(iterations_);
+
+        GpuMat flow;  // calc() now produces one flow array instead of two separate outputs
+        alg_->calc(input0, input1, flow);
+
+        GpuMat flows[2];
+        cuda::split(flow, flows);  // separate it again so dst1/dst2 keep their old meaning

-        alg_.dense(input0, input1, dst1, dst2);
+        dst1 = flows[0];
+        dst2 = flows[1];
    }

    void PyrLK_CUDA::collectGarbage()
    {
-        alg_.releaseMemory();
+        alg_ = cuda::DensePyrLKOpticalFlow::create();  // replaces the algorithm object; impl() re-applies winSize_/maxLevel_/iterations_ before each calc()
        GpuOpticalFlow::collectGarbage();
    }
 }
@@ -644,7 +661,7 @@ namespace
        double polySigma_;
        int flags_;

-        FarnebackOpticalFlow alg_;
+        Ptr<cuda::FarnebackOpticalFlow> alg_;
    };

    CV_INIT_ALGORITHM(Farneback_CUDA, "DenseOpticalFlowExt.Farneback_CUDA",
@@ -658,31 +675,40 @@ namespace

    Farneback_CUDA::Farneback_CUDA() : GpuOpticalFlow(CV_8UC1)
    {
-        pyrScale_ = alg_.pyrScale;
-        numLevels_ = alg_.numLevels;
-        winSize_ = alg_.winSize;
-        numIters_ = alg_.numIters;
-        polyN_ = alg_.polyN;
-        polySigma_ = alg_.polySigma;
-        flags_ = alg_.flags;
+        alg_ = cuda::FarnebackOpticalFlow::create();  // created with no arguments, i.e. whatever defaults create() declares
+
+        pyrScale_ = alg_->getPyrScale();  // read the algorithm's settings back into this wrapper's own fields
+        numLevels_ = alg_->getNumLevels();
+        winSize_ = alg_->getWinSize();
+        numIters_ = alg_->getNumIters();
+        polyN_ = alg_->getPolyN();
+        polySigma_ = alg_->getPolySigma();
+        flags_ = alg_->getFlags();
    }

    void Farneback_CUDA::impl(const GpuMat& input0, const GpuMat& input1, GpuMat& dst1, GpuMat& dst2)
    {
-        alg_.pyrScale = pyrScale_;
-        alg_.numLevels = numLevels_;
-        alg_.winSize = winSize_;
-        alg_.numIters = numIters_;
-        alg_.polyN = polyN_;
-        alg_.polySigma = polySigma_;
-        alg_.flags = flags_;
+        alg_->setPyrScale(pyrScale_);  // push the wrapper's fields into the algorithm before every run
+        alg_->setNumLevels(numLevels_);
+        alg_->setWinSize(winSize_);
+        alg_->setNumIters(numIters_);
+        alg_->setPolyN(polyN_);
+        alg_->setPolySigma(polySigma_);
+        alg_->setFlags(flags_);
+
+        GpuMat flow;  // calc() now produces one flow array instead of two separate outputs
+        alg_->calc(input0, input1, flow);
+
+        GpuMat flows[2];
+        cuda::split(flow, flows);  // separate it again so dst1/dst2 keep their old meaning

-        alg_(input0, input1, dst1, dst2);
+        dst1 = flows[0];
+        dst2 = flows[1];
    }

    void Farneback_CUDA::collectGarbage()
    {
-        alg_.releaseMemory();
+        alg_ = cuda::FarnebackOpticalFlow::create();  // replaces the algorithm object; impl() re-applies every field through the setters before each calc()
        GpuOpticalFlow::collectGarbage();
    }
 }
@@ -719,7 +745,7 @@ namespace
        int iterations_;
        bool useInitialFlow_;

-        OpticalFlowDual_TVL1_CUDA alg_;
+        Ptr<cuda::OpticalFlowDual_TVL1> alg_;
    };

    CV_INIT_ALGORITHM(DualTVL1_CUDA, "DenseOpticalFlowExt.DualTVL1_CUDA",
@@ -734,33 +760,42 @@ namespace

    DualTVL1_CUDA::DualTVL1_CUDA() : GpuOpticalFlow(CV_8UC1)
    {
-        tau_ = alg_.tau;
-        lambda_ = alg_.lambda;
-        theta_ = alg_.theta;
-        nscales_ = alg_.nscales;
-        warps_ = alg_.warps;
-        epsilon_ = alg_.epsilon;
-        iterations_ = alg_.iterations;
-        useInitialFlow_ = alg_.useInitialFlow;
+        alg_ = cuda::OpticalFlowDual_TVL1::create();  // created with no arguments, i.e. whatever defaults create() declares
+
+        tau_ = alg_->getTau();  // read the algorithm's settings back into this wrapper's own fields
+        lambda_ = alg_->getLambda();
+        theta_ = alg_->getTheta();
+        nscales_ = alg_->getNumScales();
+        warps_ = alg_->getNumWarps();
+        epsilon_ = alg_->getEpsilon();
+        iterations_ = alg_->getNumIterations();
+        useInitialFlow_ = alg_->getUseInitialFlow();
    }

    void DualTVL1_CUDA::impl(const GpuMat& input0, const GpuMat& input1, GpuMat& dst1, GpuMat& dst2)
    {
-        alg_.tau = tau_;
-        alg_.lambda = lambda_;
-        alg_.theta = theta_;
-        alg_.nscales = nscales_;
-        alg_.warps = warps_;
-        alg_.epsilon = epsilon_;
-        alg_.iterations = iterations_;
-        alg_.useInitialFlow = useInitialFlow_;
+        alg_->setTau(tau_);  // push the wrapper's fields into the algorithm before every run
+        alg_->setLambda(lambda_);
+        alg_->setTheta(theta_);
+        alg_->setNumScales(nscales_);
+        alg_->setNumWarps(warps_);
+        alg_->setEpsilon(epsilon_);
+        alg_->setNumIterations(iterations_);
+        alg_->setUseInitialFlow(useInitialFlow_);
+
+        GpuMat flow;  // calc() now produces one flow array instead of two separate outputs
+        alg_->calc(input0, input1, flow);
+
+        GpuMat flows[2];
+        cuda::split(flow, flows);  // separate it again so dst1/dst2 keep their old meaning

-        alg_(input0, input1, dst1, dst2);
+        dst1 = flows[0];
+        dst2 = flows[1];
    }

    void DualTVL1_CUDA::collectGarbage()
    {
-        alg_.collectGarbage();
+        alg_ = cuda::OpticalFlowDual_TVL1::create();  // replaces the algorithm object; impl() re-applies every field through the setters before each calc()
        GpuOpticalFlow::collectGarbage();
    }
 }

--- a/modules/videostab/include/opencv2/videostab/optical_flow.hpp
+++ b/modules/videostab/include/opencv2/videostab/optical_flow.hpp
@@ -121,7 +121,7 @@ public:
             cuda::GpuMat &status);

 private:
-    cuda::PyrLKOpticalFlow optFlowEstimator_;
+    Ptr<cuda::SparsePyrLKOpticalFlow> optFlowEstimator_;
    cuda::GpuMat frame0_, frame1_, points0_, points1_, status_, errors_;
 };

@@ -136,7 +136,7 @@ public:
            OutputArray errors);

 private:
-    cuda::PyrLKOpticalFlow optFlowEstimator_;
+    Ptr<cuda::DensePyrLKOpticalFlow> optFlowEstimator_;
    cuda::GpuMat frame0_, frame1_, flowX_, flowY_, errors_;
 };


--- a/modules/videostab/src/optical_flow.cpp
+++ b/modules/videostab/src/optical_flow.cpp
@@ -45,6 +45,10 @@
 #include "opencv2/videostab/optical_flow.hpp"
 #include "opencv2/videostab/ring_buffer.hpp"

+#ifdef HAVE_OPENCV_CUDAARITHM
+  #include "opencv2/cudaarithm.hpp"
+#endif
+
 namespace cv
 {
 namespace videostab
@@ -63,6 +67,7 @@ void SparsePyrLkOptFlowEstimator::run(
 SparsePyrLkOptFlowEstimatorGpu::SparsePyrLkOptFlowEstimatorGpu()
 {
    CV_Assert(cuda::getCudaEnabledDeviceCount() > 0);
+    optFlowEstimator_ = cuda::SparsePyrLKOpticalFlow::create();  // the member is now a Ptr, so it has to be created explicitly; this runs only after the device-count assertion passed
 }


@@ -91,9 +96,9 @@ void SparsePyrLkOptFlowEstimatorGpu::run(
        const cuda::GpuMat &frame0, const cuda::GpuMat &frame1, const cuda::GpuMat &points0,
        cuda::GpuMat &points1, cuda::GpuMat &status, cuda::GpuMat &errors)
 {
-    optFlowEstimator_.winSize = winSize_;
-    optFlowEstimator_.maxLevel = maxLevel_;
-    optFlowEstimator_.sparse(frame0, frame1, points0, points1, status, &errors);
+    optFlowEstimator_->setWinSize(winSize_);
+    optFlowEstimator_->setMaxLevel(maxLevel_);
+    optFlowEstimator_->calc(frame0, frame1, points0, points1, status, errors);
 }


@@ -101,15 +106,16 @@ void SparsePyrLkOptFlowEstimatorGpu::run(
        const cuda::GpuMat &frame0, const cuda::GpuMat &frame1, const cuda::GpuMat &points0,
        cuda::GpuMat &points1, cuda::GpuMat &status)
 {
-    optFlowEstimator_.winSize = winSize_;
-    optFlowEstimator_.maxLevel = maxLevel_;
-    optFlowEstimator_.sparse(frame0, frame1, points0, points1, status);
+    optFlowEstimator_->setWinSize(winSize_);
+    optFlowEstimator_->setMaxLevel(maxLevel_);
+    optFlowEstimator_->calc(frame0, frame1, points0, points1, status);
 }


 DensePyrLkOptFlowEstimatorGpu::DensePyrLkOptFlowEstimatorGpu()
 {
    CV_Assert(cuda::getCudaEnabledDeviceCount() > 0);
+    optFlowEstimator_ = cuda::DensePyrLKOpticalFlow::create();  // the member is now a Ptr, so it has to be created explicitly; this runs only after the device-count assertion passed
 }


@@ -120,16 +126,24 @@ void DensePyrLkOptFlowEstimatorGpu::run(
    frame0_.upload(frame0.getMat());
    frame1_.upload(frame1.getMat());

-    optFlowEstimator_.winSize = winSize_;
-    optFlowEstimator_.maxLevel = maxLevel_;
+    optFlowEstimator_->setWinSize(winSize_);
+    optFlowEstimator_->setMaxLevel(maxLevel_);

    if (errors.needed())
    {
-        optFlowEstimator_.dense(frame0_, frame1_, flowX_, flowY_, &errors_);
-        errors_.download(errors.getMatRef());
+        CV_Error(Error::StsNotImplemented, "DensePyrLkOptFlowEstimatorGpu doesn't support errors calculation");
    }
    else
-        optFlowEstimator_.dense(frame0_, frame1_, flowX_, flowY_);
+    {
+        cuda::GpuMat flow;
+        optFlowEstimator_->calc(frame0_, frame1_, flow);
+
+        cuda::GpuMat flows[2];
+        cuda::split(flow, flows);
+
+        flowX_ = flows[0];
+        flowY_ = flows[1];
+    }

    flowX_.download(flowX.getMatRef());
    flowY_.download(flowY.getMatRef());

--- a/samples/gpu/brox_optical_flow.cpp
+++ b/samples/gpu/brox_optical_flow.cpp
-#include <iostream>
-#include <iomanip>
-#include <string>
-#include <ctype.h>
-
-#include "opencv2/core.hpp"
-#include "opencv2/core/utility.hpp"
-#include "opencv2/highgui.hpp"
-#include "opencv2/imgproc.hpp"
-#include "opencv2/cudaoptflow.hpp"
-#include "opencv2/cudaarithm.hpp"
-
-using namespace std;
-using namespace cv;
-using namespace cv::cuda;
-
-void getFlowField(const Mat& u, const Mat& v, Mat& flowField);
-
-int main(int argc, const char* argv[])
-{
-    try
-    {
-        const char* keys =
-           "{ h   help      |       | print help message }"
-           "{ l   left      |       | specify left image }"
-           "{ r   right     |       | specify right image }"
-           "{ s   scale     | 0.8   | set pyramid scale factor }"
-           "{ a   alpha     | 0.197 | set alpha }"
-           "{ g   gamma     | 50.0  | set gamma }"
-           "{ i   inner     | 10    | set number of inner iterations }"
-           "{ o   outer     | 77    | set number of outer iterations }"
-           "{ si  solver    | 10    | set number of basic solver iterations }"
-           "{ t   time_step | 0.1   | set frame interpolation time step }";
-
-        CommandLineParser cmd(argc, argv, keys);
-
-        if (cmd.has("help") || !cmd.check())
-        {
-            cmd.printMessage();
-            cmd.printErrors();
-            return 0;
-        }
-
-        string frame0Name = cmd.get<string>("left");
-        string frame1Name = cmd.get<string>("right");
-        float scale = cmd.get<float>("scale");
-        float alpha = cmd.get<float>("alpha");
-        float gamma = cmd.get<float>("gamma");
-        int inner_iterations = cmd.get<int>("inner");
-        int outer_iterations = cmd.get<int>("outer");
-        int solver_iterations = cmd.get<int>("solver");
-        float timeStep = cmd.get<float>("time_step");
-
-        if (frame0Name.empty() || frame1Name.empty())
-        {
-            cerr << "Missing input file names" << endl;
-            return -1;
-        }
-
-        Mat frame0Color = imread(frame0Name);
-        Mat frame1Color = imread(frame1Name);
-
-        if (frame0Color.empty() || frame1Color.empty())
-        {
-            cout << "Can't load input images" << endl;
-            return -1;
-        }
-
-        cv::cuda::printShortCudaDeviceInfo(cv::cuda::getDevice());
-
-        cout << "OpenCV / NVIDIA Computer Vision" << endl;
-        cout << "Optical Flow Demo: Frame Interpolation" << endl;
-        cout << "=========================================" << endl;
-
-        namedWindow("Forward flow");
-        namedWindow("Backward flow");
-
-        namedWindow("Interpolated frame");
-
-        cout << "Press:" << endl;
-        cout << "\tESC to quit" << endl;
-        cout << "\t'a' to move to the previous frame" << endl;
-        cout << "\t's' to move to the next frame\n" << endl;
-
-        frame0Color.convertTo(frame0Color, CV_32F, 1.0 / 255.0);
-        frame1Color.convertTo(frame1Color, CV_32F, 1.0 / 255.0);
-
-        Mat frame0Gray, frame1Gray;
-
-        cv::cvtColor(frame0Color, frame0Gray, COLOR_BGR2GRAY);
-        cv::cvtColor(frame1Color, frame1Gray, COLOR_BGR2GRAY);
-
-        GpuMat d_frame0(frame0Gray);
-        GpuMat d_frame1(frame1Gray);
-
-        cout << "Estimating optical flow" << endl;
-
-        BroxOpticalFlow d_flow(alpha, gamma, scale, inner_iterations, outer_iterations, solver_iterations);
-
-        cout << "\tForward..." << endl;
-
-        GpuMat d_fu, d_fv;
-
-        d_flow(d_frame0, d_frame1, d_fu, d_fv);
-
-        Mat flowFieldForward;
-        getFlowField(Mat(d_fu), Mat(d_fv), flowFieldForward);
-
-        cout << "\tBackward..." << endl;
-
-        GpuMat d_bu, d_bv;
-
-        d_flow(d_frame1, d_frame0, d_bu, d_bv);
-
-        Mat flowFieldBackward;
-        getFlowField(Mat(d_bu), Mat(d_bv), flowFieldBackward);
-
-        cout << "Interpolating..." << endl;
-
-        // first frame color components
-        GpuMat d_b, d_g, d_r;
-
-        // second frame color components
-        GpuMat d_bt, d_gt, d_rt;
-
-        // prepare color components on host and copy them to device memory
-        Mat channels[3];
-        cv::split(frame0Color, channels);
-
-        d_b.upload(channels[0]);
-        d_g.upload(channels[1]);
-        d_r.upload(channels[2]);
-
-        cv::split(frame1Color, channels);
-
-        d_bt.upload(channels[0]);
-        d_gt.upload(channels[1]);
-        d_rt.upload(channels[2]);
-
-        // temporary buffer
-        GpuMat d_buf;
-
-        // intermediate frame color components (GPU memory)
-        GpuMat d_rNew, d_gNew, d_bNew;
-
-        GpuMat d_newFrame;
-
-        vector<Mat> frames;
-        frames.reserve(static_cast<int>(1.0f / timeStep) + 2);
-
-        frames.push_back(frame0Color);
-
-        // compute interpolated frames
-        for (float timePos = timeStep; timePos < 1.0f; timePos += timeStep)
-        {
-            // interpolate blue channel
-            interpolateFrames(d_b, d_bt, d_fu, d_fv, d_bu, d_bv, timePos, d_bNew, d_buf);
-
-            // interpolate green channel
-            interpolateFrames(d_g, d_gt, d_fu, d_fv, d_bu, d_bv, timePos, d_gNew, d_buf);
-
-            // interpolate red channel
-            interpolateFrames(d_r, d_rt, d_fu, d_fv, d_bu, d_bv, timePos, d_rNew, d_buf);
-
-            GpuMat channels3[] = {d_bNew, d_gNew, d_rNew};
-            cuda::merge(channels3, 3, d_newFrame);
-
-            frames.push_back(Mat(d_newFrame));
-
-            cout << setprecision(4) << timePos * 100.0f << "%\r";
-        }
-
-        frames.push_back(frame1Color);
-
-        cout << setw(5) << "100%" << endl;
-
-        cout << "Done" << endl;
-
-        imshow("Forward flow", flowFieldForward);
-        imshow("Backward flow", flowFieldBackward);
-
-        int currentFrame = 0;
-
-        imshow("Interpolated frame", frames[currentFrame]);
-
-        for(;;)
-        {
-            int key = toupper(waitKey(10) & 0xff);
-
-            switch (key)
-            {
-            case 27:
-                return 0;
-
-            case 'A':
-                if (currentFrame > 0)
-                    --currentFrame;
-
-                imshow("Interpolated frame", frames[currentFrame]);
-                break;
-
-            case 'S':
-                if (currentFrame < static_cast<int>(frames.size()) - 1)
-                    ++currentFrame;
-
-                imshow("Interpolated frame", frames[currentFrame]);
-                break;
-            }
-        }
-    }
-    catch (const exception& ex)
-    {
-        cerr << ex.what() << endl;
-        return -1;
-    }
-    catch (...)
-    {
-        cerr << "Unknow error" << endl;
-        return -1;
-    }
-}
-
-template <typename T> inline T clamp (T x, T a, T b)
-{
-    return ((x) > (a) ? ((x) < (b) ? (x) : (b)) : (a));
-}
-
-template <typename T> inline T mapValue(T x, T a, T b, T c, T d)
-{
-    x = clamp(x, a, b);
-    return c + (d - c) * (x - a) / (b - a);
-}
-
-void getFlowField(const Mat& u, const Mat& v, Mat& flowField)
-{
-    float maxDisplacement = 1.0f;
-
-    for (int i = 0; i < u.rows; ++i)
-    {
-        const float* ptr_u = u.ptr<float>(i);
-        const float* ptr_v = v.ptr<float>(i);
-
-        for (int j = 0; j < u.cols; ++j)
-        {
-            float d = max(fabsf(ptr_u[j]), fabsf(ptr_v[j]));
-
-            if (d > maxDisplacement)
-                maxDisplacement = d;
-        }
-    }
-
-    flowField.create(u.size(), CV_8UC4);
-
-    for (int i = 0; i < flowField.rows; ++i)
-    {
-        const float* ptr_u = u.ptr<float>(i);
-        const float* ptr_v = v.ptr<float>(i);
-
-
-        Vec4b* row = flowField.ptr<Vec4b>(i);
-
-        for (int j = 0; j < flowField.cols; ++j)
-        {
-            row[j][0] = 0;
-            row[j][1] = static_cast<unsigned char> (mapValue (-ptr_v[j], -maxDisplacement, maxDisplacement, 0.0f, 255.0f));
-            row[j][2] = static_cast<unsigned char> (mapValue ( ptr_u[j], -maxDisplacement, maxDisplacement, 0.0f, 255.0f));
-            row[j][3] = 255;
-        }
-    }
-}
--- a/samples/gpu/farneback_optical_flow.cpp
+++ b/samples/gpu/farneback_optical_flow.cpp
@@ -7,6 +7,7 @@
 #include "opencv2/highgui.hpp"
 #include "opencv2/video.hpp"
 #include "opencv2/cudaoptflow.hpp"
+#include "opencv2/cudaarithm.hpp"

 using namespace std;
 using namespace cv;
@@ -70,8 +71,8 @@ int main(int argc, char **argv)
    if (frameL.empty() || frameR.empty()) return -1;

    GpuMat d_frameL(frameL), d_frameR(frameR);
-    GpuMat d_flowx, d_flowy;
-    FarnebackOpticalFlow d_calc;
+    GpuMat d_flow;
+    Ptr<cuda::FarnebackOpticalFlow> d_calc = cuda::FarnebackOpticalFlow::create();
    Mat flowxy, flowx, flowy, image;

    bool running = true, gpuMode = true;
@@ -86,17 +87,21 @@ int main(int argc, char **argv)
        if (gpuMode)
        {
            tc0 = getTickCount();
-            d_calc(d_frameL, d_frameR, d_flowx, d_flowy);
+            d_calc->calc(d_frameL, d_frameR, d_flow);
            tc1 = getTickCount();
-            d_flowx.download(flowx);
-            d_flowy.download(flowy);
+
+            GpuMat planes[2];
+            cuda::split(d_flow, planes);
+
+            planes[0].download(flowx);
+            planes[1].download(flowy);
        }
        else
        {
            tc0 = getTickCount();
            calcOpticalFlowFarneback(
-                        frameL, frameR, flowxy, d_calc.pyrScale, d_calc.numLevels, d_calc.winSize,
-                        d_calc.numIters, d_calc.polyN, d_calc.polySigma, d_calc.flags);
+                        frameL, frameR, flowxy, d_calc->getPyrScale(), d_calc->getNumLevels(), d_calc->getWinSize(),
+                        d_calc->getNumIters(), d_calc->getPolyN(), d_calc->getPolySigma(), d_calc->getFlags());
            tc1 = getTickCount();

            Mat planes[] = {flowx, flowy};

--- a/samples/gpu/optical_flow.cpp
+++ b/samples/gpu/optical_flow.cpp
@@ -5,6 +5,7 @@
 #include <opencv2/core/utility.hpp>
 #include "opencv2/highgui.hpp"
 #include "opencv2/cudaoptflow.hpp"
+#include "opencv2/cudaarithm.hpp"

 using namespace std;
 using namespace cv;
@@ -122,10 +123,13 @@ static void drawOpticalFlow(const Mat_<float>& flowx, const Mat_<float>& flowy,
    }
 }

-static void showFlow(const char* name, const GpuMat& d_flowx, const GpuMat& d_flowy)
+static void showFlow(const char* name, const GpuMat& d_flow)
 {
-    Mat flowx(d_flowx);
-    Mat flowy(d_flowy);
+    GpuMat planes[2];
+    cuda::split(d_flow, planes);
+
+    Mat flowx(planes[0]);
+    Mat flowy(planes[1]);

    Mat out;
    drawOpticalFlow(flowx, flowy, out, 10);
@@ -171,14 +175,12 @@ int main(int argc, const char* argv[])
    GpuMat d_frame0(frame0);
    GpuMat d_frame1(frame1);

-    GpuMat d_flowx(frame0.size(), CV_32FC1);
-    GpuMat d_flowy(frame0.size(), CV_32FC1);
+    GpuMat d_flow(frame0.size(), CV_32FC2);

-    BroxOpticalFlow brox(0.197f, 50.0f, 0.8f, 10, 77, 10);
-    PyrLKOpticalFlow lk; lk.winSize = Size(7, 7);
-    FarnebackOpticalFlow farn;
-    OpticalFlowDual_TVL1_CUDA tvl1;
-    FastOpticalFlowBM fastBM;
+    Ptr<cuda::BroxOpticalFlow> brox = cuda::BroxOpticalFlow::create(0.197f, 50.0f, 0.8f, 10, 77, 10);
+    Ptr<cuda::DensePyrLKOpticalFlow> lk = cuda::DensePyrLKOpticalFlow::create(Size(7, 7));
+    Ptr<cuda::FarnebackOpticalFlow> farn = cuda::FarnebackOpticalFlow::create();
+    Ptr<cuda::OpticalFlowDual_TVL1> tvl1 = cuda::OpticalFlowDual_TVL1::create();

    {
        GpuMat d_frame0f;
@@ -189,68 +191,45 @@ int main(int argc, const char* argv[])

        const int64 start = getTickCount();

-        brox(d_frame0f, d_frame1f, d_flowx, d_flowy);
+        brox->calc(d_frame0f, d_frame1f, d_flow);

        const double timeSec = (getTickCount() - start) / getTickFrequency();
        cout << "Brox : " << timeSec << " sec" << endl;

-        showFlow("Brox", d_flowx, d_flowy);
+        showFlow("Brox", d_flow);
    }

    {
        const int64 start = getTickCount();

-        lk.dense(d_frame0, d_frame1, d_flowx, d_flowy);
+        lk->calc(d_frame0, d_frame1, d_flow);

        const double timeSec = (getTickCount() - start) / getTickFrequency();
        cout << "LK : " << timeSec << " sec" << endl;

-        showFlow("LK", d_flowx, d_flowy);
+        showFlow("LK", d_flow);
    }

    {
        const int64 start = getTickCount();

-        farn(d_frame0, d_frame1, d_flowx, d_flowy);
+        farn->calc(d_frame0, d_frame1, d_flow);

        const double timeSec = (getTickCount() - start) / getTickFrequency();
        cout << "Farn : " << timeSec << " sec" << endl;

-        showFlow("Farn", d_flowx, d_flowy);
+        showFlow("Farn", d_flow);
    }

    {
        const int64 start = getTickCount();

-        tvl1(d_frame0, d_frame1, d_flowx, d_flowy);
+        tvl1->calc(d_frame0, d_frame1, d_flow);

        const double timeSec = (getTickCount() - start) / getTickFrequency();
        cout << "TVL1 : " << timeSec << " sec" << endl;

-        showFlow("TVL1", d_flowx, d_flowy);
-    }
-
-    {
-        const int64 start = getTickCount();
-
-        GpuMat buf;
-        calcOpticalFlowBM(d_frame0, d_frame1, Size(7, 7), Size(1, 1), Size(21, 21), false, d_flowx, d_flowy, buf);
-
-        const double timeSec = (getTickCount() - start) / getTickFrequency();
-        cout << "BM : " << timeSec << " sec" << endl;
-
-        showFlow("BM", d_flowx, d_flowy);
-    }
-
-    {
-        const int64 start = getTickCount();
-
-        fastBM(d_frame0, d_frame1, d_flowx, d_flowy);
-
-        const double timeSec = (getTickCount() - start) / getTickFrequency();
-        cout << "Fast BM : " << timeSec << " sec" << endl;
-
-        showFlow("Fast BM", d_flowx, d_flowy);
+        showFlow("TVL1", d_flow);
    }

    imshow("Frame 0", frame0);

--- a/samples/gpu/performance/tests.cpp
+++ b/samples/gpu/performance/tests.cpp
@@ -1187,87 +1187,6 @@ TEST(GoodFeaturesToTrack)
    CUDA_OFF;
 }

-TEST(PyrLKOpticalFlow)
-{
-    Mat frame0 = imread(abspath("../data/rubberwhale1.png"));
-    if (frame0.empty()) throw runtime_error("can't open ../data/rubberwhale1.png");
-
-    Mat frame1 = imread(abspath("../data/rubberwhale2.png"));
-    if (frame1.empty()) throw runtime_error("can't open ../data/rubberwhale2.png");
-
-    Mat gray_frame;
-    cvtColor(frame0, gray_frame, COLOR_BGR2GRAY);
-
-    for (int points = 1000; points <= 8000; points *= 2)
-    {
-        SUBTEST << points;
-
-        vector<Point2f> pts;
-        goodFeaturesToTrack(gray_frame, pts, points, 0.01, 0.0);
-
-        vector<Point2f> nextPts;
-        vector<unsigned char> status;
-
-        vector<float> err;
-
-        calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, err);
-
-        CPU_ON;
-        calcOpticalFlowPyrLK(frame0, frame1, pts, nextPts, status, err);
-        CPU_OFF;
-
-        cuda::PyrLKOpticalFlow d_pyrLK;
-
-        cuda::GpuMat d_frame0(frame0);
-        cuda::GpuMat d_frame1(frame1);
-
-        cuda::GpuMat d_pts;
-        Mat pts_mat(1, (int)pts.size(), CV_32FC2, (void*)&pts[0]);
-        d_pts.upload(pts_mat);
-
-        cuda::GpuMat d_nextPts;
-        cuda::GpuMat d_status;
-        cuda::GpuMat d_err;
-
-        d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
-
-        CUDA_ON;
-        d_pyrLK.sparse(d_frame0, d_frame1, d_pts, d_nextPts, d_status, &d_err);
-        CUDA_OFF;
-    }
-}
-
-
-TEST(FarnebackOpticalFlow)
-{
-    const string datasets[] = {"../data/rubberwhale", "../data/basketball"};
-    for (size_t i = 0; i < sizeof(datasets)/sizeof(*datasets); ++i) {
-    for (int fastPyramids = 0; fastPyramids < 2; ++fastPyramids) {
-    for (int useGaussianBlur = 0; useGaussianBlur < 2; ++useGaussianBlur) {
-
-    SUBTEST << "dataset=" << datasets[i] << ", fastPyramids=" << fastPyramids << ", useGaussianBlur=" << useGaussianBlur;
-    Mat frame0 = imread(abspath(datasets[i] + "1.png"), IMREAD_GRAYSCALE);
-    Mat frame1 = imread(abspath(datasets[i] + "2.png"), IMREAD_GRAYSCALE);
-    if (frame0.empty()) throw runtime_error("can't open " + datasets[i] + "1.png");
-    if (frame1.empty()) throw runtime_error("can't open " + datasets[i] + "2.png");
-
-    cuda::FarnebackOpticalFlow calc;
-    calc.fastPyramids = fastPyramids != 0;
-    calc.flags |= useGaussianBlur ? OPTFLOW_FARNEBACK_GAUSSIAN : 0;
-
-    cuda::GpuMat d_frame0(frame0), d_frame1(frame1), d_flowx, d_flowy;
-    CUDA_ON;
-    calc(d_frame0, d_frame1, d_flowx, d_flowy);
-    CUDA_OFF;
-
-    Mat flow;
-    CPU_ON;
-    calcOpticalFlowFarneback(frame0, frame1, flow, calc.pyrScale, calc.numLevels, calc.winSize, calc.numIters, calc.polyN, calc.polySigma, calc.flags);
-    CPU_OFF;
-
-    }}}
-}
-
 #ifdef HAVE_OPENCV_BGSEGM

 TEST(MOG)

--- a/samples/gpu/pyrlk_optical_flow.cpp
+++ b/samples/gpu/pyrlk_optical_flow.cpp
@@ -77,44 +77,6 @@ template <typename T> inline T mapValue(T x, T a, T b, T c, T d)
    return c + (d - c) * (x - a) / (b - a);
 }

-static void getFlowField(const Mat& u, const Mat& v, Mat& flowField)
-{
-    float maxDisplacement = 1.0f;
-
-    for (int i = 0; i < u.rows; ++i)
-    {
-        const float* ptr_u = u.ptr<float>(i);
-        const float* ptr_v = v.ptr<float>(i);
-
-        for (int j = 0; j < u.cols; ++j)
-        {
-            float d = max(fabsf(ptr_u[j]), fabsf(ptr_v[j]));
-
-            if (d > maxDisplacement)
-                maxDisplacement = d;
-        }
-    }
-
-    flowField.create(u.size(), CV_8UC4);
-
-    for (int i = 0; i < flowField.rows; ++i)
-    {
-        const float* ptr_u = u.ptr<float>(i);
-        const float* ptr_v = v.ptr<float>(i);
-
-
-        Vec4b* row = flowField.ptr<Vec4b>(i);
-
-        for (int j = 0; j < flowField.cols; ++j)
-        {
-            row[j][0] = 0;
-            row[j][1] = static_cast<unsigned char> (mapValue (-ptr_v[j], -maxDisplacement, maxDisplacement, 0.0f, 255.0f));
-            row[j][2] = static_cast<unsigned char> (mapValue ( ptr_u[j], -maxDisplacement, maxDisplacement, 0.0f, 255.0f));
-            row[j][3] = 255;
-        }
-    }
-}
-
 int main(int argc, const char* argv[])
 {
    const char* keys =
@@ -186,12 +148,8 @@ int main(int argc, const char* argv[])

    // Sparse

-    PyrLKOpticalFlow d_pyrLK;
-
-    d_pyrLK.winSize.width = winSize;
-    d_pyrLK.winSize.height = winSize;
-    d_pyrLK.maxLevel = maxLevel;
-    d_pyrLK.iters = iters;
+    Ptr<cuda::SparsePyrLKOpticalFlow> d_pyrLK = cuda::SparsePyrLKOpticalFlow::create(
+                Size(winSize, winSize), maxLevel, iters);

    GpuMat d_frame0(frame0);
    GpuMat d_frame1(frame1);
@@ -199,7 +157,7 @@ int main(int argc, const char* argv[])
    GpuMat d_nextPts;
    GpuMat d_status;

-    d_pyrLK.sparse(useGray ? d_frame0Gray : d_frame0, useGray ? d_frame1Gray : d_frame1, d_prevPts, d_nextPts, d_status);
+    d_pyrLK->calc(useGray ? d_frame0Gray : d_frame0, useGray ? d_frame1Gray : d_frame1, d_prevPts, d_nextPts, d_status);

    // Draw arrows

@@ -216,20 +174,6 @@ int main(int argc, const char* argv[])

    imshow("PyrLK [Sparse]", frame0);

-    // Dense
-
-    GpuMat d_u;
-    GpuMat d_v;
-
-    d_pyrLK.dense(d_frame0Gray, d_frame1Gray, d_u, d_v);
-
-    // Draw flow field
-
-    Mat flowField;
-    getFlowField(Mat(d_u), Mat(d_v), flowField);
-
-    imshow("PyrLK [Dense] Flow Field", flowField);
-
    waitKey();

    return 0;