Commit 381216aa authored by Vladislav Vinogradov

refactor cudaoptflow public API:

* use opaque algorithm interfaces
* add stream support
parent 19c6bbe7
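
For orientation, here is a minimal usage sketch consistent with the interface this commit introduces; the function name and parameter values below are illustrative, not part of the commit:

#include <opencv2/core/cuda.hpp>
#include <opencv2/cudaoptflow.hpp>

// Hypothetical caller of the refactored API: the algorithm is now built
// through a factory and tuned via getters/setters instead of public fields.
void computeBroxFlow(const cv::cuda::GpuMat& frame0,  // CV_32FC1 input
                     const cv::cuda::GpuMat& frame1,  // same size/type as frame0
                     cv::cuda::GpuMat& flow,          // out: CV_32FC2 (u, v)
                     cv::cuda::Stream& stream)
{
    cv::Ptr<cv::cuda::BroxOpticalFlow> brox =
        cv::cuda::BroxOpticalFlow::create(0.197 /*alpha*/, 50.0 /*gamma*/,
                                          0.8 /*scale_factor*/,
                                          10 /*inner*/, 77 /*outer*/, 10 /*solver*/);

    // Stream support: calc() enqueues its work on `stream` rather than
    // synchronizing on the default stream.
    brox->calc(frame0, frame1, flow, stream);
}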
@@ -47,84 +47,148 @@ using namespace cv::cuda;

 #if !defined (HAVE_CUDA) || !defined (HAVE_OPENCV_CUDALEGACY) || defined (CUDA_DISABLER)

-void cv::cuda::BroxOpticalFlow::operator ()(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, Stream&) { throw_no_cuda(); }
+Ptr<BroxOpticalFlow> cv::cuda::BroxOpticalFlow::create(double, double, double, int, int, int) { throw_no_cuda(); return Ptr<BroxOpticalFlow>(); }

 #else

-namespace
-{
-    size_t getBufSize(const NCVBroxOpticalFlowDescriptor& desc, const NCVMatrix<Ncv32f>& frame0, const NCVMatrix<Ncv32f>& frame1,
-                      NCVMatrix<Ncv32f>& u, NCVMatrix<Ncv32f>& v, const cudaDeviceProp& devProp)
-    {
-        NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(devProp.textureAlignment));
+namespace {
+
+    class BroxOpticalFlowImpl : public BroxOpticalFlow
+    {
+    public:
+        BroxOpticalFlowImpl(double alpha, double gamma, double scale_factor,
+                            int inner_iterations, int outer_iterations, int solver_iterations) :
+            alpha_(alpha), gamma_(gamma), scale_factor_(scale_factor),
+            inner_iterations_(inner_iterations), outer_iterations_(outer_iterations),
+            solver_iterations_(solver_iterations)
+        {
+        }
+
+        virtual void calc(InputArray I0, InputArray I1, InputOutputArray flow, Stream& stream);
+
+        virtual double getFlowSmoothness() const { return alpha_; }
+        virtual void setFlowSmoothness(double alpha) { alpha_ = static_cast<float>(alpha); }
+
+        virtual double getGradientConstancyImportance() const { return gamma_; }
+        virtual void setGradientConstancyImportance(double gamma) { gamma_ = static_cast<float>(gamma); }
+
+        virtual double getPyramidScaleFactor() const { return scale_factor_; }
+        virtual void setPyramidScaleFactor(double scale_factor) { scale_factor_ = static_cast<float>(scale_factor); }
+
+        //! number of lagged non-linearity iterations (inner loop)
+        virtual int getInnerIterations() const { return inner_iterations_; }
+        virtual void setInnerIterations(int inner_iterations) { inner_iterations_ = inner_iterations; }
+
+        //! number of warping iterations (number of pyramid levels)
+        virtual int getOuterIterations() const { return outer_iterations_; }
+        virtual void setOuterIterations(int outer_iterations) { outer_iterations_ = outer_iterations; }
+
+        //! number of linear system solver iterations
+        virtual int getSolverIterations() const { return solver_iterations_; }
+        virtual void setSolverIterations(int solver_iterations) { solver_iterations_ = solver_iterations; }
+
+    private:
+        //! flow smoothness
+        float alpha_;
+        //! gradient constancy importance
+        float gamma_;
+        //! pyramid scale factor
+        float scale_factor_;
+        //! number of lagged non-linearity iterations (inner loop)
+        int inner_iterations_;
+        //! number of warping iterations (number of pyramid levels)
+        int outer_iterations_;
+        //! number of linear system solver iterations
+        int solver_iterations_;
+    };
+
+    static size_t getBufSize(const NCVBroxOpticalFlowDescriptor& desc,
+                             const NCVMatrix<Ncv32f>& frame0, const NCVMatrix<Ncv32f>& frame1,
+                             NCVMatrix<Ncv32f>& u, NCVMatrix<Ncv32f>& v,
+                             size_t textureAlignment)
+    {
+        NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(textureAlignment));

         ncvSafeCall( NCVBroxOpticalFlow(desc, gpuCounter, frame0, frame1, u, v, 0) );

         return gpuCounter.maxSize();
     }
-}

-namespace
-{
-    static void outputHandler(const String &msg) { CV_Error(cv::Error::GpuApiCallError, msg.c_str()); }
-}
+    static void outputHandler(const String &msg)
+    {
+        CV_Error(cv::Error::GpuApiCallError, msg.c_str());
+    }

-void cv::cuda::BroxOpticalFlow::operator ()(const GpuMat& frame0, const GpuMat& frame1, GpuMat& u, GpuMat& v, Stream& s)
-{
-    ncvSetDebugOutputHandler(outputHandler);
+    void BroxOpticalFlowImpl::calc(InputArray _I0, InputArray _I1, InputOutputArray _flow, Stream& stream)
+    {
+        const GpuMat frame0 = _I0.getGpuMat();
+        const GpuMat frame1 = _I1.getGpuMat();

-    CV_Assert(frame0.type() == CV_32FC1);
-    CV_Assert(frame1.size() == frame0.size() && frame1.type() == frame0.type());
+        CV_Assert( frame0.type() == CV_32FC1 );
+        CV_Assert( frame1.size() == frame0.size() && frame1.type() == frame0.type() );

-    u.create(frame0.size(), CV_32FC1);
-    v.create(frame0.size(), CV_32FC1);
+        ncvSetDebugOutputHandler(outputHandler);

-    cudaDeviceProp devProp;
-    cudaSafeCall( cudaGetDeviceProperties(&devProp, getDevice()) );
+        BufferPool pool(stream);
+        GpuMat u = pool.getBuffer(frame0.size(), CV_32FC1);
+        GpuMat v = pool.getBuffer(frame0.size(), CV_32FC1);

-    NCVBroxOpticalFlowDescriptor desc;
-
-    desc.alpha = alpha;
-    desc.gamma = gamma;
-    desc.scale_factor = scale_factor;
-    desc.number_of_inner_iterations = inner_iterations;
-    desc.number_of_outer_iterations = outer_iterations;
-    desc.number_of_solver_iterations = solver_iterations;
+        NCVBroxOpticalFlowDescriptor desc;
+        desc.alpha = alpha_;
+        desc.gamma = gamma_;
+        desc.scale_factor = scale_factor_;
+        desc.number_of_inner_iterations = inner_iterations_;
+        desc.number_of_outer_iterations = outer_iterations_;
+        desc.number_of_solver_iterations = solver_iterations_;

-    NCVMemSegment frame0MemSeg;
-    frame0MemSeg.begin.memtype = NCVMemoryTypeDevice;
-    frame0MemSeg.begin.ptr = const_cast<uchar*>(frame0.data);
-    frame0MemSeg.size = frame0.step * frame0.rows;
+        NCVMemSegment frame0MemSeg;
+        frame0MemSeg.begin.memtype = NCVMemoryTypeDevice;
+        frame0MemSeg.begin.ptr = const_cast<uchar*>(frame0.data);
+        frame0MemSeg.size = frame0.step * frame0.rows;

-    NCVMemSegment frame1MemSeg;
-    frame1MemSeg.begin.memtype = NCVMemoryTypeDevice;
-    frame1MemSeg.begin.ptr = const_cast<uchar*>(frame1.data);
-    frame1MemSeg.size = frame1.step * frame1.rows;
+        NCVMemSegment frame1MemSeg;
+        frame1MemSeg.begin.memtype = NCVMemoryTypeDevice;
+        frame1MemSeg.begin.ptr = const_cast<uchar*>(frame1.data);
+        frame1MemSeg.size = frame1.step * frame1.rows;

-    NCVMemSegment uMemSeg;
-    uMemSeg.begin.memtype = NCVMemoryTypeDevice;
-    uMemSeg.begin.ptr = u.ptr();
-    uMemSeg.size = u.step * u.rows;
+        NCVMemSegment uMemSeg;
+        uMemSeg.begin.memtype = NCVMemoryTypeDevice;
+        uMemSeg.begin.ptr = u.ptr();
+        uMemSeg.size = u.step * u.rows;

-    NCVMemSegment vMemSeg;
-    vMemSeg.begin.memtype = NCVMemoryTypeDevice;
-    vMemSeg.begin.ptr = v.ptr();
-    vMemSeg.size = v.step * v.rows;
+        NCVMemSegment vMemSeg;
+        vMemSeg.begin.memtype = NCVMemoryTypeDevice;
+        vMemSeg.begin.ptr = v.ptr();
+        vMemSeg.size = v.step * v.rows;

-    NCVMatrixReuse<Ncv32f> frame0Mat(frame0MemSeg, static_cast<Ncv32u>(devProp.textureAlignment), frame0.cols, frame0.rows, static_cast<Ncv32u>(frame0.step));
-    NCVMatrixReuse<Ncv32f> frame1Mat(frame1MemSeg, static_cast<Ncv32u>(devProp.textureAlignment), frame1.cols, frame1.rows, static_cast<Ncv32u>(frame1.step));
-    NCVMatrixReuse<Ncv32f> uMat(uMemSeg, static_cast<Ncv32u>(devProp.textureAlignment), u.cols, u.rows, static_cast<Ncv32u>(u.step));
-    NCVMatrixReuse<Ncv32f> vMat(vMemSeg, static_cast<Ncv32u>(devProp.textureAlignment), v.cols, v.rows, static_cast<Ncv32u>(v.step));
+        DeviceInfo devInfo;
+        size_t textureAlignment = devInfo.textureAlignment();
+
+        NCVMatrixReuse<Ncv32f> frame0Mat(frame0MemSeg, static_cast<Ncv32u>(textureAlignment), frame0.cols, frame0.rows, static_cast<Ncv32u>(frame0.step));
+        NCVMatrixReuse<Ncv32f> frame1Mat(frame1MemSeg, static_cast<Ncv32u>(textureAlignment), frame1.cols, frame1.rows, static_cast<Ncv32u>(frame1.step));
+        NCVMatrixReuse<Ncv32f> uMat(uMemSeg, static_cast<Ncv32u>(textureAlignment), u.cols, u.rows, static_cast<Ncv32u>(u.step));
+        NCVMatrixReuse<Ncv32f> vMat(vMemSeg, static_cast<Ncv32u>(textureAlignment), v.cols, v.rows, static_cast<Ncv32u>(v.step));

-    cudaStream_t stream = StreamAccessor::getStream(s);
-
-    size_t bufSize = getBufSize(desc, frame0Mat, frame1Mat, uMat, vMat, devProp);
-
-    ensureSizeIsEnough(1, static_cast<int>(bufSize), CV_8UC1, buf);
-
-    NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, static_cast<Ncv32u>(devProp.textureAlignment), buf.ptr());
-
-    ncvSafeCall( NCVBroxOpticalFlow(desc, gpuAllocator, frame0Mat, frame1Mat, uMat, vMat, stream) );
-}
+        size_t bufSize = getBufSize(desc, frame0Mat, frame1Mat, uMat, vMat, textureAlignment);
+        GpuMat buf = pool.getBuffer(1, static_cast<int>(bufSize), CV_8UC1);
+
+        NCVMemStackAllocator gpuAllocator(NCVMemoryTypeDevice, bufSize, static_cast<Ncv32u>(textureAlignment), buf.ptr());
+
+        ncvSafeCall( NCVBroxOpticalFlow(desc, gpuAllocator, frame0Mat, frame1Mat, uMat, vMat, StreamAccessor::getStream(stream)) );
+
+        GpuMat flows[] = {u, v};
+        cuda::merge(flows, 2, _flow, stream);
+    }
+}
+
+Ptr<BroxOpticalFlow> cv::cuda::BroxOpticalFlow::create(double alpha, double gamma, double scale_factor, int inner_iterations, int outer_iterations, int solver_iterations)
+{
+    return makePtr<BroxOpticalFlowImpl>(alpha, gamma, scale_factor, inner_iterations, outer_iterations, solver_iterations);
+}

 #endif /* HAVE_CUDA */
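
Note that calc() now packs u and v into a single CV_32FC2 field via cuda::merge, where the old operator() returned two separate CV_32FC1 mats. Callers that still need the planes can undo this; a small sketch, assuming the cudaarithm module is available (function name illustrative):

#include <opencv2/core/cuda.hpp>
#include <opencv2/cudaarithm.hpp>

// Recover the separate u/v planes from the merged flow field produced by calc().
void splitFlow(const cv::cuda::GpuMat& flow, cv::cuda::GpuMat& u,
               cv::cuda::GpuMat& v, cv::cuda::Stream& stream)
{
    cv::cuda::GpuMat planes[2];
    cv::cuda::split(flow, planes, stream);  // inverse of the cuda::merge in calc()
    u = planes[0];
    v = planes[1];
}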
@@ -472,16 +472,16 @@ namespace pyrlk
         }
     }

-    void loadConstants(int2 winSize, int iters)
+    void loadConstants(int2 winSize, int iters, cudaStream_t stream)
     {
-        cudaSafeCall( cudaMemcpyToSymbol(c_winSize_x, &winSize.x, sizeof(int)) );
-        cudaSafeCall( cudaMemcpyToSymbol(c_winSize_y, &winSize.y, sizeof(int)) );
+        cudaSafeCall( cudaMemcpyToSymbolAsync(c_winSize_x, &winSize.x, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );
+        cudaSafeCall( cudaMemcpyToSymbolAsync(c_winSize_y, &winSize.y, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );

         int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
-        cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_x, &halfWin.x, sizeof(int)) );
-        cudaSafeCall( cudaMemcpyToSymbol(c_halfWin_y, &halfWin.y, sizeof(int)) );
+        cudaSafeCall( cudaMemcpyToSymbolAsync(c_halfWin_x, &halfWin.x, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );
+        cudaSafeCall( cudaMemcpyToSymbolAsync(c_halfWin_y, &halfWin.y, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );

-        cudaSafeCall( cudaMemcpyToSymbol(c_iters, &iters, sizeof(int)) );
+        cudaSafeCall( cudaMemcpyToSymbolAsync(c_iters, &iters, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );
     }

     void sparse1(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
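
The loadConstants hunk above replaces blocking cudaMemcpyToSymbol calls with cudaMemcpyToSymbolAsync on the caller's stream, so uploading the window size and iteration count no longer serializes the whole device. A self-contained sketch of that idiom, with illustrative symbol and function names (not from the commit):

#include <cuda_runtime.h>

__constant__ int c_iters;  // stand-in for the pyrlk constants

__global__ void useIters(int* out)
{
    *out = c_iters;  // reads the value uploaded below
}

void launchWithConstants(int iters, int* d_out, cudaStream_t stream)
{
    // The async copy is ordered before the kernel on the same stream, so
    // the kernel sees the new value; stream 0 reproduces the old behaviour.
    cudaMemcpyToSymbolAsync(c_iters, &iters, sizeof(int), 0,
                            cudaMemcpyHostToDevice, stream);
    useIters<<<1, 1, 0, stream>>>(d_out);
}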
@@ -66,15 +66,16 @@ namespace tvl1flow
         dy(y, x) = 0.5f * (src(::min(y + 1, src.rows - 1), x) - src(::max(y - 1, 0), x));
     }

-    void centeredGradient(PtrStepSzf src, PtrStepSzf dx, PtrStepSzf dy)
+    void centeredGradient(PtrStepSzf src, PtrStepSzf dx, PtrStepSzf dy, cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));

-        centeredGradientKernel<<<grid, block>>>(src, dx, dy);
+        centeredGradientKernel<<<grid, block, 0, stream>>>(src, dx, dy);
         cudaSafeCall( cudaGetLastError() );

-        cudaSafeCall( cudaDeviceSynchronize() );
+        if (!stream)
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
 }
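
centeredGradient shows the wrapper idiom that every tvl1flow function below repeats: launch on the supplied stream, check for launch errors immediately, and only block on cudaDeviceSynchronize() when the null (default) stream was passed, preserving the old synchronous semantics for legacy callers. A generic hedged sketch of the idiom, with illustrative names:

#include <cstdio>
#include <cuda_runtime.h>

__global__ void scaleKernel(float* data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        data[i] *= 2.0f;  // stand-in for the real per-pixel work
}

// Stream-aware host wrapper: asynchronous when a stream is supplied,
// blocking (as before this commit) when stream == 0.
void scaleOnStream(float* d_data, int n, cudaStream_t stream)
{
    const int block = 256;
    const int grid = (n + block - 1) / block;

    scaleKernel<<<grid, block, 0, stream>>>(d_data, n);

    if (cudaGetLastError() != cudaSuccess)   // launch-time errors
        std::fprintf(stderr, "kernel launch failed\n");

    if (!stream)                             // legacy blocking path
        cudaDeviceSynchronize();
}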
@@ -164,7 +165,10 @@ namespace tvl1flow
         rho(y, x) = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val;
     }

-    void warpBackward(PtrStepSzf I0, PtrStepSzf I1, PtrStepSzf I1x, PtrStepSzf I1y, PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf I1w, PtrStepSzf I1wx, PtrStepSzf I1wy, PtrStepSzf grad, PtrStepSzf rho)
+    void warpBackward(PtrStepSzf I0, PtrStepSzf I1, PtrStepSzf I1x, PtrStepSzf I1y,
+                      PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf I1w, PtrStepSzf I1wx,
+                      PtrStepSzf I1wy, PtrStepSzf grad, PtrStepSzf rho,
+                      cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(I0.cols, block.x), divUp(I0.rows, block.y));
@@ -173,10 +177,11 @@ namespace tvl1flow
         bindTexture(&tex_I1x, I1x);
         bindTexture(&tex_I1y, I1y);

-        warpBackwardKernel<<<grid, block>>>(I0, u1, u2, I1w, I1wx, I1wy, grad, rho);
+        warpBackwardKernel<<<grid, block, 0, stream>>>(I0, u1, u2, I1w, I1wx, I1wy, grad, rho);
         cudaSafeCall( cudaGetLastError() );

-        cudaSafeCall( cudaDeviceSynchronize() );
+        if (!stream)
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
 }
@@ -292,15 +297,17 @@ namespace tvl1flow
                    PtrStepSzf grad, PtrStepSzf rho_c,
                    PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p31, PtrStepSzf p32,
                    PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf u3, PtrStepSzf error,
-                   float l_t, float theta, float gamma, bool calcError)
+                   float l_t, float theta, float gamma, bool calcError,
+                   cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(I1wx.cols, block.x), divUp(I1wx.rows, block.y));

-        estimateUKernel<<<grid, block>>>(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, p31, p32, u1, u2, u3, error, l_t, theta, gamma, calcError);
+        estimateUKernel<<<grid, block, 0, stream>>>(I1wx, I1wy, grad, rho_c, p11, p12, p21, p22, p31, p32, u1, u2, u3, error, l_t, theta, gamma, calcError);
         cudaSafeCall( cudaGetLastError() );

-        cudaSafeCall( cudaDeviceSynchronize() );
+        if (!stream)
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
 }
@@ -346,15 +353,19 @@ namespace tvl1flow
         }
     }

-    void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf u3, PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p31, PtrStepSzf p32, float taut, float gamma)
+    void estimateDualVariables(PtrStepSzf u1, PtrStepSzf u2, PtrStepSzf u3,
+                               PtrStepSzf p11, PtrStepSzf p12, PtrStepSzf p21, PtrStepSzf p22, PtrStepSzf p31, PtrStepSzf p32,
+                               float taut, float gamma,
+                               cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(u1.cols, block.x), divUp(u1.rows, block.y));

-        estimateDualVariablesKernel<<<grid, block>>>(u1, u2, u3, p11, p12, p21, p22, p31, p32, taut, gamma);
+        estimateDualVariablesKernel<<<grid, block, 0, stream>>>(u1, u2, u3, p11, p12, p21, p22, p31, p32, taut, gamma);
         cudaSafeCall( cudaGetLastError() );

-        cudaSafeCall( cudaDeviceSynchronize() );
+        if (!stream)
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
 }