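Fixed bug #1759 (adds reduced block-size code paths to the GPU PyrLK and Farneback optical flow for devices that do not support compute capability 1.2)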

6a9f9342 · Alexey Spizhevoy · 40d8d11f · 6a9f9342 · 6a9f9342 · 6a9f9342
Commit 6a9f9342 authored Apr 28, 2012 by Alexey Spizhevoy
5 changed files
--- a/modules/gpu/include/opencv2/gpu/gpu.hpp
+++ b/modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -1750,6 +1750,7 @@ public:
        useInitialFlow = false;
        minEigThreshold = 1e-4f;
        getMinEigenVals = false;
+        isDeviceArch11_ = !DeviceInfo().supports(FEATURE_SET_COMPUTE_12);
    }
    void sparse(const GpuMat& prevImg, const GpuMat& nextImg, const GpuMat& prevPts, GpuMat& nextPts,
@@ -1796,6 +1797,8 @@ private:
    vector<GpuMat> uPyr_;
    vector<GpuMat> vPyr_;
+    bool isDeviceArch11_;
 };
@@ -1812,6 +1815,7 @@ public:
        polyN = 5;
        polySigma = 1.1;
        flags = 0;
+        isDeviceArch11_ = !DeviceInfo().supports(FEATURE_SET_COMPUTE_12);
    }
    int numLevels;
@@ -1859,6 +1863,8 @@ private:
    GpuMat frames_[2];
    GpuMat pyrLevel_[2], M_, bufM_, R_[2], blurredFrame_[2];
    std::vector<GpuMat> pyramid0_, pyramid1_;
+    bool isDeviceArch11_;
 };

--- a/modules/gpu/src/cuda/optical_flow_farneback.cu
+++ b/modules/gpu/src/cuda/optical_flow_farneback.cu
@@ -433,6 +433,25 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
    }
+    void boxFilter5Gpu_CC11(const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream) // host-side launcher: runs boxFilter5 with 128-thread blocks; the host code picks this variant when isDeviceArch11_ is set
+    {
+        int height = src.rows / 5; // NOTE(review): presumably src stacks 5 planes vertically, each src.rows/5 rows tall -- confirm against boxFilter5
+        int width = src.cols;
+        dim3 block(128); // 128 threads along x; block.y keeps dim3's default of 1
+        dim3 grid(divUp(width, block.x), divUp(height, block.y)); // rounded up so the grid covers every column and row
+        int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float); // dynamic shared memory size in bytes, passed as the third launch argument below
+        float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf)); // reciprocal of the (2*ksizeHalf + 1)^2 window area
+        boxFilter5<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, boxAreaInv, dst);
+        cudaSafeCall(cudaGetLastError()); // surfaces an error raised by the launch itself
+        if (stream == 0) // default stream: wait for the device to finish before returning
+            cudaSafeCall(cudaDeviceSynchronize());
+    }
    __constant__ float c_gKer[MAX_KSIZE_HALF + 1];
    template <typename Border>
@@ -575,14 +594,14 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
    }
-    template <typename Border>
+    template <typename Border, int blockDimX>
    void gaussianBlur5Caller(
            const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream)
    {
        int height = src.rows / 5;
        int width = src.cols;
-        dim3 block(256);
+        dim3 block(blockDimX);
        dim3 grid(divUp(width, block.x), divUp(height, block.y));
        int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);
        Border b(height, width);
@@ -603,8 +622,22 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
        static const caller_t callers[] =
        {
-            gaussianBlur5Caller<BrdReflect101<float> >,
+            gaussianBlur5Caller<BrdReflect101<float>,256>,
-            gaussianBlur5Caller<BrdReplicate<float> >,
+            gaussianBlur5Caller<BrdReplicate<float>,256>,
+        };
+        callers[borderMode](src, ksizeHalf, dst, stream);
+    }
+    void gaussianBlur5Gpu_CC11(
+            const DevMem2Df src, int ksizeHalf, DevMem2Df dst, int borderMode, cudaStream_t stream)
+    {
+        typedef void (*caller_t)(const DevMem2Df, int, DevMem2Df, cudaStream_t);
+        static const caller_t callers[] =
+        {
+            gaussianBlur5Caller<BrdReflect101<float>,128>,
+            gaussianBlur5Caller<BrdReplicate<float>,128>,
        };
        callers[borderMode](src, ksizeHalf, dst, stream);

--- a/modules/gpu/src/cuda/pyrlk.cu
+++ b/modules/gpu/src/cuda/pyrlk.cu
@@ -181,6 +181,7 @@ namespace cv { namespace gpu { namespace device
            smem3[tid] = val3;
            __syncthreads();
+#if __CUDA_ARCH__ > 110
            if (tid < 128) 
            { 
                smem1[tid] = val1 += smem1[tid + 128]; 
@@ -188,6 +189,7 @@ namespace cv { namespace gpu { namespace device
                smem3[tid] = val3 += smem3[tid + 128]; 
            } 
            __syncthreads();
+#endif
            if (tid < 64) 
            { 
@@ -235,12 +237,14 @@ namespace cv { namespace gpu { namespace device
            smem2[tid] = val2;
            __syncthreads();
+#if __CUDA_ARCH__ > 110
            if (tid < 128) 
            { 
                smem1[tid] = val1 += smem1[tid + 128]; 
                smem2[tid] = val2 += smem2[tid + 128];  
            } 
            __syncthreads();
+#endif
            if (tid < 64) 
            { 
@@ -279,11 +283,13 @@ namespace cv { namespace gpu { namespace device
            smem1[tid] = val1;
            __syncthreads();
+#if __CUDA_ARCH__ > 110
            if (tid < 128) 
            { 
                smem1[tid] = val1 += smem1[tid + 128]; 
            } 
            __syncthreads();
+#endif
            if (tid < 64) 
            { 
@@ -310,9 +316,15 @@ namespace cv { namespace gpu { namespace device
        __global__ void lkSparse(const PtrStepb I, const PtrStepb J, const PtrStep<short> dIdx, const PtrStep<short> dIdy,
            const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols)
        {
+#if __CUDA_ARCH__ <= 110
+            __shared__ float smem1[128];
+            __shared__ float smem2[128];
+            __shared__ float smem3[128];
+#else
            __shared__ float smem1[256];
            __shared__ float smem2[256];
            __shared__ float smem3[256];
+#endif
            const int tid = threadIdx.y * blockDim.x + threadIdx.x;

--- a/modules/gpu/src/optical_flow_farneback.cpp
+++ b/modules/gpu/src/optical_flow_farneback.cpp
@@ -81,6 +81,8 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
    void boxFilter5Gpu(const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream);
+    void boxFilter5Gpu_CC11(const DevMem2Df src, int ksizeHalf, DevMem2Df dst, cudaStream_t stream);
    void setGaussianBlurKernel(const float *gKer, int ksizeHalf);
    void gaussianBlurGpu(
@@ -89,6 +91,9 @@ namespace cv { namespace gpu { namespace device { namespace optflow_farneback
    void gaussianBlur5Gpu(
            const DevMem2Df src, int ksizeHalf, DevMem2Df dst, int borderType, cudaStream_t stream);
+    void gaussianBlur5Gpu_CC11(
+            const DevMem2Df src, int ksizeHalf, DevMem2Df dst, int borderType, cudaStream_t stream);
 }}}} // namespace cv { namespace gpu { namespace device { namespace optflow_farneback
@@ -167,7 +172,10 @@ void cv::gpu::FarnebackOpticalFlow::updateFlow_boxFilter(
        const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat &flowy,
        GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[])
 {
+    if (!isDeviceArch11_)
        device::optflow_farneback::boxFilter5Gpu(M, blockSize/2, bufM, S(streams[0]));
+    else
+        device::optflow_farneback::boxFilter5Gpu_CC11(M, blockSize/2, bufM, S(streams[0]));
    swap(M, bufM);
    for (int i = 1; i < 5; ++i)
@@ -183,8 +191,12 @@ void cv::gpu::FarnebackOpticalFlow::updateFlow_gaussianBlur(
        const GpuMat& R0, const GpuMat& R1, GpuMat& flowx, GpuMat& flowy,
        GpuMat& M, GpuMat &bufM, int blockSize, bool updateMatrices, Stream streams[])
 {
+    if (!isDeviceArch11_)
        device::optflow_farneback::gaussianBlur5Gpu(
                    M, blockSize/2, bufM, BORDER_REPLICATE_GPU, S(streams[0]));
+    else
+        device::optflow_farneback::gaussianBlur5Gpu_CC11(
+                    M, blockSize/2, bufM, BORDER_REPLICATE_GPU, S(streams[0]));
    swap(M, bufM);
    device::optflow_farneback::updateFlowGpu(M, flowx, flowy, S(streams[0]));

--- a/modules/gpu/src/pyrlk.cpp
+++ b/modules/gpu/src/pyrlk.cpp
@@ -126,18 +126,19 @@ void cv::gpu::PyrLKOpticalFlow::buildImagePyramid(const GpuMat& img0, vector<Gpu
 namespace
 {
-    void calcPatchSize(cv::Size winSize, int cn, dim3& block, dim3& patch)
+    void calcPatchSize(cv::Size winSize, int cn, dim3& block, dim3& patch, bool isDeviceArch11)
    {
        winSize.width *= cn;
        if (winSize.width > 32 && winSize.width > 2 * winSize.height)
        {
-            block.x = 32;
+            block.x = isDeviceArch11 ? 16 : 32;
            block.y = 8;
        }
        else
        {
-            block.x = block.y = 16;
+            block.x = 16;
+            block.y = isDeviceArch11 ? 8 : 16;
        }
        patch.x = (winSize.width  + block.x - 1) / block.x;
@@ -166,7 +167,7 @@ void cv::gpu::PyrLKOpticalFlow::sparse(const GpuMat& prevImg, const GpuMat& next
    const int cn = prevImg.channels();
    dim3 block, patch;
-    calcPatchSize(winSize, cn, block, patch);
+    calcPatchSize(winSize, cn, block, patch, isDeviceArch11_);   
    CV_Assert(derivLambda >= 0);
    CV_Assert(maxLevel >= 0 && winSize.width > 2 && winSize.height > 2);