Merge moved code from opencv

7e569cec · Alexander Alekhin · 796853e0 · 71f588bd · 7e569cec · 7e569cec
Commit 7e569cec authored Feb 19, 2019 by Alexander Alekhin
4 changed files
--- a/modules/cudalegacy/src/cuda/NCVHaarObjectDetection.cu
+++ b/modules/cudalegacy/src/cuda/NCVHaarObjectDetection.cu
@@ -59,8 +59,7 @@
 #include <algorithm>
 #include <cstdio>

-#include "opencv2/core/cuda/warp.hpp"
-#include "opencv2/core/cuda/warp_shuffle.hpp"
+#include "opencv2/cudev.hpp"

 #include "opencv2/opencv_modules.hpp"

@@ -77,92 +76,6 @@
 #include "NCVAlg.hpp"


-//==============================================================================
-//
-// BlockScan file
-//
-//==============================================================================
-
-
-NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of the loop in warpScanInclusive
-
-
-//Almost the same as naive scan1Inclusive, but doesn't need __syncthreads()
-//assuming size <= WARP_SIZE and size is power of 2
-__device__ Ncv32u warpScanInclusive(Ncv32u idata, volatile Ncv32u *s_Data)
-{
-#if __CUDA_ARCH__ >= 300
-    const unsigned int laneId = cv::cuda::device::Warp::laneId();
-
-    // scan on shuffl functions
-    #pragma unroll
-    for (int i = 1; i <= (K_WARP_SIZE / 2); i *= 2)
-    {
-        const Ncv32u n = cv::cuda::device::shfl_up(idata, i);
-        if (laneId >= i)
-              idata += n;
-    }
-
-    return idata;
-#else
-    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
-    s_Data[pos] = 0;
-    pos += K_WARP_SIZE;
-    s_Data[pos] = idata;
-
-    s_Data[pos] += s_Data[pos - 1];
-    s_Data[pos] += s_Data[pos - 2];
-    s_Data[pos] += s_Data[pos - 4];
-    s_Data[pos] += s_Data[pos - 8];
-    s_Data[pos] += s_Data[pos - 16];
-
-    return s_Data[pos];
-#endif
-}
-
-__device__ __forceinline__ Ncv32u warpScanExclusive(Ncv32u idata, volatile Ncv32u *s_Data)
-{
-    return warpScanInclusive(idata, s_Data) - idata;
-}
-
-template <Ncv32u tiNumScanThreads>
-__device__ Ncv32u scan1Inclusive(Ncv32u idata, volatile Ncv32u *s_Data)
-{
-    if (tiNumScanThreads > K_WARP_SIZE)
-    {
-        //Bottom-level inclusive warp scan
-        Ncv32u warpResult = warpScanInclusive(idata, s_Data);
-
-        //Save top elements of each warp for exclusive warp scan
-        //sync to wait for warp scans to complete (because s_Data is being overwritten)
-        __syncthreads();
-        if( (threadIdx.x & (K_WARP_SIZE - 1)) == (K_WARP_SIZE - 1) )
-        {
-            s_Data[threadIdx.x >> K_LOG2_WARP_SIZE] = warpResult;
-        }
-
-        //wait for warp scans to complete
-        __syncthreads();
-
-        if( threadIdx.x < (tiNumScanThreads / K_WARP_SIZE) )
-        {
-            //grab top warp elements
-            Ncv32u val = s_Data[threadIdx.x];
-            //calculate exclusive scan and write back to shared memory
-            s_Data[threadIdx.x] = warpScanExclusive(val, s_Data);
-        }
-
-        //return updated warp scans with exclusive scan results
-        __syncthreads();
-        return warpResult + s_Data[threadIdx.x >> K_LOG2_WARP_SIZE];
-    }
-    else
-    {
-        return warpScanInclusive(idata, s_Data);
-    }
-}
-
-
 //==============================================================================
 //
 // HaarClassifierCascade file
@@ -260,11 +173,11 @@ __device__ void compactBlockWriteOutAnchorParallel(Ncv32u threadPassFlag, Ncv32u
 {
 #if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110

-    __shared__ Ncv32u shmem[NUM_THREADS_ANCHORSPARALLEL * 2];
+    __shared__ Ncv32u shmem[NUM_THREADS_ANCHORSPARALLEL];
    __shared__ Ncv32u numPassed;
    __shared__ Ncv32u outMaskOffset;

-    Ncv32u incScan = scan1Inclusive<NUM_THREADS_ANCHORSPARALLEL>(threadPassFlag, shmem);
+    Ncv32u incScan = cv::cudev::blockScanInclusive<NUM_THREADS_ANCHORSPARALLEL>(threadPassFlag, shmem, threadIdx.x);
    __syncthreads();

    if (threadIdx.x == NUM_THREADS_ANCHORSPARALLEL-1)

--- a/modules/cudalegacy/src/cuda/NPP_staging.cu
+++ b/modules/cudalegacy/src/cuda/NPP_staging.cu
@@ -45,8 +45,7 @@
 #include <vector>
 #include <cuda_runtime.h>

-#include "opencv2/core/cuda/warp.hpp"
-#include "opencv2/core/cuda/warp_shuffle.hpp"
+#include "opencv2/cudev.hpp"

 #include "opencv2/cudalegacy/NPP_staging.hpp"

@@ -81,111 +80,6 @@ cudaStream_t nppStSetActiveCUDAstream(cudaStream_t cudaStream)
 }


-//==============================================================================
-//
-// BlockScan.cuh
-//
-//==============================================================================
-
-
-NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of the loop in warpScanInclusive
-
-
-//Almost the same as naive scan1Inclusive, but doesn't need __syncthreads()
-//assuming size <= WARP_SIZE and size is power of 2
-template <class T>
-inline __device__ T warpScanInclusive(T idata, volatile T *s_Data)
-{
-#if __CUDA_ARCH__ >= 300
-    const unsigned int laneId = cv::cuda::device::Warp::laneId();
-
-    // scan on shuffl functions
-    #pragma unroll
-    for (int i = 1; i <= (K_WARP_SIZE / 2); i *= 2)
-    {
-        const T n = cv::cuda::device::shfl_up(idata, i);
-        if (laneId >= i)
-              idata += n;
-    }
-
-    return idata;
-#else
-    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
-    s_Data[pos] = 0;
-    pos += K_WARP_SIZE;
-    s_Data[pos] = idata;
-
-    s_Data[pos] += s_Data[pos - 1];
-    s_Data[pos] += s_Data[pos - 2];
-    s_Data[pos] += s_Data[pos - 4];
-    s_Data[pos] += s_Data[pos - 8];
-    s_Data[pos] += s_Data[pos - 16];
-
-    return s_Data[pos];
-#endif
-}
-inline __device__ Ncv64u warpScanInclusive(Ncv64u idata, volatile Ncv64u *s_Data)
-{
-    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
-    s_Data[pos] = 0;
-    pos += K_WARP_SIZE;
-    s_Data[pos] = idata;
-
-    s_Data[pos] += s_Data[pos - 1];
-    s_Data[pos] += s_Data[pos - 2];
-    s_Data[pos] += s_Data[pos - 4];
-    s_Data[pos] += s_Data[pos - 8];
-    s_Data[pos] += s_Data[pos - 16];
-
-    return s_Data[pos];
-}
-
-
-template <class T>
-inline __device__ T warpScanExclusive(T idata, volatile T *s_Data)
-{
-    return warpScanInclusive(idata, s_Data) - idata;
-}
-
-
-template <class T, Ncv32u tiNumScanThreads>
-inline __device__ T blockScanInclusive(T idata, volatile T *s_Data)
-{
-    if (tiNumScanThreads > K_WARP_SIZE)
-    {
-        //Bottom-level inclusive warp scan
-        T warpResult = warpScanInclusive(idata, s_Data);
-
-        //Save top elements of each warp for exclusive warp scan
-        //sync to wait for warp scans to complete (because s_Data is being overwritten)
-        __syncthreads();
-        if( (threadIdx.x & (K_WARP_SIZE - 1)) == (K_WARP_SIZE - 1) )
-        {
-            s_Data[threadIdx.x >> K_LOG2_WARP_SIZE] = warpResult;
-        }
-
-        //wait for warp scans to complete
-        __syncthreads();
-
-        if( threadIdx.x < (tiNumScanThreads / K_WARP_SIZE) )
-        {
-            //grab top warp elements
-            T val = s_Data[threadIdx.x];
-            //calculate exclusive scan and write back to shared memory
-            s_Data[threadIdx.x] = warpScanExclusive(val, s_Data);
-        }
-
-        //return updated warp scans with exclusive scan results
-        __syncthreads();
-        return warpResult + s_Data[threadIdx.x >> K_LOG2_WARP_SIZE];
-    }
-    else
-    {
-        return warpScanInclusive(idata, s_Data);
-    }
-}
-
-
 //==============================================================================
 //
 // IntegralImage.cu
@@ -280,7 +174,7 @@ __global__ void scanRows(T_in *d_src, Ncv32u texOffs, Ncv32u srcWidth, Ncv32u sr
    Ncv32u numBuckets = (srcWidth + NUM_SCAN_THREADS - 1) >> LOG2_NUM_SCAN_THREADS;
    Ncv32u offsetX = 0;

-    __shared__ T_out shmem[NUM_SCAN_THREADS * 2];
+    __shared__ T_out shmem[NUM_SCAN_THREADS];
    __shared__ T_out carryElem;
    carryElem = 0;
    __syncthreads();
@@ -301,7 +195,7 @@ __global__ void scanRows(T_in *d_src, Ncv32u texOffs, Ncv32u srcWidth, Ncv32u sr
        curElemMod = _scanElemOp<T_in, T_out>::scanElemOp<tbDoSqr>(curElem);

        //inclusive scan
-        curScanElem = blockScanInclusive<T_out, NUM_SCAN_THREADS>(curElemMod, shmem);
+        curScanElem = cv::cudev::blockScanInclusive<NUM_SCAN_THREADS>(curElemMod, shmem, threadIdx.x);

        if (curElemOffs <= srcWidth)
        {
@@ -1290,7 +1184,7 @@ __global__ void removePass1Scan(Ncv32u *d_src, Ncv32u srcLen,
        return;
    }

-    __shared__ Ncv32u shmem[NUM_REMOVE_THREADS * 2];
+    __shared__ Ncv32u shmem[NUM_REMOVE_THREADS];

    Ncv32u scanElem = 0;
    if (elemAddrIn < srcLen)
@@ -1305,7 +1199,7 @@ __global__ void removePass1Scan(Ncv32u *d_src, Ncv32u srcLen,
        }
    }

-    Ncv32u localScanInc = blockScanInclusive<Ncv32u, NUM_REMOVE_THREADS>(scanElem, shmem);
+    Ncv32u localScanInc = cv::cudev::blockScanInclusive<NUM_REMOVE_THREADS>(scanElem, shmem, threadIdx.x);
    __syncthreads();

    if (elemAddrIn < srcLen)

--- a/modules/cudastereo/src/cuda/stereobm.cu
+++ b/modules/cudastereo/src/cuda/stereobm.cu
@@ -71,48 +71,54 @@ namespace cv { namespace cuda { namespace device
        }

        template<int RADIUS>
-        __device__ unsigned int CalcSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd)
+        __device__ unsigned int CalcSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd, const int X)
        {
            unsigned int cache = 0;
            unsigned int cache2 = 0;

-            for(int i = 1; i <= RADIUS; i++)
-                cache += col_ssd[i];
+            if (X < cwidth - RADIUS)
+            {
+                for(int i = 1; i <= RADIUS; i++)
+                    cache += col_ssd[i];

-            col_ssd_cache[0] = cache;
+                col_ssd_cache[0] = cache;
+            }

            __syncthreads();

-            if (threadIdx.x < BLOCK_W - RADIUS)
-                cache2 = col_ssd_cache[RADIUS];
-            else
-                for(int i = RADIUS + 1; i < (2 * RADIUS + 1); i++)
-                    cache2 += col_ssd[i];
+            if (X < cwidth - RADIUS)
+            {
+                if (threadIdx.x < BLOCK_W - RADIUS)
+                    cache2 = col_ssd_cache[RADIUS];
+                else
+                    for(int i = RADIUS + 1; i < (2 * RADIUS + 1); i++)
+                        cache2 += col_ssd[i];
+            }

            return col_ssd[0] + cache + cache2;
        }

        template<int RADIUS>
-        __device__ uint2 MinSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd)
+        __device__ uint2 MinSSD(volatile unsigned int *col_ssd_cache, volatile unsigned int *col_ssd, const int X)
        {
            unsigned int ssd[N_DISPARITIES];

            //See above:  #define COL_SSD_SIZE (BLOCK_W + 2 * RADIUS)
-            ssd[0] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS));
+            ssd[0] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 0 * (BLOCK_W + 2 * RADIUS), X);
            __syncthreads();
-            ssd[1] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS));
+            ssd[1] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 1 * (BLOCK_W + 2 * RADIUS), X);
            __syncthreads();
-            ssd[2] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS));
+            ssd[2] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 2 * (BLOCK_W + 2 * RADIUS), X);
            __syncthreads();
-            ssd[3] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * RADIUS));
+            ssd[3] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 3 * (BLOCK_W + 2 * RADIUS), X);
            __syncthreads();
-            ssd[4] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * RADIUS));
+            ssd[4] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 4 * (BLOCK_W + 2 * RADIUS), X);
            __syncthreads();
-            ssd[5] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * RADIUS));
+            ssd[5] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 5 * (BLOCK_W + 2 * RADIUS), X);
            __syncthreads();
-            ssd[6] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * RADIUS));
+            ssd[6] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 6 * (BLOCK_W + 2 * RADIUS), X);
            __syncthreads();
-            ssd[7] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS));
+            ssd[7] = CalcSSD<RADIUS>(col_ssd_cache, col_ssd + 7 * (BLOCK_W + 2 * RADIUS), X);

            int mssd = ::min(::min(::min(ssd[0], ssd[1]), ::min(ssd[4], ssd[5])), ::min(::min(ssd[2], ssd[3]), ::min(ssd[6], ssd[7])));

@@ -243,12 +249,12 @@ namespace cv { namespace cuda { namespace device

            unsigned int* minSSDImage = cminSSDImage + X + Y * cminSSD_step;
            unsigned char* disparImage = disp.data + X + Y * disp.step;
-         /*   if (X < cwidth)
-            {
-                unsigned int *minSSDImage_end = minSSDImage + min(ROWSperTHREAD, cheight - Y) * minssd_step;
-                for(uint *ptr = minSSDImage; ptr != minSSDImage_end; ptr += minssd_step )
-                    *ptr = 0xFFFFFFFF;
-            }*/
+            //if (X < cwidth)
+            //{
+            //    unsigned int *minSSDImage_end = minSSDImage + min(ROWSperTHREAD, cheight - Y) * minssd_step;
+            //    for(uint *ptr = minSSDImage; ptr != minSSDImage_end; ptr += minssd_step )
+            //        *ptr = 0xFFFFFFFF;
+            //}
            int end_row = ::min(ROWSperTHREAD, cheight - Y - RADIUS);
            int y_tex;
            int x_tex = X - RADIUS;
@@ -268,13 +274,27 @@ namespace cv { namespace cuda { namespace device

                __syncthreads(); //before MinSSD function

-                if (X < cwidth - RADIUS && Y < cheight - RADIUS)
+                if (Y < cheight - RADIUS)
                {
-                    uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd);
-                    if (minSSD.x < minSSDImage[0])
+                    uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd, X);
+
+                    // For threads that do not satisfy the if condition below("X < cwidth - RADIUS"), previously
+                    // computed "minSSD" value, which is the result of "MinSSD" function call, is not used at all.
+                    //
+                    // However, since the "MinSSD" function has "__syncthreads" call in its body, those threads
+                    // must also call "MinSSD" to avoid deadlock. (#13850)
+                    //
+                    // From CUDA 9, using "__syncwarp" with proper mask value instead of using "__syncthreads"
+                    // could be an option, but the shared memory access pattern does not allow this option,
+                    // resulting in race condition. (Checked via "cuda-memcheck --tool racecheck")
+
+                    if (X < cwidth - RADIUS)
                    {
-                        disparImage[0] = (unsigned char)(d + minSSD.y);
-                        minSSDImage[0] = minSSD.x;
+                        if (minSSD.x < minSSDImage[0])
+                        {
+                            disparImage[0] = (unsigned char)(d + minSSD.y);
+                            minSSDImage[0] = minSSD.x;
+                        }
                    }
                }

@@ -295,17 +315,34 @@ namespace cv { namespace cuda { namespace device

                    __syncthreads(); //before MinSSD function

-                    if (X < cwidth - RADIUS && row < cheight - RADIUS - Y)
+                    if (row < cheight - RADIUS - Y)
                    {
-                        int idx = row * cminSSD_step;
-                        uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd);
-                        if (minSSD.x < minSSDImage[idx])
+                        uint2 minSSD = MinSSD<RADIUS>(col_ssd_cache + threadIdx.x, col_ssd, X);
+
+                        // For threads that do not satisfy the if condition below("X < cwidth - RADIUS"), previously
+                        // computed "minSSD" value, which is the result of "MinSSD" function call, is not used at all.
+                        //
+                        // However, since the "MinSSD" function has "__syncthreads" call in its body, those threads
+                        // must also call "MinSSD" to avoid deadlock. (#13850)
+                        //
+                        // From CUDA 9, using "__syncwarp" with proper mask value instead of using "__syncthreads"
+                        // could be an option, but the shared memory access pattern does not allow this option,
+                        // resulting in race condition. (Checked via "cuda-memcheck --tool racecheck")
+
+                        if (X < cwidth - RADIUS)
                        {
-                            disparImage[disp.step * row] = (unsigned char)(d + minSSD.y);
-                            minSSDImage[idx] = minSSD.x;
+                            int idx = row * cminSSD_step;
+                            if (minSSD.x < minSSDImage[idx])
+                            {
+                                disparImage[disp.step * row] = (unsigned char)(d + minSSD.y);
+                                minSSDImage[idx] = minSSD.x;
+                            }
                        }
                    }
                } // for row loop
+
+                __syncthreads(); // before initializing shared memory at the beginning of next loop
+
            } // for d loop
        }


--- a/modules/cudev/include/opencv2/cudev/warp/scan.hpp
+++ b/modules/cudev/include/opencv2/cudev/warp/scan.hpp
@@ -98,7 +98,7 @@ __device__ T warpScanInclusive(T data, volatile T* smem, uint tid)
    #pragma unroll
    for (int i = 1; i <= (WARP_SIZE / 2); i *= 2)
    {
-        const T val = shfl_up(data, i);
+        const T val = __shfl_up(data, i, WARP_SIZE);
        if (laneId >= i)
              data += val;
    }