Merge pull request #13658 from nglee:dev_CudaShflUpCompat

dcdbaef3 · Alexander Alekhin · 20d82cf6 · 970293a2 · dcdbaef3 · dcdbaef3
Commit dcdbaef3 authored Jan 21, 2019 by Alexander Alekhin
6 changed files
--- a/modules/cudaimgproc/src/cuda/clahe.cu
+++ b/modules/cudaimgproc/src/cuda/clahe.cu
@@ -42,15 +42,9 @@
 #if !defined CUDA_DISABLER
-#include "opencv2/core/cuda/common.hpp"
+#include "opencv2/cudev.hpp"
-#include "opencv2/core/cuda/functional.hpp"
-#include "opencv2/core/cuda/emulation.hpp"
-#include "opencv2/core/cuda/scan.hpp"
-#include "opencv2/core/cuda/reduce.hpp"
-#include "opencv2/core/cuda/saturate_cast.hpp"
-using namespace cv::cuda;
+using namespace cv::cudev;
-using namespace cv::cuda::device;
 namespace clahe
 {
@@ -73,7 +67,7 @@ namespace clahe
            for (int j = threadIdx.x; j < tileSize.x; j += blockDim.x)
            {
                const int data = srcPtr[j];
-                Emulation::smem::atomicAdd(&smem[data], 1);
+                ::atomicAdd(&smem[data], 1);
            }
        }
@@ -96,7 +90,7 @@ namespace clahe
            // find number of overall clipped samples
-            reduce<256>(smem, clipped, tid, plus<int>());
+            blockReduce<256>(smem, clipped, tid, plus<int>());
            // broadcast evaluated value
@@ -128,10 +122,10 @@ namespace clahe
        calcLutKernel<<<grid, block, 0, stream>>>(src, lut, tileSize, tilesX, clipLimit, lutScale);
-        cudaSafeCall( cudaGetLastError() );
+        CV_CUDEV_SAFE_CALL( cudaGetLastError() );
        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
+            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
    }
    __global__ void transformKernel(const PtrStepSzb src, PtrStepb dst, const PtrStepb lut, const int2 tileSize, const int tilesX, const int tilesY)
@@ -173,13 +167,13 @@ namespace clahe
        const dim3 block(32, 8);
        const dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
-        cudaSafeCall( cudaFuncSetCacheConfig(transformKernel, cudaFuncCachePreferL1) );
+        CV_CUDEV_SAFE_CALL( cudaFuncSetCacheConfig(transformKernel, cudaFuncCachePreferL1) );
        transformKernel<<<grid, block, 0, stream>>>(src, dst, lut, tileSize, tilesX, tilesY);
-        cudaSafeCall( cudaGetLastError() );
+        CV_CUDEV_SAFE_CALL( cudaGetLastError() );
        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
+            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
    }
 }

--- a/modules/cudev/include/opencv2/cudev/block/scan.hpp
+++ b/modules/cudev/include/opencv2/cudev/block/scan.hpp
@@ -48,12 +48,134 @@
 #include "../common.hpp"
 #include "../warp/scan.hpp"
+#include "../warp/warp.hpp"
 namespace cv { namespace cudev {
 //! @addtogroup cudev
 //! @{
+#if __CUDACC_VER_MAJOR__ >= 9
+// Usage Note
+// - THREADS_NUM should be equal to the number of threads in this block.
+// - smem must be able to contain at least n elements of type T, where n is equal to the number
+//   of warps in this block. The number can be calculated by divUp(THREADS_NUM, WARP_SIZE).
+//
+// Dev Note
+// - Starting from CUDA 9.0, support for Fermi is dropped. So CV_CUDEV_ARCH >= 300 is implied.
+// - "For Pascal and earlier architectures (CV_CUDEV_ARCH < 700), all threads in mask must execute
+//    the same warp intrinsic instruction in convergence, and the union of all values in mask must
+//    be equal to the warp's active mask."
+//   (https://docs.nvidia.com/cuda/archive/10.0/cuda-c-programming-guide#independent-thread-scheduling-7-x)
+// - Above restriction does not apply starting from Volta (CV_CUDEV_ARCH >= 700). We just need to
+//   take care so that "all non-exited threads named in mask must execute the same intrinsic with
+//   the same mask."
+//   (https://docs.nvidia.com/cuda/archive/10.0/cuda-c-programming-guide#warp-description)
+template <int THREADS_NUM, typename T>
+__device__ T blockScanInclusive(T data, volatile T* smem, uint tid)
+{
+    const int residual = THREADS_NUM & (WARP_SIZE - 1);
+#if CV_CUDEV_ARCH < 700
+    const uint residual_mask = (1U << residual) - 1;
+#endif
+    if (THREADS_NUM > WARP_SIZE)
+    {
+        // bottom-level inclusive warp scan
+    #if CV_CUDEV_ARCH >= 700
+        T warpResult = warpScanInclusive(0xFFFFFFFFU, data);
+    #else
+        T warpResult;
+        if (0 == residual)
+            warpResult = warpScanInclusive(0xFFFFFFFFU, data);
+        else
+        {
+            const int n_warps = divUp(THREADS_NUM, WARP_SIZE);
+            const int warp_num = Warp::warpId();
+            if (warp_num < n_warps - 1)
+                warpResult = warpScanInclusive(0xFFFFFFFFU, data);
+            else
+            {
+                // We are at the last threads of a block whose number of threads
+                // is not a multiple of the warp size
+                warpResult = warpScanInclusive(residual_mask, data);
+            }
+        }
+    #endif
+        __syncthreads();
+        // save top elements of each warp for exclusive warp scan
+        // sync to wait for warp scans to complete (because smem is being overwritten)
+        if ((tid & (WARP_SIZE - 1)) == (WARP_SIZE - 1))
+        {
+            smem[tid >> LOG_WARP_SIZE] = warpResult;
+        }
+        __syncthreads();
+        int quot = THREADS_NUM / WARP_SIZE;
+        if (tid < quot)
+        {
+            // grab top warp elements
+            T val = smem[tid];
+            uint mask = (1LLU << quot) - 1;
+            if (0 == residual)
+            {
+                // calculate exclusive scan and write back to shared memory
+                smem[tid] = warpScanExclusive(mask, val);
+            }
+            else
+            {
+                // calculate inclusive scan and write back to shared memory with offset 1
+                smem[tid + 1] = warpScanInclusive(mask, val);
+                if (tid == 0)
+                    smem[0] = 0;
+            }
+        }
+        __syncthreads();
+        // return updated warp scans
+        return warpResult + smem[tid >> LOG_WARP_SIZE];
+    }
+    else
+    {
+    #if CV_CUDEV_ARCH >= 700
+        return warpScanInclusive(0xFFFFFFFFU, data);
+    #else
+        if (THREADS_NUM == WARP_SIZE)
+            return warpScanInclusive(0xFFFFFFFFU, data);
+        else
+            return warpScanInclusive(residual_mask, data);
+    #endif
+    }
+}
+template <int THREADS_NUM, typename T>
+__device__ __forceinline__ T blockScanExclusive(T data, volatile T* smem, uint tid)
+{
+    return blockScanInclusive<THREADS_NUM>(data, smem, tid) - data;
+}
+#else // __CUDACC_VER_MAJOR__ >= 9
+// Usage Note
+// - THREADS_NUM should be equal to the number of threads in this block.
+// - (>= Kepler) smem must be able to contain at least n elements of type T, where n is equal to the number
+//   of warps in this block. The number can be calculated by divUp(THREADS_NUM, WARP_SIZE).
+// - (Fermi) smem must be able to contain at least n elements of type T, where n is equal to the number
+//   of threads in this block (= THREADS_NUM).
 template <int THREADS_NUM, typename T>
 __device__ T blockScanInclusive(T data, volatile T* smem, uint tid)
 {
@@ -73,18 +195,31 @@ __device__ T blockScanInclusive(T data, volatile T* smem, uint tid)
        __syncthreads();
-        if (tid < (THREADS_NUM / WARP_SIZE))
+        int quot = THREADS_NUM / WARP_SIZE;
+        if (tid < quot)
        {
            // grab top warp elements
            T val = smem[tid];
-            // calculate exclusive scan and write back to shared memory
+            if (0 == (THREADS_NUM & (WARP_SIZE - 1)))
-            smem[tid] = warpScanExclusive(val, smem, tid);
+            {
+                // calculate exclusive scan and write back to shared memory
+                smem[tid] = warpScanExclusive(val, smem, tid);
+            }
+            else
+            {
+                // calculate inclusive scan and write back to shared memory with offset 1
+                smem[tid + 1] = warpScanInclusive(val, smem, tid);
+                if (tid == 0)
+                    smem[0] = 0;
+            }
        }
        __syncthreads();
-        // return updated warp scans with exclusive scan results
+        // return updated warp scans
        return warpResult + smem[tid >> LOG_WARP_SIZE];
    }
    else
@@ -99,6 +234,8 @@ __device__ __forceinline__ T blockScanExclusive(T data, volatile T* smem, uint t
    return blockScanInclusive<THREADS_NUM>(data, smem, tid) - data;
 }
+#endif // __CUDACC_VER_MAJOR__ >= 9
 //! @}
 }}

--- a/modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp
@@ -215,7 +215,7 @@ namespace integral_detail
        #pragma unroll
        for (int i = 1; i < 32; i *= 2)
        {
-            const int n = shfl_up(sum, i, 32);
+            const int n = compatible_shfl_up(sum, i, 32);
            if (lane_id >= i)
            {
@@ -245,9 +245,9 @@ namespace integral_detail
            int warp_sum = sums[lane_id];
            #pragma unroll
-            for (int i = 1; i <= 32; i *= 2)
+            for (int i = 1; i < 32; i *= 2)
            {
-                const int n = shfl_up(warp_sum, i, 32);
+                const int n = compatible_shfl_up(warp_sum, i, 32);
                if (lane_id >= i)
                    warp_sum += n;
@@ -453,7 +453,7 @@ namespace integral_detail
            for (int i = 1; i <= 8; i *= 2)
            {
-                T n = shfl_up(partial_sum, i, 32);
+                T n = compatible_shfl_up(partial_sum, i, 32);
                if (lane_id >= i)
                    partial_sum += n;

--- a/modules/cudev/include/opencv2/cudev/warp/scan.hpp
+++ b/modules/cudev/include/opencv2/cudev/warp/scan.hpp
@@ -55,6 +55,36 @@ namespace cv { namespace cudev {
 //! @addtogroup cudev
 //! @{
+#if __CUDACC_VER_MAJOR__ >= 9
+// Starting from CUDA 9.0, support for Fermi is dropped.
+// So CV_CUDEV_ARCH >= 300 is implied.
+template <typename T>
+__device__ T warpScanInclusive(uint mask, T data)
+{
+    const uint laneId = Warp::laneId();
+    // scan on shufl functions
+    #pragma unroll
+    for (int i = 1; i <= (WARP_SIZE / 2); i *= 2)
+    {
+        const T val = shfl_up_sync(mask, data, i);
+        if (laneId >= i)
+              data += val;
+    }
+    return data;
+}
+template <typename T>
+__device__ __forceinline__ T warpScanExclusive(uint mask, T data)
+{
+    return warpScanInclusive(mask, data) - data;
+}
+#else // __CUDACC_VER_MAJOR__ >= 9
 template <typename T>
 __device__ T warpScanInclusive(T data, volatile T* smem, uint tid)
 {
@@ -75,19 +105,16 @@ __device__ T warpScanInclusive(T data, volatile T* smem, uint tid)
    return data;
 #else
-    uint pos = 2 * tid - (tid & (WARP_SIZE - 1));
+    const uint laneId = Warp::laneId();
-    smem[pos] = 0;
-    pos += WARP_SIZE;
+    smem[tid] = data;
-    smem[pos] = data;
-    smem[pos] += smem[pos - 1];
+    #pragma unroll
-    smem[pos] += smem[pos - 2];
+    for (int i = 1; i <= (WARP_SIZE / 2); i *= 2)
-    smem[pos] += smem[pos - 4];
+        if (laneId >= i)
-    smem[pos] += smem[pos - 8];
+            smem[tid] += smem[tid - i];
-    smem[pos] += smem[pos - 16];
-    return smem[pos];
+    return smem[tid];
 #endif
 }
@@ -97,6 +124,8 @@ __device__ __forceinline__ T warpScanExclusive(T data, volatile T* smem, uint ti
    return warpScanInclusive(data, smem, tid) - data;
 }
+#endif // __CUDACC_VER_MAJOR__ >= 9
 //! @}
 }}

--- a/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp
+++ b/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp
@@ -48,6 +48,8 @@
 #include "../common.hpp"
 #include "../util/vec_traits.hpp"
+#include "../block/block.hpp"
+#include "warp.hpp"
 namespace cv { namespace cudev {
@@ -59,7 +61,7 @@ namespace cv { namespace cudev {
 #if __CUDACC_VER_MAJOR__ >= 9
 #  define __shfl(x, y, z) __shfl_sync(0xFFFFFFFFU, x, y, z)
 #  define __shfl_xor(x, y, z) __shfl_xor_sync(0xFFFFFFFFU, x, y, z)
-#  define __shfl_up(x, y, z) __shfl_up_sync(0xFFFFFFFFU, x, y, z)
+//#  define __shfl_up(x, y, z) __shfl_up_sync(0xFFFFFFFFU, x, y, z)
 #  define __shfl_down(x, y, z) __shfl_down_sync(0xFFFFFFFFU, x, y, z)
 #endif
@@ -155,6 +157,53 @@ CV_CUDEV_SHFL_VEC_INST(double)
 // shfl_up
+template <typename T>
+__device__ __forceinline__ T compatible_shfl_up(T val, uint delta, int width = warpSize)
+{
+#if __CUDACC_VER_MAJOR__ < 9
+    return shfl_up(val, delta, width);
+#else // __CUDACC_VER_MAJOR__ < 9
+#if CV_CUDEV_ARCH >= 700
+    return shfl_up_sync(0xFFFFFFFFU, val, delta, width);
+#else
+    const int block_size = Block::blockSize();
+    const int residual = block_size & (warpSize - 1);
+    if (0 == residual)
+        return shfl_up_sync(0xFFFFFFFFU, val, delta, width);
+    else
+    {
+        const int n_warps = divUp(block_size, warpSize);
+        const int warp_id = Warp::warpId();
+        if (warp_id < n_warps - 1)
+            return shfl_up_sync(0xFFFFFFFFU, val, delta, width);
+        else
+        {
+            // We are at the last threads of a block whose number of threads
+            // is not a multiple of the warp size
+            uint mask = (1LU << residual) - 1;
+            return shfl_up_sync(mask, val, delta, width);
+        }
+    }
+#endif
+#endif // __CUDACC_VER_MAJOR__ < 9
+}
+#if __CUDACC_VER_MAJOR__ >= 9
+template <typename T>
+__device__ __forceinline__ T shfl_up_sync(uint mask, T val, uint delta, int width = warpSize)
+{
+    return (T) __shfl_up_sync(mask, val, delta, width);
+}
+#else
 __device__ __forceinline__ uchar shfl_up(uchar val, uint delta, int width = warpSize)
 {
    return (uchar) __shfl_up((int) val, delta, width);
@@ -244,6 +293,8 @@ CV_CUDEV_SHFL_UP_VEC_INST(double)
 #undef CV_CUDEV_SHFL_UP_VEC_INST
+#endif
 // shfl_down
 __device__ __forceinline__ uchar shfl_down(uchar val, uint delta, int width = warpSize)

--- a/modules/cudev/test/test_scan.cu
+++ b/modules/cudev/test/test_scan.cu
+#include "test_precomp.hpp"
+using namespace cv;
+using namespace cv::cudev;
+using namespace cvtest;
+// BlockScanInt
+template <int THREADS_NUM>
+__global__ void int_kernel(int* data)
+{
+    uint tid = Block::threadLineId();
+#if CV_CUDEV_ARCH >= 300
+    const int n_warps = (THREADS_NUM - 1) / WARP_SIZE + 1;
+    __shared__ int smem[n_warps];
+#else
+    __shared__ int smem[THREADS_NUM];
+#endif
+    data[tid] = blockScanInclusive<THREADS_NUM>(data[tid], smem, tid);
+}
+#define BLOCK_SCAN_INT_TEST(block_size)                                 \
+    TEST(BlockScanInt, BlockSize##block_size)                           \
+    {                                                                   \
+        Mat src = randomMat(Size(block_size, 1), CV_32SC1, 0, 1024);    \
+                                                                        \
+        GpuMat d_src;                                                   \
+        d_src.upload(src);                                              \
+                                                                        \
+        for (int col = 1; col < block_size; col++)                      \
+            src.at<int>(0, col) += src.at<int>(0, col - 1);             \
+                                                                        \
+        int_kernel<block_size><<<1, block_size>>>((int*)d_src.data);    \
+                                                                        \
+        CV_CUDEV_SAFE_CALL(cudaDeviceSynchronize());                    \
+                                                                        \
+        EXPECT_MAT_NEAR(d_src, src, 0);                                 \
+    }
+BLOCK_SCAN_INT_TEST(29)
+BLOCK_SCAN_INT_TEST(30)
+BLOCK_SCAN_INT_TEST(32)
+BLOCK_SCAN_INT_TEST(40)
+BLOCK_SCAN_INT_TEST(41)
+BLOCK_SCAN_INT_TEST(59)
+BLOCK_SCAN_INT_TEST(60)
+BLOCK_SCAN_INT_TEST(64)
+BLOCK_SCAN_INT_TEST(70)
+BLOCK_SCAN_INT_TEST(71)
+BLOCK_SCAN_INT_TEST(109)
+BLOCK_SCAN_INT_TEST(110)
+BLOCK_SCAN_INT_TEST(128)
+BLOCK_SCAN_INT_TEST(130)
+BLOCK_SCAN_INT_TEST(131)
+BLOCK_SCAN_INT_TEST(189)
+BLOCK_SCAN_INT_TEST(200)
+BLOCK_SCAN_INT_TEST(256)
+BLOCK_SCAN_INT_TEST(300)
+BLOCK_SCAN_INT_TEST(311)
+BLOCK_SCAN_INT_TEST(489)
+BLOCK_SCAN_INT_TEST(500)
+BLOCK_SCAN_INT_TEST(512)
+BLOCK_SCAN_INT_TEST(600)
+BLOCK_SCAN_INT_TEST(611)
+BLOCK_SCAN_INT_TEST(1024)
+// BlockScanDouble
+template <int THREADS_NUM>
+__global__ void double_kernel(double* data)
+{
+    uint tid = Block::threadLineId();
+#if CV_CUDEV_ARCH >= 300
+    const int n_warps = (THREADS_NUM - 1) / WARP_SIZE + 1;
+    __shared__ double smem[n_warps];
+#else
+    __shared__ double smem[THREADS_NUM];
+#endif
+    data[tid] = blockScanInclusive<THREADS_NUM>(data[tid], smem, tid);
+}
+#define BLOCK_SCAN_DOUBLE_TEST(block_size)                                  \
+    TEST(BlockScanDouble, BlockSize##block_size)                            \
+    {                                                                       \
+        Mat src = randomMat(Size(block_size, 1), CV_64FC1, 0.0, 1.0);       \
+                                                                            \
+        GpuMat d_src;                                                       \
+        d_src.upload(src);                                                  \
+                                                                            \
+        for (int col = 1; col < block_size; col++)                          \
+            src.at<double>(0, col) += src.at<double>(0, col - 1);           \
+                                                                            \
+        double_kernel<block_size><<<1, block_size>>>((double*)d_src.data);  \
+                                                                            \
+        CV_CUDEV_SAFE_CALL(cudaDeviceSynchronize());                        \
+                                                                            \
+        EXPECT_MAT_NEAR(d_src, src, 1e-10);                                 \
+    }
+BLOCK_SCAN_DOUBLE_TEST(29)
+BLOCK_SCAN_DOUBLE_TEST(30)
+BLOCK_SCAN_DOUBLE_TEST(32)
+BLOCK_SCAN_DOUBLE_TEST(40)
+BLOCK_SCAN_DOUBLE_TEST(41)
+BLOCK_SCAN_DOUBLE_TEST(59)
+BLOCK_SCAN_DOUBLE_TEST(60)
+BLOCK_SCAN_DOUBLE_TEST(64)
+BLOCK_SCAN_DOUBLE_TEST(70)
+BLOCK_SCAN_DOUBLE_TEST(71)
+BLOCK_SCAN_DOUBLE_TEST(109)
+BLOCK_SCAN_DOUBLE_TEST(110)
+BLOCK_SCAN_DOUBLE_TEST(128)
+BLOCK_SCAN_DOUBLE_TEST(130)
+BLOCK_SCAN_DOUBLE_TEST(131)
+BLOCK_SCAN_DOUBLE_TEST(189)
+BLOCK_SCAN_DOUBLE_TEST(200)
+BLOCK_SCAN_DOUBLE_TEST(256)
+BLOCK_SCAN_DOUBLE_TEST(300)
+BLOCK_SCAN_DOUBLE_TEST(311)
+BLOCK_SCAN_DOUBLE_TEST(489)
+BLOCK_SCAN_DOUBLE_TEST(500)
+BLOCK_SCAN_DOUBLE_TEST(512)
+BLOCK_SCAN_DOUBLE_TEST(600)
+BLOCK_SCAN_DOUBLE_TEST(611)
+BLOCK_SCAN_DOUBLE_TEST(1024)