Merge moved code from opencv

f65010ea · Alexander Alekhin · ca7cb77a · cade1950 · f65010ea · f65010ea
Commit f65010ea authored Feb 26, 2019 by Alexander Alekhin
4 changed files
--- a/modules/cudev/include/opencv2/cudev/block/scan.hpp
+++ b/modules/cudev/include/opencv2/cudev/block/scan.hpp
@@ -135,6 +135,12 @@ __device__ T blockScanInclusive(T data, volatile T* smem, uint tid)
            }
            else
            {
+                // Each lane reads smem[tid]        (T val = smem[tid])
+                // and then writes smem[tid + 1]    (smem[tid + 1] = warpScanInclusive(mask, val)),
+                // i.e. the slot its neighbouring lane reads, so the two accesses should be explicitly
+                // fenced by "__syncwarp" to get rid of "cuda-memcheck --tool racecheck" warnings.
+                __syncwarp(mask);
+
                // calculate inclusive scan and write back to shared memory with offset 1
                smem[tid + 1] = warpScanInclusive(mask, val);

@@ -197,10 +203,18 @@ __device__ T blockScanInclusive(T data, volatile T* smem, uint tid)

        int quot = THREADS_NUM / WARP_SIZE;

+        T val;
+
        if (tid < quot)
        {
            // grab top warp elements
-            T val = smem[tid];
+            val = smem[tid];
+        }
+
+        __syncthreads();
+
+        if (tid < quot)
+        {

            if (0 == (THREADS_NUM & (WARP_SIZE - 1)))
            {

--- a/modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp
+++ b/modules/cudev/include/opencv2/cudev/grid/detail/integral.hpp
@@ -63,7 +63,8 @@ namespace integral_detail
        __shared__ D smem[NUM_SCAN_THREADS * 2];
        __shared__ D carryElem;

-        carryElem = 0;
+        if (threadIdx.x == 0)
+            carryElem = 0;

        __syncthreads();

@@ -105,7 +106,8 @@ namespace integral_detail
        __shared__ D smem[NUM_SCAN_THREADS * 2];
        __shared__ D carryElem;

-        carryElem = 0;
+        if (threadIdx.x == 0)
+            carryElem = 0;

        __syncthreads();


--- a/modules/cudev/include/opencv2/cudev/warp/scan.hpp
+++ b/modules/cudev/include/opencv2/cudev/warp/scan.hpp
@@ -98,7 +98,7 @@ __device__ T warpScanInclusive(T data, volatile T* smem, uint tid)
    #pragma unroll
    for (int i = 1; i <= (WARP_SIZE / 2); i *= 2)
    {
-        const T val = __shfl_up(data, i, WARP_SIZE);
+        const T val = shfl_up(data, i);
        if (laneId >= i)
              data += val;
    }

--- a/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp
+++ b/modules/cudev/include/opencv2/cudev/warp/shuffle.hpp
@@ -250,6 +250,11 @@ __device__ double shfl_up(double val, uint delta, int width = warpSize)
    return __hiloint2double(hi, lo);
 }

+__device__ __forceinline__ unsigned long long shfl_up(unsigned long long val, uint delta, int width = warpSize)
+{
+    return __shfl_up(val, delta, width);
+}
+
 #define CV_CUDEV_SHFL_UP_VEC_INST(input_type) \
    __device__ __forceinline__ input_type ## 1 shfl_up(const input_type ## 1 & val, uint delta, int width = warpSize) \
    { \