Commit cade1950 authored by Namgoo Lee's avatar Namgoo Lee Committed by Namgoo Lee

[moved from opencv] cudev - Rework some code

- Use shfl_down, instead of __shfl_down, on warp scan
- Remove race conditions

original commit: https://github.com/opencv/opencv/commit/2b6be3cb0f937014a73e9dcaf38f611e35b19c6c
parent ca7cb77a
......@@ -135,6 +135,12 @@ __device__ T blockScanInclusive(T data, volatile T* smem, uint tid)
}
else
{
// Read from smem[tid] (T val = smem[tid])
// and write to smem[tid + 1] (smem[tid + 1] = warpScanInclusive(mask, val))
// should be explicitly fenced by "__syncwarp" to get rid of
// "cuda-memcheck --tool racecheck" warnings.
__syncwarp(mask);
// calculate inclusive scan and write back to shared memory with offset 1
smem[tid + 1] = warpScanInclusive(mask, val);
......@@ -197,10 +203,18 @@ __device__ T blockScanInclusive(T data, volatile T* smem, uint tid)
int quot = THREADS_NUM / WARP_SIZE;
T val;
if (tid < quot)
{
// grab top warp elements
T val = smem[tid];
val = smem[tid];
}
__syncthreads();
if (tid < quot)
{
if (0 == (THREADS_NUM & (WARP_SIZE - 1)))
{
......
......@@ -63,6 +63,7 @@ namespace integral_detail
__shared__ D smem[NUM_SCAN_THREADS * 2];
__shared__ D carryElem;
if (threadIdx.x == 0)
carryElem = 0;
__syncthreads();
......@@ -105,6 +106,7 @@ namespace integral_detail
__shared__ D smem[NUM_SCAN_THREADS * 2];
__shared__ D carryElem;
if (threadIdx.x == 0)
carryElem = 0;
__syncthreads();
......
......@@ -98,7 +98,7 @@ __device__ T warpScanInclusive(T data, volatile T* smem, uint tid)
#pragma unroll
for (int i = 1; i <= (WARP_SIZE / 2); i *= 2)
{
const T val = __shfl_up(data, i, WARP_SIZE);
const T val = shfl_up(data, i);
if (laneId >= i)
data += val;
}
......
......@@ -250,6 +250,11 @@ __device__ double shfl_up(double val, uint delta, int width = warpSize)
return __hiloint2double(hi, lo);
}
__device__ __forceinline__ unsigned long long shfl_up(unsigned long long val, uint delta, int width = warpSize)
{
return __shfl_up(val, delta, width);
}
#define CV_CUDEV_SHFL_UP_VEC_INST(input_type) \
__device__ __forceinline__ input_type ## 1 shfl_up(const input_type ## 1 & val, uint delta, int width = warpSize) \
{ \
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment