Commit debdc26b authored by Alexander Alekhin's avatar Alexander Alekhin

Merge moved code from opencv

parents 33f18dd6 02b3d77b
......@@ -157,43 +157,6 @@ CV_CUDEV_SHFL_VEC_INST(double)
// shfl_up
template <typename T>
__device__ __forceinline__ T compatible_shfl_up(T val, uint delta, int width = warpSize)
{
#if __CUDACC_VER_MAJOR__ < 9
return shfl_up(val, delta, width);
#else // __CUDACC_VER_MAJOR__ < 9
#if CV_CUDEV_ARCH >= 700
return shfl_up_sync(0xFFFFFFFFU, val, delta, width);
#else
const int block_size = Block::blockSize();
const int residual = block_size & (warpSize - 1);
if (0 == residual)
return shfl_up_sync(0xFFFFFFFFU, val, delta, width);
else
{
const int n_warps = divUp(block_size, warpSize);
const int warp_id = Warp::warpId();
if (warp_id < n_warps - 1)
return shfl_up_sync(0xFFFFFFFFU, val, delta, width);
else
{
// We are at the last threads of a block whose number of threads
// is not a multiple of the warp size
uint mask = (1LU << residual) - 1;
return shfl_up_sync(mask, val, delta, width);
}
}
#endif
#endif // __CUDACC_VER_MAJOR__ < 9
}
#if __CUDACC_VER_MAJOR__ >= 9
template <typename T>
......@@ -300,6 +263,43 @@ CV_CUDEV_SHFL_UP_VEC_INST(double)
#endif
template <typename T>
__device__ __forceinline__ T compatible_shfl_up(T val, uint delta, int width = warpSize)
{
#if __CUDACC_VER_MAJOR__ < 9
return shfl_up(val, delta, width);
#else // __CUDACC_VER_MAJOR__ < 9
#if CV_CUDEV_ARCH >= 700
return shfl_up_sync(0xFFFFFFFFU, val, delta, width);
#else
const int block_size = Block::blockSize();
const int residual = block_size & (warpSize - 1);
if (0 == residual)
return shfl_up_sync(0xFFFFFFFFU, val, delta, width);
else
{
const int n_warps = divUp(block_size, warpSize);
const int warp_id = Warp::warpId();
if (warp_id < n_warps - 1)
return shfl_up_sync(0xFFFFFFFFU, val, delta, width);
else
{
// We are at the last threads of a block whose number of threads
// is not a multiple of the warp size
uint mask = (1LU << residual) - 1;
return shfl_up_sync(mask, val, delta, width);
}
}
#endif
#endif // __CUDACC_VER_MAJOR__ < 9
}
// shfl_down
__device__ __forceinline__ uchar shfl_down(uchar val, uint delta, int width = warpSize)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment