Commit 09269b4c authored by Marina Kolpakova

fixed backward compatibility with CUDA compute capability less than 1.2

parent 3ac6272c
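The change wraps every atomic call in the touched LBP cascade kernels in a compile-time guard: on targets older than compute capability 1.2 (which, among other things, lack shared-memory atomics) the code now falls back to the software emulations added in the second file of this diff. Below is a minimal sketch of that guard pattern, not part of the commit; the function and parameter names are illustrative only.

// Sketch of the dispatch pattern used throughout this commit.
// __CUDA_ARCH__ is only defined while compiling device code, so the check
// selects the emulated helper when building for targets older than sm_12.
__device__ unsigned int reserveSlot(unsigned int* counter, unsigned int bound)
{
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
    return __atomicInc(counter, bound);   // software fallback defined further down in this diff
#else
    return atomicInc(counter, bound);     // hardware atomic, compute capability >= 1.2
#endif
}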
@@ -86,8 +86,11 @@ namespace cv { namespace gpu { namespace device
            rect.y = roundf(y * scale);
            rect.z = roundf(clWidth);
            rect.w = roundf(clHeight);
            int res = atomicInc(n, 100);
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
            int res = __atomicInc(n, 100U);
#else
            int res = atomicInc(n, 100U);
#endif
            objects(0, res) = rect;
        }
@@ -111,14 +114,24 @@ namespace cv { namespace gpu { namespace device
        __syncthreads();
        int cls = labels[tid];
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
        __atomicAdd((int*)(rrects + cls * 4 + 0), candidates[tid].x);
        __atomicAdd((int*)(rrects + cls * 4 + 1), candidates[tid].y);
        __atomicAdd((int*)(rrects + cls * 4 + 2), candidates[tid].z);
        __atomicAdd((int*)(rrects + cls * 4 + 3), candidates[tid].w);
#else
        atomicAdd((int*)(rrects + cls * 4 + 0), candidates[tid].x);
        atomicAdd((int*)(rrects + cls * 4 + 1), candidates[tid].y);
        atomicAdd((int*)(rrects + cls * 4 + 2), candidates[tid].z);
        atomicAdd((int*)(rrects + cls * 4 + 3), candidates[tid].w);
#endif
        labels[tid] = 0;
        __syncthreads();
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
        __atomicInc((unsigned int*)labels + cls, n);
#else
        atomicInc((unsigned int*)labels + cls, n);
#endif
        *nclasses = 0;
        int active = labels[tid];
@@ -154,7 +167,11 @@ namespace cv { namespace gpu { namespace device
            }
            if( j == n)
            {
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
                objects[__atomicInc(nclasses, n)] = VecTraits<int4>::make(r1[0], r1[1], r1[2], r1[3]);
#else
                objects[atomicInc(nclasses, n)] = VecTraits<int4>::make(r1[0], r1[1], r1[2], r1[3]);
#endif
            }
        }
    }
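For reference, the atomicInc(n, 100U) call in the first hunk reserves a unique slot in the output before the thread writes its rectangle: atomicInc returns the previous counter value and wraps the counter back to zero once it reaches the bound. The kernel below is a self-contained sketch of that append pattern with hypothetical names, not code taken from OpenCV.

// Each thread claims a distinct slot via atomicInc and writes its result there.
__global__ void appendResults(const int4* candidates, int numCandidates,
                              int4* out, unsigned int* count, unsigned int maxOut)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= numCandidates)
        return;

    // atomicInc stores ((old >= maxOut) ? 0 : old + 1) and returns old,
    // so this thread's slot is the previous value of the shared counter.
    unsigned int slot = atomicInc(count, maxOut);
    out[slot] = candidates[i];
}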
@@ -48,6 +48,46 @@
namespace cv { namespace gpu { namespace device {
namespace lbp{

#define TAG_MASK ( (1U << ( (sizeof(unsigned int) << 3) - 5U)) - 1U )

    template<typename T>
    __device__ __forceinline__ T __atomicInc(T* address, T val)
    {
        T count;
        unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
        do
        {
            count = *address & TAG_MASK;
            count = tag | (count + 1);
            *address = count;
        } while (*address != count);
        return (count & TAG_MASK) - 1;
    }

    template<typename T>
    __device__ __forceinline__ void __atomicAdd(T* address, T val)
    {
        T count;
        unsigned int tag = threadIdx.x << ( (sizeof(unsigned int) << 3) - 5U);
        do
        {
            count = *address & TAG_MASK;
            count = tag | (count + val);
            *address = count;
        } while (*address != count);
    }

    template<typename T>
    __device__ __forceinline__ T __atomicMin(T* address, T val)
    {
        T count = min(*address, val);
        do
        {
            *address = count;
        } while (*address > count);
        return count;
    }

    struct Stage
    {
        int first;
@@ -94,11 +134,19 @@ namespace lbp{
            if (p < q)
            {
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
                __atomicMin(labels + id, p);
#else
                atomicMin(labels + id, p);
#endif
            }
            else if (p > q)
            {
#if defined (__CUDA_ARCH__) && (__CUDA_ARCH__ < 120)
                __atomicMin(labels + tid, q);
#else
                atomicMin(labels + tid, q);
#endif
            }
        }
    }
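The emulated helpers above avoid hardware atomics with a tagging trick: the low 27 bits of the word hold the actual value (extracted with TAG_MASK) and the top 5 bits hold threadIdx.x, so the scheme assumes at most 32 contending threads. Each thread rewrites the word until its own tagged value is the one that survives, which serializes colliding updates. The small host-side program below, not part of the commit, just prints the bit layout the scheme relies on.

#include <cstdio>

int main()
{
    // Same mask as in the diff: low 27 bits for the value, high 5 bits for the tag.
    const unsigned int TAG_MASK = (1U << ((sizeof(unsigned int) << 3) - 5U)) - 1U; // 0x07ffffff

    unsigned int value = 42;   // counter currently stored at the address
    unsigned int tid   = 13;   // hypothetical threadIdx.x of the updating thread
    unsigned int tag   = tid << ((sizeof(unsigned int) << 3) - 5U);

    // What the emulated __atomicInc would write back: incremented value plus tag.
    unsigned int written = tag | ((value & TAG_MASK) + 1);

    printf("written 0x%08x  new value %u  tag %u\n",
           written, written & TAG_MASK, written >> 27);
    return 0;
}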