Commit 63a022dc authored by Vladislav Vinogradov

added explicit unroll to reduce implementation

parent 11c6eb63
...@@ -243,29 +243,46 @@ namespace cv { namespace gpu { namespace device ...@@ -243,29 +243,46 @@ namespace cv { namespace gpu { namespace device
} }
}; };
template <unsigned int STEP, typename PtrT, typename RefT, class OpT>
struct Unroll
{
    // Compile-time unrolled reduction helper: emits one merge step of
    // stride STEP, then recurses with STEP/2; the <0, ...> partial
    // specialization terminates the recursion.
    static __device__ void loopShfl(RefT val, OpT op, unsigned int N)
    {
        // Shuffle-based variant (no shared memory).
        // mergeShfl is defined elsewhere in this file.
        mergeShfl(val, STEP, N, op);
        Unroll<STEP / 2, PtrT, RefT, OpT>::loopShfl(val, op, N);
    }
    static __device__ void loop(PtrT smem, RefT val, unsigned int tid, OpT op)
    {
        // Shared-memory variant; merge is defined elsewhere in this file.
        merge(smem, val, tid, STEP, op);
        Unroll<STEP / 2, PtrT, RefT, OpT>::loop(smem, val, tid, op);
    }
};
template <typename PtrT, typename RefT, class OpT>
struct Unroll<0, PtrT, RefT, OpT>
{
    // Recursion terminator: a stride of zero means every merge step has
    // already been emitted, so both entry points are intentionally empty.
    static __device__ void loopShfl(RefT, OpT, unsigned int) {}
    static __device__ void loop(PtrT, RefT, unsigned int, OpT) {}
};
template <unsigned int N> struct WarpOptimized template <unsigned int N> struct WarpOptimized
{ {
template <typename Pointer, typename Reference, class Op> template <typename Pointer, typename Reference, class Op>
static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op) static __device__ void reduce(Pointer smem, Reference val, unsigned int tid, Op op)
{ {
#if __CUDA_ARCH >= 300 #if __CUDA_ARCH__ >= 300
(void) smem; (void) smem;
(void) tid; (void) tid;
#pragma unroll Unroll<N / 2, Pointer, Reference, Op>::loopShfl(val, op, N);
for (unsigned int i = N / 2; i >= 1; i /= 2)
mergeShfl(val, i, N, op);
#else #else
loadToSmem(smem, val, tid); loadToSmem(smem, val, tid);
if (tid < N / 2) if (tid < N / 2)
{ Unroll<N / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
#if __CUDA_ARCH__ >= 200
#pragma unroll
#endif
for (unsigned int i = N / 2; i >= 1; i /= 2)
merge(smem, val, tid, i, op);
}
#endif #endif
} }
}; };
...@@ -279,10 +296,8 @@ namespace cv { namespace gpu { namespace device ...@@ -279,10 +296,8 @@ namespace cv { namespace gpu { namespace device
{ {
const unsigned int laneId = Warp::laneId(); const unsigned int laneId = Warp::laneId();
#if __CUDA_ARCH >= 300 #if __CUDA_ARCH__ >= 300
#pragma unroll Unroll<16, Pointer, Reference, Op>::loopShfl(val, op, warpSize);
for (int i = 16; i >= 1; i /= 2)
mergeShfl(val, i, warpSize, op);
if (laneId == 0) if (laneId == 0)
loadToSmem(smem, val, tid / 32); loadToSmem(smem, val, tid / 32);
...@@ -290,13 +305,7 @@ namespace cv { namespace gpu { namespace device ...@@ -290,13 +305,7 @@ namespace cv { namespace gpu { namespace device
loadToSmem(smem, val, tid); loadToSmem(smem, val, tid);
if (laneId < 16) if (laneId < 16)
{ Unroll<16, Pointer, Reference, Op>::loop(smem, val, tid, op);
#if __CUDA_ARCH__ >= 200
#pragma unroll
#endif
for (int i = 16; i >= 1; i /= 2)
merge(smem, val, tid, i, op);
}
__syncthreads(); __syncthreads();
...@@ -310,16 +319,10 @@ namespace cv { namespace gpu { namespace device ...@@ -310,16 +319,10 @@ namespace cv { namespace gpu { namespace device
if (tid < 32) if (tid < 32)
{ {
#if __CUDA_ARCH >= 300 #if __CUDA_ARCH__ >= 300
#pragma unroll Unroll<M / 2, Pointer, Reference, Op>::loopShfl(val, op, M);
for (int i = M / 2; i >= 1; i /= 2)
mergeShfl(val, i, M, op);
#else #else
#if __CUDA_ARCH__ >= 200 Unroll<M / 2, Pointer, Reference, Op>::loop(smem, val, tid, op);
#pragma unroll
#endif
for (int i = M / 2; i >= 1; i /= 2)
merge(smem, val, tid, i, op);
#endif #endif
} }
} }
......
...@@ -369,31 +369,48 @@ namespace cv { namespace gpu { namespace device ...@@ -369,31 +369,48 @@ namespace cv { namespace gpu { namespace device
} }
}; };
template <unsigned int STEP, class KeyPtr, class KeyRef, class ValPtr, class ValRef, class CmpOp>
struct Unroll
{
    // Compile-time unrolled key/value reduction helper: emits one merge
    // step of stride STEP, then recurses with STEP/2; the <0, ...> partial
    // specialization terminates the recursion.
    static __device__ void loopShfl(KeyRef key, ValRef val, CmpOp cmp, unsigned int N)
    {
        // Shuffle-based variant (no shared memory).
        // mergeShfl is defined elsewhere in this file.
        mergeShfl(key, val, cmp, STEP, N);
        Unroll<STEP / 2, KeyPtr, KeyRef, ValPtr, ValRef, CmpOp>::loopShfl(key, val, cmp, N);
    }
    static __device__ void loop(KeyPtr skeys, KeyRef key, ValPtr svals, ValRef val, unsigned int tid, CmpOp cmp)
    {
        // Shared-memory variant; merge is defined elsewhere in this file.
        merge(skeys, key, svals, val, cmp, tid, STEP);
        Unroll<STEP / 2, KeyPtr, KeyRef, ValPtr, ValRef, CmpOp>::loop(skeys, key, svals, val, tid, cmp);
    }
};
template <class KeyPtr, class KeyRef, class ValPtr, class ValRef, class CmpOp>
struct Unroll<0, KeyPtr, KeyRef, ValPtr, ValRef, CmpOp>
{
    // Recursion terminator: a stride of zero means every merge step has
    // already been emitted, so both entry points are intentionally empty.
    static __device__ void loopShfl(KeyRef, ValRef, CmpOp, unsigned int) {}
    static __device__ void loop(KeyPtr, KeyRef, ValPtr, ValRef, unsigned int, CmpOp) {}
};
template <unsigned int N> struct WarpOptimized template <unsigned int N> struct WarpOptimized
{ {
template <class KP, class KR, class VP, class VR, class Cmp> template <class KP, class KR, class VP, class VR, class Cmp>
static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp) static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
{ {
#if __CUDA_ARCH >= 300 #if __CUDA_ARCH__ >= 300
(void) skeys; (void) skeys;
(void) svals; (void) svals;
(void) tid; (void) tid;
#pragma unroll Unroll<N / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
for (unsigned int i = N / 2; i >= 1; i /= 2)
mergeShfl(key, val, cml, i, N);
#else #else
loadToSmem(skeys, key, tid); loadToSmem(skeys, key, tid);
loadToSmem(svals, val, tid); loadToSmem(svals, val, tid);
if (tid < N / 2) if (tid < N / 2)
{ Unroll<N / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
#if __CUDA_ARCH__ >= 200
#pragma unroll
#endif
for (unsigned int i = N / 2; i >= 1; i /= 2)
merge(skeys, key, svals, val, cmp, tid, i);
}
#endif #endif
} }
}; };
...@@ -407,10 +424,8 @@ namespace cv { namespace gpu { namespace device ...@@ -407,10 +424,8 @@ namespace cv { namespace gpu { namespace device
{ {
const unsigned int laneId = Warp::laneId(); const unsigned int laneId = Warp::laneId();
#if __CUDA_ARCH >= 300 #if __CUDA_ARCH__ >= 300
#pragma unroll Unroll<16, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, warpSize);
for (unsigned int i = 16; i >= 1; i /= 2)
mergeShfl(key, val, cml, i, warpSize);
if (laneId == 0) if (laneId == 0)
{ {
...@@ -422,13 +437,7 @@ namespace cv { namespace gpu { namespace device ...@@ -422,13 +437,7 @@ namespace cv { namespace gpu { namespace device
loadToSmem(svals, val, tid); loadToSmem(svals, val, tid);
if (laneId < 16) if (laneId < 16)
{ Unroll<16, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
#if __CUDA_ARCH__ >= 200
#pragma unroll
#endif
for (int i = 16; i >= 1; i /= 2)
merge(skeys, key, svals, val, cmp, tid, i);
}
__syncthreads(); __syncthreads();
...@@ -445,18 +454,12 @@ namespace cv { namespace gpu { namespace device ...@@ -445,18 +454,12 @@ namespace cv { namespace gpu { namespace device
if (tid < 32) if (tid < 32)
{ {
#if __CUDA_ARCH >= 300 #if __CUDA_ARCH__ >= 300
loadFromSmem(svals, val, tid); loadFromSmem(svals, val, tid);
#pragma unroll Unroll<M / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, M);
for (unsigned int i = M / 2; i >= 1; i /= 2)
mergeShfl(key, val, cml, i, M);
#else #else
#if __CUDA_ARCH__ >= 200 Unroll<M / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
#pragma unroll
#endif
for (unsigned int i = M / 2; i >= 1; i /= 2)
merge(skeys, key, svals, val, cmp, tid, i);
#endif #endif
} }
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment