Commit 3da253a2 authored by Alexey Spizhevoy

replaced the one-thread tail reduce with a one-block tail reduce in gpu functions minMax and minMaxLoc

parent 9ebaaecc
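For context: the old tail reduce had a single thread of the last block walk serially over every per-block partial result, while the new code loads the partials into shared memory and reuses the same block-wide tree reduction (find_min_max_in_smem / find_min_max_loc_in_smem) that the first stage already performs, so all 256 threads of the tail block participate. Below is a minimal standalone sketch of that reduction pattern; it is illustrative only, and BLOCK_SIZE, partials and reduce_min_kernel are hypothetical names, not identifiers from this patch.

    // Illustrative sketch of a block-wide (tree) min reduction; hypothetical names,
    // not code from the patch. Assumes 1 <= size <= BLOCK_SIZE.
    #define BLOCK_SIZE 256

    __global__ void reduce_min_kernel(const float* partials, unsigned int size, float* result)
    {
        __shared__ float smin[BLOCK_SIZE];

        unsigned int tid = threadIdx.x;

        // Each thread loads one partial result; threads past the end re-load the
        // last element, which is harmless for a min reduction.
        smin[tid] = partials[min(tid, size - 1)];
        __syncthreads();

        // Halve the number of active threads each step instead of letting one
        // thread loop over all elements.
        if (tid < 128) smin[tid] = fminf(smin[tid], smin[tid + 128]);
        __syncthreads();
        if (tid < 64) smin[tid] = fminf(smin[tid], smin[tid + 64]);
        __syncthreads();

        if (tid < 32) // last warp: ran in lockstep on the hardware this code targeted
        {
            volatile float* vmin = smin;
            vmin[tid] = fminf(vmin[tid], vmin[tid + 32]);
            vmin[tid] = fminf(vmin[tid], vmin[tid + 16]);
            vmin[tid] = fminf(vmin[tid], vmin[tid + 8]);
            vmin[tid] = fminf(vmin[tid], vmin[tid + 4]);
            vmin[tid] = fminf(vmin[tid], vmin[tid + 2]);
            vmin[tid] = fminf(vmin[tid], vmin[tid + 1]);
        }

        if (tid == 0) result[0] = smin[0];
    }

Launched as a single-block tail step, e.g. reduce_min_kernel<<<1, BLOCK_SIZE>>>(d_partials, num_partials, d_result), it replaces a serial loop over the partials with a handful of synchronized halving steps.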
@@ -463,6 +463,25 @@ namespace cv { namespace gpu { namespace mathfunc
     }
 
+    template <int size, typename T>
+    __device__ void find_min_max_in_smem(volatile T* minval, volatile T* maxval, const unsigned int tid)
+    {
+        if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval); } __syncthreads(); }
+        if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval); } __syncthreads(); }
+        if (size >= 128) { if (tid < 64) { merge(tid, 64, minval, maxval); } __syncthreads(); }
+        if (tid < 32)
+        {
+            if (size >= 64) merge(tid, 32, minval, maxval);
+            if (size >= 32) merge(tid, 16, minval, maxval);
+            if (size >= 16) merge(tid, 8, minval, maxval);
+            if (size >= 8) merge(tid, 4, minval, maxval);
+            if (size >= 4) merge(tid, 2, minval, maxval);
+            if (size >= 2) merge(tid, 1, minval, maxval);
+        }
+    }
+
     template <int nthreads, typename T>
     __global__ void min_max_kernel(const DevMem2D src, T* minval, T* maxval)
     {
@@ -490,22 +509,9 @@ namespace cv { namespace gpu { namespace mathfunc
         sminval[tid] = mymin;
         smaxval[tid] = mymax;
         __syncthreads();
 
-        if (nthreads >= 512) if (tid < 256) { merge(tid, 256, sminval, smaxval); __syncthreads(); }
-        if (nthreads >= 256) if (tid < 128) { merge(tid, 128, sminval, smaxval); __syncthreads(); }
-        if (nthreads >= 128) if (tid < 64) { merge(tid, 64, sminval, smaxval); __syncthreads(); }
-        if (tid < 32)
-        {
-            if (nthreads >= 64) merge(tid, 32, sminval, smaxval);
-            if (nthreads >= 32) merge(tid, 16, sminval, smaxval);
-            if (nthreads >= 16) merge(tid, 8, sminval, smaxval);
-            if (nthreads >= 8) merge(tid, 4, sminval, smaxval);
-            if (nthreads >= 4) merge(tid, 2, sminval, smaxval);
-            if (nthreads >= 2) merge(tid, 1, sminval, smaxval);
-        }
+        find_min_max_in_smem<nthreads, best_type>(sminval, smaxval, tid);
 
         if (tid == 0)
         {
@@ -514,25 +520,42 @@ namespace cv { namespace gpu { namespace mathfunc
         }
 
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
-        // Process partial results in the first thread of the last block
-        if ((gridDim.x > 1 || gridDim.y > 1) && tid == 0)
-        {
-            __threadfence();
-            if (atomicInc(&blocks_finished, gridDim.x * gridDim.y) == gridDim.x * gridDim.y - 1)
-            {
-                mymin = minval[0];
-                mymax = maxval[0];
-                for (unsigned int i = 1; i < gridDim.x * gridDim.y; ++i)
-                {
-                    mymin = min(mymin, minval[i]);
-                    mymax = max(mymax, maxval[i]);
-                }
-                minval[0] = mymin;
-                maxval[0] = mymax;
-            }
-        }
+        __shared__ bool is_last;
+
+        if (tid == 0)
+        {
+            minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
+            maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
+            __threadfence();
+
+            unsigned int ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
+            is_last = ticket == gridDim.x * gridDim.y - 1;
+        }
+
+        __syncthreads();
+
+        if (is_last)
+        {
+            unsigned int idx = min(tid, gridDim.x * gridDim.y - 1);
+
+            sminval[tid] = minval[idx];
+            smaxval[tid] = maxval[idx];
+            __syncthreads();
+
+            find_min_max_in_smem<nthreads, best_type>(sminval, smaxval, tid);
+
+            if (tid == 0)
+            {
+                minval[0] = (T)sminval[0];
+                maxval[0] = (T)smaxval[0];
+            }
+        }
+#else
+        if (tid == 0)
+        {
+            minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
+            maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
+        }
 #endif
     }
@@ -568,19 +591,27 @@ namespace cv { namespace gpu { namespace mathfunc
     // This kernel will be used only when compute capability is 1.0
-    template <typename T>
+    template <int nthreads, typename T>
     __global__ void min_max_kernel_2ndstep(T* minval, T* maxval, int size)
     {
-        T val;
-        T mymin = minval[0];
-        T mymax = maxval[0];
-        for (unsigned int i = 1; i < size; ++i)
-        {
-            val = minval[i]; if (val < mymin) mymin = val;
-            val = maxval[i]; if (val > mymax) mymax = val;
-        }
-        minval[0] = mymin;
-        maxval[0] = mymax;
+        typedef typename MinMaxTypeTraits<T>::best_type best_type;
+        __shared__ best_type sminval[nthreads];
+        __shared__ best_type smaxval[nthreads];
+
+        unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
+        unsigned int idx = min(tid, gridDim.x * gridDim.y - 1);
+
+        sminval[tid] = minval[idx];
+        smaxval[tid] = maxval[idx];
+        __syncthreads();
+
+        find_min_max_in_smem<nthreads, best_type>(sminval, smaxval, tid);
+
+        if (tid == 0)
+        {
+            minval[0] = (T)sminval[0];
+            maxval[0] = (T)smaxval[0];
+        }
     }
@@ -596,7 +627,7 @@ namespace cv { namespace gpu { namespace mathfunc
         cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));
 
         min_max_kernel<256, T><<<grid, threads>>>(src, minval_buf, maxval_buf);
-        min_max_kernel_2ndstep<T><<<1, 1>>>(minval_buf, maxval_buf, grid.x * grid.y);
+        min_max_kernel_2ndstep<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
         cudaSafeCall(cudaThreadSynchronize());
 
         T minval_, maxval_;
@@ -680,6 +711,26 @@ namespace cv { namespace gpu { namespace mathfunc
     }
 
+    template <int size, typename T>
+    __device__ void find_min_max_loc_in_smem(volatile T* minval, volatile T* maxval, volatile unsigned int* minloc,
+                                             volatile unsigned int* maxloc, const unsigned int tid)
+    {
+        if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval, minloc, maxloc); } __syncthreads(); }
+        if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval, minloc, maxloc); } __syncthreads(); }
+        if (size >= 128) { if (tid < 64) { merge(tid, 64, minval, maxval, minloc, maxloc); } __syncthreads(); }
+        if (tid < 32)
+        {
+            if (size >= 64) merge(tid, 32, minval, maxval, minloc, maxloc);
+            if (size >= 32) merge(tid, 16, minval, maxval, minloc, maxloc);
+            if (size >= 16) merge(tid, 8, minval, maxval, minloc, maxloc);
+            if (size >= 8) merge(tid, 4, minval, maxval, minloc, maxloc);
+            if (size >= 4) merge(tid, 2, minval, maxval, minloc, maxloc);
+            if (size >= 2) merge(tid, 1, minval, maxval, minloc, maxloc);
+        }
+    }
+
     template <int nthreads, typename T>
     __global__ void min_max_loc_kernel(const DevMem2D src, T* minval, T* maxval,
                                        unsigned int* minloc, unsigned int* maxloc)
@@ -720,22 +771,12 @@ namespace cv { namespace gpu { namespace mathfunc
         smaxval[tid] = mymax;
         sminloc[tid] = myminloc;
         smaxloc[tid] = mymaxloc;
         __syncthreads();
 
-        if (nthreads >= 512) if (tid < 256) { merge(tid, 256, sminval, smaxval, sminloc, smaxloc); __syncthreads(); }
-        if (nthreads >= 256) if (tid < 128) { merge(tid, 128, sminval, smaxval, sminloc, smaxloc); __syncthreads(); }
-        if (nthreads >= 128) if (tid < 64) { merge(tid, 64, sminval, smaxval, sminloc, smaxloc); __syncthreads(); }
-        if (tid < 32)
-        {
-            if (nthreads >= 64) merge(tid, 32, sminval, smaxval, sminloc, smaxloc);
-            if (nthreads >= 32) merge(tid, 16, sminval, smaxval, sminloc, smaxloc);
-            if (nthreads >= 16) merge(tid, 8, sminval, smaxval, sminloc, smaxloc);
-            if (nthreads >= 8) merge(tid, 4, sminval, smaxval, sminloc, smaxloc);
-            if (nthreads >= 4) merge(tid, 2, sminval, smaxval, sminloc, smaxloc);
-            if (nthreads >= 2) merge(tid, 1, sminval, smaxval, sminloc, smaxloc);
-        }
+        find_min_max_loc_in_smem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
+        __shared__ bool is_last;
 
         if (tid == 0)
         {
@@ -743,29 +784,41 @@ namespace cv { namespace gpu { namespace mathfunc
             maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
             minloc[blockIdx.y * gridDim.x + blockIdx.x] = sminloc[0];
             maxloc[blockIdx.y * gridDim.x + blockIdx.x] = smaxloc[0];
+            __threadfence();
+
+            unsigned int ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
+            is_last = ticket == gridDim.x * gridDim.y - 1;
         }
 
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
-        // Process partial results in the first thread of the last block
-        if ((gridDim.x > 1 || gridDim.y > 1) && tid == 0)
-        {
-            __threadfence();
-            if (atomicInc(&blocks_finished, gridDim.x * gridDim.y) == gridDim.x * gridDim.y - 1)
-            {
-                mymin = minval[0];
-                mymax = maxval[0];
-                unsigned int imin = 0, imax = 0;
-                for (unsigned int i = 1; i < gridDim.x * gridDim.y; ++i)
-                {
-                    val = minval[i]; if (val < mymin) { mymin = val; imin = i; }
-                    val = maxval[i]; if (val > mymax) { mymax = val; imax = i; }
-                }
-                minval[0] = mymin;
-                maxval[0] = mymax;
-                minloc[0] = minloc[imin];
-                maxloc[0] = maxloc[imax];
-            }
-        }
+        __syncthreads();
+
+        if (is_last)
+        {
+            unsigned int idx = min(tid, gridDim.x * gridDim.y - 1);
+
+            sminval[tid] = minval[idx];
+            smaxval[tid] = maxval[idx];
+            sminloc[tid] = minloc[idx];
+            smaxloc[tid] = maxloc[idx];
+            __syncthreads();
+
+            find_min_max_loc_in_smem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
+
+            if (tid == 0)
+            {
+                minval[0] = (T)sminval[0];
+                maxval[0] = (T)smaxval[0];
+                minloc[0] = sminloc[0];
+                maxloc[0] = smaxloc[0];
+            }
+        }
+#else
+        if (tid == 0)
+        {
+            minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
+            maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
+            minloc[blockIdx.y * gridDim.x + blockIdx.x] = sminloc[0];
+            maxloc[blockIdx.y * gridDim.x + blockIdx.x] = smaxloc[0];
+        }
 #endif
     }
@@ -811,22 +864,33 @@ namespace cv { namespace gpu { namespace mathfunc
     // This kernel will be used only when compute capability is 1.0
-    template <typename T>
+    template <int nthreads, typename T>
     __global__ void min_max_loc_kernel_2ndstep(T* minval, T* maxval, unsigned int* minloc, unsigned int* maxloc, int size)
     {
-        T val;
-        T mymin = minval[0];
-        T mymax = maxval[0];
-        unsigned int imin = 0, imax = 0;
-        for (unsigned int i = 1; i < size; ++i)
-        {
-            val = minval[i]; if (val < mymin) { mymin = val; imin = i; }
-            val = maxval[i]; if (val > mymax) { mymax = val; imax = i; }
-        }
-        minval[0] = mymin;
-        maxval[0] = mymax;
-        minloc[0] = minloc[imin];
-        maxloc[0] = maxloc[imax];
+        typedef typename MinMaxTypeTraits<T>::best_type best_type;
+        __shared__ best_type sminval[nthreads];
+        __shared__ best_type smaxval[nthreads];
+        __shared__ unsigned int sminloc[nthreads];
+        __shared__ unsigned int smaxloc[nthreads];
+
+        unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
+        unsigned int idx = min(tid, gridDim.x * gridDim.y - 1);
+
+        sminval[tid] = minval[idx];
+        smaxval[tid] = maxval[idx];
+        sminloc[tid] = minloc[idx];
+        smaxloc[tid] = maxloc[idx];
+        __syncthreads();
+
+        find_min_max_loc_in_smem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
+
+        if (tid == 0)
+        {
+            minval[0] = (T)sminval[0];
+            maxval[0] = (T)smaxval[0];
+            minloc[0] = sminloc[0];
+            maxloc[0] = smaxloc[0];
+        }
     }
@@ -845,7 +909,7 @@ namespace cv { namespace gpu { namespace mathfunc
         cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));
 
         min_max_loc_kernel<256, T><<<grid, threads>>>(src, minval_buf, maxval_buf, minloc_buf, maxloc_buf);
-        min_max_loc_kernel_2ndstep<T><<<1, 1>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
+        min_max_loc_kernel_2ndstep<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
         cudaSafeCall(cudaThreadSynchronize());
 
         T minval_, maxval_;
@@ -909,13 +973,13 @@ namespace cv { namespace gpu { namespace mathfunc
     template <int size, typename T>
-    __device__ void sum_shared_mem(volatile T* data, const unsigned int tid)
+    __device__ void sum_is_smem(volatile T* data, const unsigned int tid)
     {
         T sum = data[tid];
 
-        if (size >= 512) if (tid < 256) { data[tid] = sum = sum + data[tid + 256]; } __syncthreads();
-        if (size >= 256) if (tid < 128) { data[tid] = sum = sum + data[tid + 128]; } __syncthreads();
-        if (size >= 128) if (tid < 64) { data[tid] = sum = sum + data[tid + 64]; } __syncthreads();
+        if (size >= 512) { if (tid < 256) { data[tid] = sum = sum + data[tid + 256]; } __syncthreads(); }
+        if (size >= 256) { if (tid < 128) { data[tid] = sum = sum + data[tid + 128]; } __syncthreads(); }
+        if (size >= 128) { if (tid < 64) { data[tid] = sum = sum + data[tid + 64]; } __syncthreads(); }
 
         if (tid < 32)
         {
@@ -949,7 +1013,7 @@ namespace cv { namespace gpu { namespace mathfunc
         scount[tid] = cnt;
         __syncthreads();
 
-        sum_shared_mem<nthreads, unsigned int>(scount, tid);
+        sum_is_smem<nthreads, unsigned int>(scount, tid);
 
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
         __shared__ bool is_last;
@@ -968,7 +1032,10 @@ namespace cv { namespace gpu { namespace mathfunc
         if (is_last)
         {
             scount[tid] = tid < gridDim.x * gridDim.y ? count[tid] : 0;
-            sum_shared_mem<nthreads, unsigned int>(scount, tid);
+            __syncthreads();
+
+            sum_is_smem<nthreads, unsigned int>(scount, tid);
+
             if (tid == 0) count[0] = scount[0];
         }
 #else
@@ -1012,7 +1079,7 @@ namespace cv { namespace gpu { namespace mathfunc
         unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
 
         scount[tid] = tid < size ? count[tid] : 0;
-        sum_shared_mem<nthreads, unsigned int>(scount, tid);
+        sum_is_smem<nthreads, unsigned int>(scount, tid);
 
         if (tid == 0) count[0] = scount[0];
     }
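On compute capability 1.1 and newer, the kernels above do the whole reduction in a single launch: each block writes its partial result, calls __threadfence(), and takes a ticket from blocks_finished with atomicInc; the block that draws the last ticket reloads all partials and reduces them. The caller zeroes blocks_finished with cudaMemcpyToSymbol before every launch, and compute capability 1.0 falls back to the separate *_2ndstep kernels because it lacks global atomics. A minimal, self-contained sketch of that ticket pattern follows; blocks_done, block_sum_kernel and the float-sum payload are hypothetical stand-ins, not code from the patch.

    // Sketch of the single-launch "last block finalizes" pattern (assumed names).
    // Launch with 256 threads per block; reset blocks_done to 0 before every launch,
    // as the patch does for blocks_finished. Requires global atomics (sm_11+).
    __device__ unsigned int blocks_done = 0;

    __global__ void block_sum_kernel(const float* data, unsigned int n, float* partials, float* result)
    {
        __shared__ float ssum[256];
        __shared__ bool is_last;

        unsigned int tid = threadIdx.x;
        unsigned int i = blockIdx.x * blockDim.x + tid;

        ssum[tid] = (i < n) ? data[i] : 0.f;
        __syncthreads();

        // Block-wide tree reduction into ssum[0].
        for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
        {
            if (tid < s) ssum[tid] += ssum[tid + s];
            __syncthreads();
        }

        if (tid == 0)
        {
            partials[blockIdx.x] = ssum[0]; // publish this block's partial result
            __threadfence();                // make it visible to the other blocks
            unsigned int ticket = atomicInc(&blocks_done, gridDim.x);
            is_last = (ticket == gridDim.x - 1);
        }
        __syncthreads();

        if (is_last) // exactly one block, the last to finish, reduces the partials
        {
            float acc = 0.f;
            for (unsigned int b = tid; b < gridDim.x; b += blockDim.x)
                acc += partials[b];
            ssum[tid] = acc;
            __syncthreads();

            for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1)
            {
                if (tid < s) ssum[tid] += ssum[tid + s];
                __syncthreads();
            }
            if (tid == 0) result[0] = ssum[0];
        }
    }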