Commit 8c48f3be authored by Alexey Spizhevoy's avatar Alexey Spizhevoy

moved GPU's global counter reset from caller to the kernel's end

parent 3da253a2
...@@ -419,10 +419,8 @@ namespace cv { namespace gpu { namespace mathfunc ...@@ -419,10 +419,8 @@ namespace cv { namespace gpu { namespace mathfunc
__constant__ int ctwidth; __constant__ int ctwidth;
__constant__ int ctheight; __constant__ int ctheight;
static const unsigned int czero = 0;
// Global counter of blocks finished its work // Global counter of blocks finished its work
__device__ unsigned int blocks_finished; __device__ unsigned int blocks_finished = 0;
// Estimates good thread configuration // Estimates good thread configuration
...@@ -548,6 +546,7 @@ namespace cv { namespace gpu { namespace mathfunc ...@@ -548,6 +546,7 @@ namespace cv { namespace gpu { namespace mathfunc
{ {
minval[0] = (T)sminval[0]; minval[0] = (T)sminval[0];
maxval[0] = (T)smaxval[0]; maxval[0] = (T)smaxval[0];
blocks_finished = 0;
} }
} }
#else #else
...@@ -570,7 +569,6 @@ namespace cv { namespace gpu { namespace mathfunc ...@@ -570,7 +569,6 @@ namespace cv { namespace gpu { namespace mathfunc
T* minval_buf = (T*)buf.ptr(0); T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1); T* maxval_buf = (T*)buf.ptr(1);
cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));
min_max_kernel<256, T><<<grid, threads>>>(src, minval_buf, maxval_buf); min_max_kernel<256, T><<<grid, threads>>>(src, minval_buf, maxval_buf);
cudaSafeCall(cudaThreadSynchronize()); cudaSafeCall(cudaThreadSynchronize());
...@@ -611,6 +609,7 @@ namespace cv { namespace gpu { namespace mathfunc ...@@ -611,6 +609,7 @@ namespace cv { namespace gpu { namespace mathfunc
{ {
minval[0] = (T)sminval[0]; minval[0] = (T)sminval[0];
maxval[0] = (T)smaxval[0]; maxval[0] = (T)smaxval[0];
blocks_finished = 0;
} }
} }
...@@ -625,7 +624,6 @@ namespace cv { namespace gpu { namespace mathfunc ...@@ -625,7 +624,6 @@ namespace cv { namespace gpu { namespace mathfunc
T* minval_buf = (T*)buf.ptr(0); T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1); T* maxval_buf = (T*)buf.ptr(1);
cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));
min_max_kernel<256, T><<<grid, threads>>>(src, minval_buf, maxval_buf); min_max_kernel<256, T><<<grid, threads>>>(src, minval_buf, maxval_buf);
min_max_kernel_2ndstep<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y); min_max_kernel_2ndstep<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize()); cudaSafeCall(cudaThreadSynchronize());
...@@ -654,10 +652,8 @@ namespace cv { namespace gpu { namespace mathfunc ...@@ -654,10 +652,8 @@ namespace cv { namespace gpu { namespace mathfunc
__constant__ int ctwidth; __constant__ int ctwidth;
__constant__ int ctheight; __constant__ int ctheight;
static const unsigned int czero = 0;
// Global counter of blocks finished its work // Global counter of blocks finished its work
__device__ unsigned int blocks_finished; __device__ unsigned int blocks_finished = 0;
// Estimates good thread configuration // Estimates good thread configuration
...@@ -810,6 +806,7 @@ namespace cv { namespace gpu { namespace mathfunc ...@@ -810,6 +806,7 @@ namespace cv { namespace gpu { namespace mathfunc
maxval[0] = (T)smaxval[0]; maxval[0] = (T)smaxval[0];
minloc[0] = sminloc[0]; minloc[0] = sminloc[0];
maxloc[0] = smaxloc[0]; maxloc[0] = smaxloc[0];
blocks_finished = 0;
} }
} }
#else #else
...@@ -837,7 +834,6 @@ namespace cv { namespace gpu { namespace mathfunc ...@@ -837,7 +834,6 @@ namespace cv { namespace gpu { namespace mathfunc
unsigned int* minloc_buf = (unsigned int*)locbuf.ptr(0); unsigned int* minloc_buf = (unsigned int*)locbuf.ptr(0);
unsigned int* maxloc_buf = (unsigned int*)locbuf.ptr(1); unsigned int* maxloc_buf = (unsigned int*)locbuf.ptr(1);
cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));
min_max_loc_kernel<256, T><<<grid, threads>>>(src, minval_buf, maxval_buf, minloc_buf, maxloc_buf); min_max_loc_kernel<256, T><<<grid, threads>>>(src, minval_buf, maxval_buf, minloc_buf, maxloc_buf);
cudaSafeCall(cudaThreadSynchronize()); cudaSafeCall(cudaThreadSynchronize());
...@@ -890,6 +886,7 @@ namespace cv { namespace gpu { namespace mathfunc ...@@ -890,6 +886,7 @@ namespace cv { namespace gpu { namespace mathfunc
maxval[0] = (T)smaxval[0]; maxval[0] = (T)smaxval[0];
minloc[0] = sminloc[0]; minloc[0] = sminloc[0];
maxloc[0] = smaxloc[0]; maxloc[0] = smaxloc[0];
blocks_finished = 0;
} }
} }
...@@ -907,7 +904,6 @@ namespace cv { namespace gpu { namespace mathfunc ...@@ -907,7 +904,6 @@ namespace cv { namespace gpu { namespace mathfunc
unsigned int* minloc_buf = (unsigned int*)locbuf.ptr(0); unsigned int* minloc_buf = (unsigned int*)locbuf.ptr(0);
unsigned int* maxloc_buf = (unsigned int*)locbuf.ptr(1); unsigned int* maxloc_buf = (unsigned int*)locbuf.ptr(1);
cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));
min_max_loc_kernel<256, T><<<grid, threads>>>(src, minval_buf, maxval_buf, minloc_buf, maxloc_buf); min_max_loc_kernel<256, T><<<grid, threads>>>(src, minval_buf, maxval_buf, minloc_buf, maxloc_buf);
min_max_loc_kernel_2ndstep<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y); min_max_loc_kernel_2ndstep<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize()); cudaSafeCall(cudaThreadSynchronize());
...@@ -943,9 +939,7 @@ namespace cv { namespace gpu { namespace mathfunc ...@@ -943,9 +939,7 @@ namespace cv { namespace gpu { namespace mathfunc
__constant__ int ctwidth; __constant__ int ctwidth;
__constant__ int ctheight; __constant__ int ctheight;
static const unsigned int czero = 0; __device__ unsigned int blocks_finished = 0;
__device__ unsigned int blocks_finished;
void estimate_thread_cfg(dim3& threads, dim3& grid) void estimate_thread_cfg(dim3& threads, dim3& grid)
{ {
...@@ -1036,7 +1030,11 @@ namespace cv { namespace gpu { namespace mathfunc ...@@ -1036,7 +1030,11 @@ namespace cv { namespace gpu { namespace mathfunc
sum_is_smem<nthreads, unsigned int>(scount, tid); sum_is_smem<nthreads, unsigned int>(scount, tid);
if (tid == 0) count[0] = scount[0]; if (tid == 0)
{
count[0] = scount[0];
blocks_finished = 0;
}
} }
#else #else
if (tid == 0) count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0]; if (tid == 0) count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0];
...@@ -1053,7 +1051,6 @@ namespace cv { namespace gpu { namespace mathfunc ...@@ -1053,7 +1051,6 @@ namespace cv { namespace gpu { namespace mathfunc
unsigned int* count_buf = (unsigned int*)buf.ptr(0); unsigned int* count_buf = (unsigned int*)buf.ptr(0);
cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));
count_non_zero_kernel<256, T><<<grid, threads>>>(src, count_buf); count_non_zero_kernel<256, T><<<grid, threads>>>(src, count_buf);
cudaSafeCall(cudaThreadSynchronize()); cudaSafeCall(cudaThreadSynchronize());
...@@ -1081,7 +1078,11 @@ namespace cv { namespace gpu { namespace mathfunc ...@@ -1081,7 +1078,11 @@ namespace cv { namespace gpu { namespace mathfunc
scount[tid] = tid < size ? count[tid] : 0; scount[tid] = tid < size ? count[tid] : 0;
sum_is_smem<nthreads, unsigned int>(scount, tid); sum_is_smem<nthreads, unsigned int>(scount, tid);
if (tid == 0) count[0] = scount[0]; if (tid == 0)
{
count[0] = scount[0];
blocks_finished = 0;
}
} }
...@@ -1094,7 +1095,6 @@ namespace cv { namespace gpu { namespace mathfunc ...@@ -1094,7 +1095,6 @@ namespace cv { namespace gpu { namespace mathfunc
unsigned int* count_buf = (unsigned int*)buf.ptr(0); unsigned int* count_buf = (unsigned int*)buf.ptr(0);
cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));
count_non_zero_kernel<256, T><<<grid, threads>>>(src, count_buf); count_non_zero_kernel<256, T><<<grid, threads>>>(src, count_buf);
count_non_zero_kernel_2ndstep<256, T><<<1, 256>>>(count_buf, grid.x * grid.y); count_non_zero_kernel_2ndstep<256, T><<<1, 256>>>(count_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize()); cudaSafeCall(cudaThreadSynchronize());
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment