Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
9f80317f
Commit
9f80317f
authored
Dec 10, 2010
by
Alexey Spizhevoy
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fixed minor bug in gpu module, added first version of sum
parent
d557c800
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
179 additions
and
20 deletions
+179
-20
imgproc.cu
modules/gpu/src/cuda/imgproc.cu
+2
-2
mathfunc.cu
modules/gpu/src/cuda/mathfunc.cu
+177
-7
match_template.cpp
modules/gpu/src/match_template.cpp
+0
-11
No files found.
modules/gpu/src/cuda/imgproc.cu
View file @
9f80317f
...
@@ -719,7 +719,7 @@ namespace cv { namespace gpu { namespace imgproc
...
@@ -719,7 +719,7 @@ namespace cv { namespace gpu { namespace imgproc
////////////////////////////// Column Sum //////////////////////////////////////
////////////////////////////// Column Sum //////////////////////////////////////
__global__ void column
SumK
ernel_32F(int cols, int rows, const PtrStep src, const PtrStep dst)
__global__ void column
_sum_k
ernel_32F(int cols, int rows, const PtrStep src, const PtrStep dst)
{
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
int x = blockIdx.x * blockDim.x + threadIdx.x;
...
@@ -745,7 +745,7 @@ namespace cv { namespace gpu { namespace imgproc
...
@@ -745,7 +745,7 @@ namespace cv { namespace gpu { namespace imgproc
dim3 threads(256);
dim3 threads(256);
dim3 grid(divUp(src.cols, threads.x));
dim3 grid(divUp(src.cols, threads.x));
column
SumK
ernel_32F<<<grid, threads>>>(src.cols, src.rows, src, dst);
column
_sum_k
ernel_32F<<<grid, threads>>>(src.cols, src.rows, src, dst);
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall(cudaThreadSynchronize());
}
}
...
...
modules/gpu/src/cuda/mathfunc.cu
View file @
9f80317f
...
@@ -450,6 +450,8 @@ namespace cv { namespace gpu { namespace mathfunc
...
@@ -450,6 +450,8 @@ namespace cv { namespace gpu { namespace mathfunc
{
{
threads = dim3(32, 8);
threads = dim3(32, 8);
grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
grid.x = min(grid.x, threads.x);
grid.y = min(grid.y, threads.y);
}
}
...
@@ -662,7 +664,6 @@ namespace cv { namespace gpu { namespace mathfunc
...
@@ -662,7 +664,6 @@ namespace cv { namespace gpu { namespace mathfunc
{
{
minval[0] = (T)sminval[0];
minval[0] = (T)sminval[0];
maxval[0] = (T)smaxval[0];
maxval[0] = (T)smaxval[0];
blocks_finished = 0;
}
}
}
}
...
@@ -744,6 +745,8 @@ namespace cv { namespace gpu { namespace mathfunc
...
@@ -744,6 +745,8 @@ namespace cv { namespace gpu { namespace mathfunc
{
{
threads = dim3(32, 8);
threads = dim3(32, 8);
grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
grid.x = min(grid.x, threads.x);
grid.y = min(grid.y, threads.y);
}
}
...
@@ -1005,7 +1008,6 @@ namespace cv { namespace gpu { namespace mathfunc
...
@@ -1005,7 +1008,6 @@ namespace cv { namespace gpu { namespace mathfunc
maxval[0] = (T)smaxval[0];
maxval[0] = (T)smaxval[0];
minloc[0] = sminloc[0];
minloc[0] = sminloc[0];
maxloc[0] = smaxloc[0];
maxloc[0] = smaxloc[0];
blocks_finished = 0;
}
}
}
}
...
@@ -1102,6 +1104,8 @@ namespace cv { namespace gpu { namespace mathfunc
...
@@ -1102,6 +1104,8 @@ namespace cv { namespace gpu { namespace mathfunc
{
{
threads = dim3(32, 8);
threads = dim3(32, 8);
grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
grid.x = min(grid.x, threads.x);
grid.y = min(grid.y, threads.y);
}
}
...
@@ -1212,13 +1216,12 @@ namespace cv { namespace gpu { namespace mathfunc
...
@@ -1212,13 +1216,12 @@ namespace cv { namespace gpu { namespace mathfunc
unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
scount[tid] = tid < size ? count[tid] : 0;
scount[tid] = tid < size ? count[tid] : 0;
sum_in_smem<nthreads, unsigned int>(scount, tid
);
__syncthreads(
);
if (tid == 0)
sum_in_smem<nthreads, unsigned int>(scount, tid);
{
if (tid == 0)
count[0] = scount[0];
count[0] = scount[0];
blocks_finished = 0;
}
}
}
...
@@ -1409,4 +1412,171 @@ namespace cv { namespace gpu { namespace mathfunc
...
@@ -1409,4 +1412,171 @@ namespace cv { namespace gpu { namespace mathfunc
template void max_gpu<int >(const DevMem2D_<int>& src1, double src2, const DevMem2D_<int>& dst, cudaStream_t stream);
template void max_gpu<int >(const DevMem2D_<int>& src1, double src2, const DevMem2D_<int>& dst, cudaStream_t stream);
template void max_gpu<float >(const DevMem2D_<float>& src1, double src2, const DevMem2D_<float>& dst, cudaStream_t stream);
template void max_gpu<float >(const DevMem2D_<float>& src1, double src2, const DevMem2D_<float>& dst, cudaStream_t stream);
template void max_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);
template void max_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);
//////////////////////////////////////////////////////////////////////////////
// Sum
namespace sum
{
__constant__ int ctwidth;
__constant__ int ctheight;
__device__ unsigned int blocks_finished = 0;
const int threads_x = 32;
const int threads_y = 8;
void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
{
threads = dim3(threads_x, threads_y);
grid = dim3(divUp(cols, threads.x * threads.y),
divUp(rows, threads.y * threads.x));
grid.x = min(grid.x, threads.x);
grid.y = min(grid.y, threads.y);
}
template <typename T>
void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows)
{
dim3 threads, grid;
estimate_thread_cfg(cols, rows, threads, grid);
bufcols = grid.x * grid.y * sizeof(T);
bufrows = 1;
}
void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth)));
cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight)));
}
template <typename T, int nthreads>
__global__ void sum_kernel(const DevMem2D_<T> src, T* result)
{
__shared__ T smem[nthreads];
const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
T sum = 0;
for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
{
const T* ptr = src.ptr(y0 + y * blockDim.y);
for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
sum += ptr[x0 + x * blockDim.x];
}
smem[tid] = sum;
__syncthreads();
sum_in_smem<nthreads, T>(smem, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
result[bid] = smem[0];
__threadfence();
unsigned int ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
smem[tid] = tid < gridDim.x * gridDim.y ? result[tid] : 0;
__syncthreads();
sum_in_smem<nthreads, T>(smem, tid);
if (tid == 0)
{
result[0] = smem[0];
blocks_finished = 0;
}
}
#else
if (tid == 0) result[bid] = smem[0];
#endif
}
template <typename T>
T sum_caller(const DevMem2D_<T> src, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* buf_ = (T*)buf.ptr(0);
sum_kernel<T, threads_x * threads_y><<<grid, threads>>>(src, buf_);
cudaSafeCall(cudaThreadSynchronize());
T sum;
cudaSafeCall(cudaMemcpy(&sum, buf_, sizeof(T), cudaMemcpyDeviceToHost));
return sum;
}
template unsigned char sum_caller<unsigned char>(const DevMem2D_<unsigned char>, PtrStep);
template char sum_caller<char>(const DevMem2D_<char>, PtrStep);
template unsigned short sum_caller<unsigned short>(const DevMem2D_<unsigned short>, PtrStep);
template short sum_caller<short>(const DevMem2D_<short>, PtrStep);
template int sum_caller<int>(const DevMem2D_<int>, PtrStep);
template float sum_caller<float>(const DevMem2D_<float>, PtrStep);
template double sum_caller<double>(const DevMem2D_<double>, PtrStep);
template <typename T, int nthreads>
__global__ void sum_pass2_kernel(T* result, int size)
{
__shared__ T smem[nthreads];
int tid = threadIdx.y * blockDim.x + threadIdx.x;
smem[tid] = tid < size ? result[tid] : 0;
sum_in_smem<nthreads, T>(smem, tid);
if (tid == 0)
result[0] = smem[0];
}
template <typename T>
T sum_multipass_caller(const DevMem2D_<T> src, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* buf_ = (T*)buf.ptr(0);
sum_kernel<T, threads_x * threads_y><<<grid, threads>>>(src, buf_);
sum_pass2_kernel<T, threads_x * threads_y><<<1, threads_x * threads_y>>>(
buf_, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
T sum;
cudaSafeCall(cudaMemcpy(&sum, buf_, sizeof(T), cudaMemcpyDeviceToHost));
return sum;
}
template unsigned char sum_multipass_caller<unsigned char>(const DevMem2D_<unsigned char>, PtrStep);
template char sum_multipass_caller<char>(const DevMem2D_<char>, PtrStep);
template unsigned short sum_multipass_caller<unsigned short>(const DevMem2D_<unsigned short>, PtrStep);
template short sum_multipass_caller<short>(const DevMem2D_<short>, PtrStep);
template int sum_multipass_caller<int>(const DevMem2D_<int>, PtrStep);
template float sum_multipass_caller<float>(const DevMem2D_<float>, PtrStep);
} // namespace sum
}}}
}}}
modules/gpu/src/match_template.cpp
View file @
9f80317f
...
@@ -244,17 +244,6 @@ namespace
...
@@ -244,17 +244,6 @@ namespace
{
{
result
.
create
(
image
.
rows
-
templ
.
rows
+
1
,
image
.
cols
-
templ
.
cols
+
1
,
CV_32F
);
result
.
create
(
image
.
rows
-
templ
.
rows
+
1
,
image
.
cols
-
templ
.
cols
+
1
,
CV_32F
);
imgproc
::
matchTemplateNaive_8U_SQDIFF
(
image
,
templ
,
result
);
imgproc
::
matchTemplateNaive_8U_SQDIFF
(
image
,
templ
,
result
);
//GpuMat image_sum;
//GpuMat image_sumsq;
//integral(image, image_sum, image_sumsq);
//float templ_sumsq = 0.f;
//matchTemplate_8U_CCORR(image, templ, result);
//imgproc::matchTemplatePrepared_8U_SQDIFF(
// templ.cols, templ.rows, image_sumsq, templ_sumsq, result);
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment