Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
d557c800
Commit
d557c800
authored
Dec 10, 2010
by
Alexey Spizhevoy
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
refactored gpu module
parent
97484089
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
141 additions
and
75 deletions
+141
-75
gpu.hpp
modules/gpu/include/opencv2/gpu/gpu.hpp
+1
-1
match_template.cu
modules/gpu/src/cuda/match_template.cu
+46
-10
mathfunc.cu
modules/gpu/src/cuda/mathfunc.cu
+23
-24
imgproc_gpu.cpp
modules/gpu/src/imgproc_gpu.cpp
+3
-3
match_template.cpp
modules/gpu/src/match_template.cpp
+42
-28
match_template.cpp
tests/gpu/src/match_template.cpp
+26
-9
No files found.
modules/gpu/include/opencv2/gpu/gpu.hpp
View file @
d557c800
...
...
@@ -636,7 +636,7 @@ namespace cv
//! computes the integral image and integral for the squared image
//! sum will have CV_32S type, sqsum - CV32F type
//! supports only CV_8UC1 source type
CV_EXPORTS
void
integral
(
GpuMat
&
src
,
GpuMat
&
sum
,
GpuMat
&
sqsum
);
CV_EXPORTS
void
integral
(
const
GpuMat
&
src
,
GpuMat
&
sum
,
GpuMat
&
sqsum
);
//! computes vertical sum, supports only CV_32FC1 images
CV_EXPORTS
void
columnSum
(
const
GpuMat
&
src
,
GpuMat
&
sum
);
...
...
modules/gpu/src/cuda/match_template.cu
View file @
d557c800
...
...
@@ -55,7 +55,8 @@ texture<unsigned char, 2> imageTex_8U;
texture<unsigned char, 2> templTex_8U;
__global__ void matchTemplateNaiveKernel_8U_SQDIFF(int w, int h, DevMem2Df result)
__global__ void matchTemplateNaiveKernel_8U_SQDIFF(int w, int h,
DevMem2Df result)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
...
...
@@ -80,11 +81,12 @@ __global__ void matchTemplateNaiveKernel_8U_SQDIFF(int w, int h, DevMem2Df resul
}
void matchTemplateNaive_8U_SQDIFF(const DevMem2D image, const DevMem2D templ, DevMem2Df result)
void matchTemplateNaive_8U_SQDIFF(const DevMem2D image, const DevMem2D templ,
DevMem2Df result)
{
dim3 threads(32, 8);
dim3 grid(divUp(image.cols - templ.cols + 1, threads.x),
divUp(image.rows - templ.rows + 1, threads.y));
divUp(image.rows - templ.rows + 1, threads.y));
cudaChannelFormatDesc desc = cudaCreateChannelDesc<unsigned char>();
cudaBindTexture2D(0, imageTex_8U, image.data, desc, image.cols, image.rows, image.step);
...
...
@@ -103,7 +105,8 @@ texture<float, 2> imageTex_32F;
texture<float, 2> templTex_32F;
__global__ void matchTemplateNaiveKernel_32F_SQDIFF(int w, int h, DevMem2Df result)
__global__ void matchTemplateNaiveKernel_32F_SQDIFF(int w, int h,
DevMem2Df result)
{
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
...
...
@@ -128,11 +131,12 @@ __global__ void matchTemplateNaiveKernel_32F_SQDIFF(int w, int h, DevMem2Df resu
}
void matchTemplateNaive_32F_SQDIFF(const DevMem2D image, const DevMem2D templ, DevMem2Df result)
void matchTemplateNaive_32F_SQDIFF(const DevMem2D image, const DevMem2D templ,
DevMem2Df result)
{
dim3 threads(32, 8);
dim3 grid(divUp(image.cols - templ.cols + 1, threads.x),
divUp(image.rows - templ.rows + 1, threads.y));
divUp(image.rows - templ.rows + 1, threads.y));
cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
cudaBindTexture2D(0, imageTex_32F, image.data, desc, image.cols, image.rows, image.step);
...
...
@@ -147,8 +151,9 @@ void matchTemplateNaive_32F_SQDIFF(const DevMem2D image, const DevMem2D templ, D
}
__global__ void multiplyAndNormalizeSpectsKernel(int n, float scale, const cufftComplex* a,
const cufftComplex* b, cufftComplex* c)
__global__ void multiplyAndNormalizeSpectsKernel(
int n, float scale, const cufftComplex* a,
const cufftComplex* b, cufftComplex* c)
{
int x = blockIdx.x * blockDim.x + threadIdx.x;
if (x < n)
...
...
@@ -159,8 +164,8 @@ __global__ void multiplyAndNormalizeSpectsKernel(int n, float scale, const cufft
}
void multiplyAndNormalizeSpects(int n, float scale, const cufftComplex* a,
const cufftComplex* b,
cufftComplex* c)
void multiplyAndNormalizeSpects(int n, float scale, const cufftComplex* a,
c
onst cufftComplex* b, c
ufftComplex* c)
{
dim3 threads(256);
dim3 grid(divUp(n, threads.x));
...
...
@@ -169,4 +174,35 @@ void multiplyAndNormalizeSpects(int n, float scale, const cufftComplex* a, const
}
__global__ void matchTemplatePreparedKernel_8U_SQDIFF(
int w, int h, const PtrStepf image_sumsq, float templ_sumsq,
DevMem2Df result)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x < result.cols && y < result.rows)
{
float image_sq = image_sumsq.ptr(y + h)[x + w]
- image_sumsq.ptr(y)[x + w]
- image_sumsq.ptr(y + h)[x]
+ image_sumsq.ptr(y)[x];
float ccorr = result.ptr(y)[x];
result.ptr(y)[x] = image_sq - 2.f * ccorr + templ_sumsq;
}
}
void matchTemplatePrepared_8U_SQDIFF(
int w, int h, const DevMem2Df image_sumsq, float templ_sumsq,
DevMem2Df result)
{
dim3 threads(32, 8);
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_8U_SQDIFF<<<grid, threads>>>(
w, h, image_sumsq, templ_sumsq, result);
cudaSafeCall(cudaThreadSynchronize());
}
}}}
modules/gpu/src/cuda/mathfunc.cu
View file @
d557c800
...
...
@@ -57,6 +57,26 @@ using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace mathfunc
{
template <int size, typename T>
__device__ void sum_in_smem(volatile T* data, const unsigned int tid)
{
T sum = data[tid];
if (size >= 512) { if (tid < 256) { data[tid] = sum = sum + data[tid + 256]; } __syncthreads(); }
if (size >= 256) { if (tid < 128) { data[tid] = sum = sum + data[tid + 128]; } __syncthreads(); }
if (size >= 128) { if (tid < 64) { data[tid] = sum = sum + data[tid + 64]; } __syncthreads(); }
if (tid < 32)
{
if (size >= 64) data[tid] = sum = sum + data[tid + 32];
if (size >= 32) data[tid] = sum = sum + data[tid + 16];
if (size >= 16) data[tid] = sum = sum + data[tid + 8];
if (size >= 8) data[tid] = sum = sum + data[tid + 4];
if (size >= 4) data[tid] = sum = sum + data[tid + 2];
if (size >= 2) data[tid] = sum = sum + data[tid + 1];
}
}
struct Nothing
{
static __device__ void calc(int, int, float, float, float*, size_t, float)
...
...
@@ -1103,27 +1123,6 @@ namespace cv { namespace gpu { namespace mathfunc
}
template <int size, typename T>
__device__ void sum_is_smem(volatile T* data, const unsigned int tid)
{
T sum = data[tid];
if (size >= 512) { if (tid < 256) { data[tid] = sum = sum + data[tid + 256]; } __syncthreads(); }
if (size >= 256) { if (tid < 128) { data[tid] = sum = sum + data[tid + 128]; } __syncthreads(); }
if (size >= 128) { if (tid < 64) { data[tid] = sum = sum + data[tid + 64]; } __syncthreads(); }
if (tid < 32)
{
if (size >= 64) data[tid] = sum = sum + data[tid + 32];
if (size >= 32) data[tid] = sum = sum + data[tid + 16];
if (size >= 16) data[tid] = sum = sum + data[tid + 8];
if (size >= 8) data[tid] = sum = sum + data[tid + 4];
if (size >= 4) data[tid] = sum = sum + data[tid + 2];
if (size >= 2) data[tid] = sum = sum + data[tid + 1];
}
}
template <int nthreads, typename T>
__global__ void count_non_zero_kernel(const DevMem2D src, volatile unsigned int* count)
{
...
...
@@ -1144,7 +1143,7 @@ namespace cv { namespace gpu { namespace mathfunc
scount[tid] = cnt;
__syncthreads();
sum_i
s
_smem<nthreads, unsigned int>(scount, tid);
sum_i
n
_smem<nthreads, unsigned int>(scount, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
...
...
@@ -1165,7 +1164,7 @@ namespace cv { namespace gpu { namespace mathfunc
scount[tid] = tid < gridDim.x * gridDim.y ? count[tid] : 0;
__syncthreads();
sum_i
s
_smem<nthreads, unsigned int>(scount, tid);
sum_i
n
_smem<nthreads, unsigned int>(scount, tid);
if (tid == 0)
{
...
...
@@ -1213,7 +1212,7 @@ namespace cv { namespace gpu { namespace mathfunc
unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
scount[tid] = tid < size ? count[tid] : 0;
sum_i
s
_smem<nthreads, unsigned int>(scount, tid);
sum_i
n
_smem<nthreads, unsigned int>(scount, tid);
if (tid == 0)
{
...
...
modules/gpu/src/imgproc_gpu.cpp
View file @
d557c800
...
...
@@ -60,7 +60,7 @@ void cv::gpu::copyMakeBorder(const GpuMat&, GpuMat&, int, int, int, int, const S
void
cv
::
gpu
::
warpAffine
(
const
GpuMat
&
,
GpuMat
&
,
const
Mat
&
,
Size
,
int
)
{
throw_nogpu
();
}
void
cv
::
gpu
::
warpPerspective
(
const
GpuMat
&
,
GpuMat
&
,
const
Mat
&
,
Size
,
int
)
{
throw_nogpu
();
}
void
cv
::
gpu
::
rotate
(
const
GpuMat
&
,
GpuMat
&
,
Size
,
double
,
double
,
double
,
int
)
{
throw_nogpu
();
}
void
cv
::
gpu
::
integral
(
GpuMat
&
,
GpuMat
&
,
GpuMat
&
)
{
throw_nogpu
();
}
void
cv
::
gpu
::
integral
(
const
GpuMat
&
,
GpuMat
&
,
GpuMat
&
)
{
throw_nogpu
();
}
void
cv
::
gpu
::
columnSum
(
const
GpuMat
&
,
GpuMat
&
)
{
throw_nogpu
();
}
void
cv
::
gpu
::
rectStdDev
(
const
GpuMat
&
,
const
GpuMat
&
,
GpuMat
&
,
const
Rect
&
)
{
throw_nogpu
();
}
void
cv
::
gpu
::
Canny
(
const
GpuMat
&
,
GpuMat
&
,
double
,
double
,
int
)
{
throw_nogpu
();
}
...
...
@@ -539,7 +539,7 @@ void cv::gpu::rotate(const GpuMat& src, GpuMat& dst, Size dsize, double angle, d
////////////////////////////////////////////////////////////////////////
// integral
void
cv
::
gpu
::
integral
(
GpuMat
&
src
,
GpuMat
&
sum
,
GpuMat
&
sqsum
)
void
cv
::
gpu
::
integral
(
const
GpuMat
&
src
,
GpuMat
&
sum
,
GpuMat
&
sqsum
)
{
CV_Assert
(
src
.
type
()
==
CV_8UC1
);
...
...
@@ -552,7 +552,7 @@ void cv::gpu::integral(GpuMat& src, GpuMat& sum, GpuMat& sqsum)
sz
.
width
=
src
.
cols
;
sz
.
height
=
src
.
rows
;
nppSafeCall
(
nppiSqrIntegral_8u32s32f_C1R
(
src
.
ptr
<
Npp8u
>
(
),
src
.
step
,
sum
.
ptr
<
Npp32s
>
(),
nppSafeCall
(
nppiSqrIntegral_8u32s32f_C1R
(
const_cast
<
Npp8u
*>
(
src
.
ptr
<
Npp8u
>
()
),
src
.
step
,
sum
.
ptr
<
Npp32s
>
(),
sum
.
step
,
sqsum
.
ptr
<
Npp32f
>
(),
sqsum
.
step
,
sz
,
0
,
0.0
f
,
h
)
);
}
...
...
modules/gpu/src/match_template.cpp
View file @
d557c800
...
...
@@ -59,18 +59,27 @@ void cv::gpu::matchTemplate(const GpuMat&, const GpuMat&, GpuMat&, int) { throw_
namespace
cv
{
namespace
gpu
{
namespace
imgproc
{
void
multiplyAndNormalizeSpects
(
int
n
,
float
scale
,
const
cufftComplex
*
a
,
void
multiplyAndNormalizeSpects
(
int
n
,
float
scale
,
const
cufftComplex
*
a
,
const
cufftComplex
*
b
,
cufftComplex
*
c
);
void
matchTemplateNaive_8U_SQDIFF
(
const
DevMem2D
image
,
const
DevMem2D
templ
,
DevMem2Df
result
);
void
matchTemplateNaive_32F_SQDIFF
(
const
DevMem2D
image
,
const
DevMem2D
templ
,
DevMem2Df
result
);
void
matchTemplateNaive_8U_SQDIFF
(
const
DevMem2D
image
,
const
DevMem2D
templ
,
DevMem2Df
result
);
void
matchTemplateNaive_32F_SQDIFF
(
const
DevMem2D
image
,
const
DevMem2D
templ
,
DevMem2Df
result
);
void
matchTemplatePrepared_8U_SQDIFF
(
int
w
,
int
h
,
const
DevMem2Df
image_sumsq
,
float
templ_sumsq
,
DevMem2Df
result
);
}}}
namespace
{
template
<
int
type
,
int
method
>
void
matchTemplate
(
const
GpuMat
&
image
,
const
GpuMat
&
templ
,
GpuMat
&
result
);
void
matchTemplate_32F_SQDIFF
(
const
GpuMat
&
,
const
GpuMat
&
,
GpuMat
&
);
void
matchTemplate_32F_CCORR
(
const
GpuMat
&
,
const
GpuMat
&
,
GpuMat
&
);
void
matchTemplate_8U_SQDIFF
(
const
GpuMat
&
,
const
GpuMat
&
,
GpuMat
&
);
void
matchTemplate_8U_CCORR
(
const
GpuMat
&
,
const
GpuMat
&
,
GpuMat
&
);
#ifdef BLOCK_VERSION
...
...
@@ -86,8 +95,7 @@ namespace
}
#endif
template
<>
void
matchTemplate
<
CV_32F
,
CV_TM_SQDIFF
>
(
const
GpuMat
&
image
,
const
GpuMat
&
templ
,
GpuMat
&
result
)
void
matchTemplate_32F_SQDIFF
(
const
GpuMat
&
image
,
const
GpuMat
&
templ
,
GpuMat
&
result
)
{
result
.
create
(
image
.
rows
-
templ
.
rows
+
1
,
image
.
cols
-
templ
.
cols
+
1
,
CV_32F
);
imgproc
::
matchTemplateNaive_32F_SQDIFF
(
image
,
templ
,
result
);
...
...
@@ -95,8 +103,7 @@ namespace
#ifdef BLOCK_VERSION
template
<>
void
matchTemplate
<
CV_32F
,
CV_TM_CCORR
>
(
const
GpuMat
&
image
,
const
GpuMat
&
templ
,
GpuMat
&
result
)
void
matchTemplate_32F_CCORR
(
const
GpuMat
&
image
,
const
GpuMat
&
templ
,
GpuMat
&
result
)
{
result
.
create
(
image
.
rows
-
templ
.
rows
+
1
,
image
.
cols
-
templ
.
cols
+
1
,
CV_32F
);
...
...
@@ -174,8 +181,7 @@ namespace
cudaFree
(
result_data
);
}
#else
template
<>
void
matchTemplate
<
CV_32F
,
CV_TM_CCORR
>
(
const
GpuMat
&
image
,
const
GpuMat
&
templ
,
GpuMat
&
result
)
void
matchTemplate_32F_CCORR
(
const
GpuMat
&
image
,
const
GpuMat
&
templ
,
GpuMat
&
result
)
{
Size
opt_size
;
opt_size
.
width
=
getOptimalDFTSize
(
image
.
cols
);
...
...
@@ -234,23 +240,31 @@ namespace
#endif
template
<>
void
matchTemplate
<
CV_8U
,
CV_TM_SQDIFF
>
(
const
GpuMat
&
image
,
const
GpuMat
&
templ
,
GpuMat
&
result
)
void
matchTemplate_8U_SQDIFF
(
const
GpuMat
&
image
,
const
GpuMat
&
templ
,
GpuMat
&
result
)
{
result
.
create
(
image
.
rows
-
templ
.
rows
+
1
,
image
.
cols
-
templ
.
cols
+
1
,
CV_32F
);
imgproc
::
matchTemplateNaive_8U_SQDIFF
(
image
,
templ
,
result
);
}
template
<>
void
matchTemplate
<
CV_8U
,
CV_TM_CCORR
>
(
const
GpuMat
&
image
,
const
GpuMat
&
templ
,
GpuMat
&
result
)
{
GpuMat
imagef
,
templf
;
image
.
convertTo
(
imagef
,
CV_32F
);
templ
.
convertTo
(
templf
,
CV_32F
);
matchTemplate
<
CV_32F
,
CV_TM_SQDIFF
>
(
imagef
,
templf
,
result
);
}
//GpuMat image_sum;
//GpuMat image_sumsq;
//integral(image, image_sum, image_sumsq);
//float templ_sumsq = 0.f;
//matchTemplate_8U_CCORR(image, templ, result);
//imgproc::matchTemplatePrepared_8U_SQDIFF(
// templ.cols, templ.rows, image_sumsq, templ_sumsq, result);
}
void
matchTemplate_8U_CCORR
(
const
GpuMat
&
image
,
const
GpuMat
&
templ
,
GpuMat
&
result
)
{
GpuMat
imagef
,
templf
;
image
.
convertTo
(
imagef
,
CV_32F
);
templ
.
convertTo
(
templf
,
CV_32F
);
matchTemplate_32F_CCORR
(
imagef
,
templf
,
result
);
}
}
...
...
@@ -261,10 +275,10 @@ void cv::gpu::matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& re
typedef
void
(
*
Caller
)(
const
GpuMat
&
,
const
GpuMat
&
,
GpuMat
&
);
static
const
Caller
callers8U
[]
=
{
::
matchTemplate
<
CV_8U
,
CV_TM_SQDIFF
>
,
0
,
::
matchTemplate
<
CV_8U
,
CV_TM_CCORR
>
,
0
,
0
,
0
};
static
const
Caller
callers32F
[]
=
{
::
matchTemplate
<
CV_32F
,
CV_TM_SQDIFF
>
,
0
,
::
matchTemplate
<
CV_32F
,
CV_TM_CCORR
>
,
0
,
0
,
0
};
static
const
Caller
callers8U
[]
=
{
::
matchTemplate_8U_SQDIFF
,
0
,
::
matchTemplate_8U_CCORR
,
0
,
0
,
0
};
static
const
Caller
callers32F
[]
=
{
::
matchTemplate
_32F_SQDIFF
,
0
,
::
matchTemplate
_32F_CCORR
,
0
,
0
,
0
};
const
Caller
*
callers
;
switch
(
image
.
type
())
...
...
tests/gpu/src/match_template.cpp
View file @
d557c800
...
...
@@ -77,6 +77,8 @@ struct CV_GpuMatchTemplateTest: CvTest
do
h
=
1
+
rand
()
%
30
;
while
(
h
>
n
);
do
w
=
1
+
rand
()
%
30
;
while
(
w
>
m
);
//cout << "w: " << w << " h: " << h << endl;
gen
(
image
,
n
,
m
,
CV_8U
);
gen
(
templ
,
h
,
w
,
CV_8U
);
F
(
t
=
clock
();)
...
...
@@ -107,15 +109,15 @@ struct CV_GpuMatchTemplateTest: CvTest
F
(
cout
<<
"gpu_block: "
<<
clock
()
-
t
<<
endl
;)
if
(
!
check
(
dst_gold
,
Mat
(
dst
),
0.25
f
*
h
*
w
*
1e-5
f
))
return
;
//
gen(image, n, m, CV_32F);
//
gen(templ, h, w, CV_32F);
//
F(t = clock();)
//
matchTemplate(image, templ, dst_gold, CV_TM_CCORR);
//
F(cout << "cpu:" << clock() - t << endl;)
//
F(t = clock();)
//
gpu::matchTemplate(gpu::GpuMat(image), gpu::GpuMat(templ), dst, CV_TM_CCORR);
//
F(cout << "gpu_block: " << clock() - t << endl;)
//
if (!check(dst_gold, Mat(dst), 0.25f * h * w * 1e-5f)) return;
gen
(
image
,
n
,
m
,
CV_32F
);
gen
(
templ
,
h
,
w
,
CV_32F
);
F
(
t
=
clock
();)
matchTemplate
(
image
,
templ
,
dst_gold
,
CV_TM_CCORR
);
F
(
cout
<<
"cpu:"
<<
clock
()
-
t
<<
endl
;)
F
(
t
=
clock
();)
gpu
::
matchTemplate
(
gpu
::
GpuMat
(
image
),
gpu
::
GpuMat
(
templ
),
dst
,
CV_TM_CCORR
);
F
(
cout
<<
"gpu_block: "
<<
clock
()
-
t
<<
endl
;)
if
(
!
check
(
dst_gold
,
Mat
(
dst
),
0.25
f
*
h
*
w
*
1e-5
f
))
return
;
}
}
catch
(
const
Exception
&
e
)
...
...
@@ -153,6 +155,21 @@ struct CV_GpuMatchTemplateTest: CvTest
return
false
;
}
//// Debug check
//for (int i = 0; i < a.rows; ++i)
//{
// for (int j = 0; j < a.cols; ++j)
// {
// float v1 = a.at<float>(i, j);
// float v2 = b.at<float>(i, j);
// if (fabs(v1 - v2) > max_err)
// {
// ts->printf(CvTS::CONSOLE, "%d %d %f %f\n", i, j, v1, v2);
// cin.get();
// }
// }
//}
return
true
;
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment