Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
72f020a8
Commit
72f020a8
authored
Nov 29, 2010
by
Alexey Spizhevoy
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
added gpu::count_non_zero version for CC1.0, refactored gpu module a little
parent
120a3b75
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
117 additions
and
35 deletions
+117
-35
gpu.hpp
modules/gpu/include/opencv2/gpu/gpu.hpp
+3
-0
arithm.cpp
modules/gpu/src/arithm.cpp
+26
-8
mathfunc.cu
modules/gpu/src/cuda/mathfunc.cu
+68
-13
initialization.cpp
modules/gpu/src/initialization.cpp
+16
-0
arithm.cpp
tests/gpu/src/arithm.cpp
+4
-14
No files found.
modules/gpu/include/opencv2/gpu/gpu.hpp
View file @
72f020a8
...
...
@@ -68,6 +68,9 @@ namespace cv
CV_EXPORTS
void
getGpuMemInfo
(
size_t
&
free
,
size_t
&
total
);
CV_EXPORTS
bool
hasNativeDoubleSupport
(
int
device
);
CV_EXPORTS
bool
hasAtomicsSupport
(
int
device
);
//////////////////////////////// Error handling ////////////////////////
CV_EXPORTS
void
error
(
const
char
*
error_string
,
const
char
*
file
,
const
int
line
,
const
char
*
func
);
...
...
modules/gpu/src/arithm.cpp
View file @
72f020a8
...
...
@@ -665,15 +665,33 @@ int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
get_buf_size_required
(
buf_size
.
width
,
buf_size
.
height
);
buf
.
create
(
buf_size
,
CV_8U
);
switch
(
src
.
type
())
int
device
=
getDevice
();
if
(
hasAtomicsSupport
(
device
))
{
switch
(
src
.
type
())
{
case
CV_8U
:
return
count_non_zero_caller
<
unsigned
char
>
(
src
,
buf
);
case
CV_8S
:
return
count_non_zero_caller
<
signed
char
>
(
src
,
buf
);
case
CV_16U
:
return
count_non_zero_caller
<
unsigned
short
>
(
src
,
buf
);
case
CV_16S
:
return
count_non_zero_caller
<
signed
short
>
(
src
,
buf
);
case
CV_32S
:
return
count_non_zero_caller
<
int
>
(
src
,
buf
);
case
CV_32F
:
return
count_non_zero_caller
<
float
>
(
src
,
buf
);
case
CV_64F
:
if
(
hasNativeDoubleSupport
(
device
))
return
count_non_zero_caller
<
double
>
(
src
,
buf
);
}
}
else
{
case
CV_8U
:
return
count_non_zero_caller
<
unsigned
char
>
(
src
,
buf
);
case
CV_8S
:
return
count_non_zero_caller
<
signed
char
>
(
src
,
buf
);
case
CV_16U
:
return
count_non_zero_caller
<
unsigned
short
>
(
src
,
buf
);
case
CV_16S
:
return
count_non_zero_caller
<
signed
short
>
(
src
,
buf
);
case
CV_32S
:
return
count_non_zero_caller
<
int
>
(
src
,
buf
);
case
CV_32F
:
return
count_non_zero_caller
<
float
>
(
src
,
buf
);
case
CV_64F
:
return
count_non_zero_caller
<
double
>
(
src
,
buf
);
switch
(
src
.
type
())
{
case
CV_8U
:
return
count_non_zero_caller_2steps
<
unsigned
char
>
(
src
,
buf
);
case
CV_8S
:
return
count_non_zero_caller_2steps
<
signed
char
>
(
src
,
buf
);
case
CV_16U
:
return
count_non_zero_caller_2steps
<
unsigned
short
>
(
src
,
buf
);
case
CV_16S
:
return
count_non_zero_caller_2steps
<
signed
short
>
(
src
,
buf
);
case
CV_32S
:
return
count_non_zero_caller_2steps
<
int
>
(
src
,
buf
);
case
CV_32F
:
return
count_non_zero_caller_2steps
<
float
>
(
src
,
buf
);
}
}
CV_Error
(
CV_StsBadArg
,
"countNonZero: unsupported type"
);
...
...
modules/gpu/src/cuda/mathfunc.cu
View file @
72f020a8
...
...
@@ -908,6 +908,27 @@ namespace cv { namespace gpu { namespace mathfunc
}
template <int size, typename T>
__device__ void sum_shared_mem(volatile T* data, const unsigned int tid)
{
T sum = data[tid];
if (size >= 512) if (tid < 256) { data[tid] = sum = sum + data[tid + 256]; } __syncthreads();
if (size >= 256) if (tid < 128) { data[tid] = sum = sum + data[tid + 128]; } __syncthreads();
if (size >= 128) if (tid < 64) { data[tid] = sum = sum + data[tid + 64]; } __syncthreads();
if (tid < 32)
{
if (size >= 64) data[tid] = sum = sum + data[tid + 32];
if (size >= 32) data[tid] = sum = sum + data[tid + 16];
if (size >= 16) data[tid] = sum = sum + data[tid + 8];
if (size >= 8) data[tid] = sum = sum + data[tid + 4];
if (size >= 4) data[tid] = sum = sum + data[tid + 2];
if (size >= 2) data[tid] = sum = sum + data[tid + 1];
}
}
template <int nthreads, typename T>
__global__ void count_non_zero_kernel(const DevMem2D src, volatile unsigned int* count)
{
...
...
@@ -928,12 +949,9 @@ namespace cv { namespace gpu { namespace mathfunc
scount[tid] = cnt;
__syncthreads();
for (unsigned int step = nthreads / 2; step > 0; step >>= 1)
{
if (tid < step) scount[tid] += scount[tid + step];
__syncthreads();
}
sum_shared_mem<nthreads, unsigned int>(scount, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
...
...
@@ -950,16 +968,12 @@ namespace cv { namespace gpu { namespace mathfunc
if (is_last)
{
scount[tid] = tid < gridDim.x * gridDim.y ? count[tid] : 0;
for (unsigned int step = nthreads / 2; step > 0; step >>= 1)
{
if (tid < step) scount[tid] += scount[tid + step];
__syncthreads();
}
sum_shared_mem<nthreads, unsigned int>(scount, tid);
if (tid == 0) count[0] = scount[0];
}
#else
if (tid == 0) count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0];
#endif
}
...
...
@@ -990,6 +1004,47 @@ namespace cv { namespace gpu { namespace mathfunc
template int count_non_zero_caller<float>(const DevMem2D, PtrStep);
template int count_non_zero_caller<double>(const DevMem2D, PtrStep);
template <int nthreads, typename T>
__global__ void count_non_zero_kernel_2ndstep(unsigned int* count, int size)
{
__shared__ unsigned int scount[nthreads];
unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
scount[tid] = tid < size ? count[tid] : 0;
sum_shared_mem<nthreads, unsigned int>(scount, tid);
if (tid == 0) count[0] = scount[0];
}
template <typename T>
int count_non_zero_caller_2steps(const DevMem2D src, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(threads, grid);
estimate_kernel_consts(src.cols, src.rows, threads, grid);
unsigned int* count_buf = (unsigned int*)buf.ptr(0);
cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));
count_non_zero_kernel<256, T><<<grid, threads>>>(src, count_buf);
count_non_zero_kernel_2ndstep<256, T><<<1, 256>>>(count_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
unsigned int count;
cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));
return count;
}
template int count_non_zero_caller_2steps<unsigned char>(const DevMem2D, PtrStep);
template int count_non_zero_caller_2steps<signed char>(const DevMem2D, PtrStep);
template int count_non_zero_caller_2steps<unsigned short>(const DevMem2D, PtrStep);
template int count_non_zero_caller_2steps<signed short>(const DevMem2D, PtrStep);
template int count_non_zero_caller_2steps<int>(const DevMem2D, PtrStep);
template int count_non_zero_caller_2steps<float>(const DevMem2D, PtrStep);
} // namespace countnonzero
}}}
modules/gpu/src/initialization.cpp
View file @
72f020a8
...
...
@@ -55,6 +55,8 @@ CV_EXPORTS int cv::gpu::getDevice() { throw_nogpu(); return 0; }
CV_EXPORTS
void
cv
::
gpu
::
getComputeCapability
(
int
/*device*/
,
int
&
/*major*/
,
int
&
/*minor*/
)
{
throw_nogpu
();
}
CV_EXPORTS
int
cv
::
gpu
::
getNumberOfSMs
(
int
/*device*/
)
{
throw_nogpu
();
return
0
;
}
CV_EXPORTS
void
cv
::
gpu
::
getGpuMemInfo
(
size_t
&
/*free*/
,
size_t
&
/*total*/
)
{
throw_nogpu
();
}
CV_EXPORTS
bool
cv
::
gpu
::
hasNativeDoubleSupport
(
int
/*device*/
)
{
throw_nogpu
();
return
false
;
}
CV_EXPORTS
bool
cv
::
gpu
::
hasAtomicsSupport
(
int
/*device*/
)
{
throw_nogpu
();
return
false
;
}
#else
/* !defined (HAVE_CUDA) */
...
...
@@ -106,5 +108,19 @@ CV_EXPORTS void cv::gpu::getGpuMemInfo(size_t& free, size_t& total)
cudaSafeCall
(
cudaMemGetInfo
(
&
free
,
&
total
)
);
}
CV_EXPORTS
bool
cv
::
gpu
::
hasNativeDoubleSupport
(
int
device
)
{
int
major
,
minor
;
getComputeCapability
(
device
,
major
,
minor
);
return
major
>
1
||
(
major
==
1
&&
minor
>=
3
);
}
CV_EXPORTS
bool
cv
::
gpu
::
hasAtomicsSupport
(
int
device
)
{
int
major
,
minor
;
getComputeCapability
(
device
,
major
,
minor
);
return
major
>
1
||
(
major
==
1
&&
minor
>=
1
);
}
#endif
tests/gpu/src/arithm.cpp
View file @
72f020a8
...
...
@@ -681,11 +681,7 @@ struct CV_GpuMinMaxTest: public CvTest
void
run
(
int
)
{
int
depth_end
;
int
major
,
minor
;
cv
::
gpu
::
getComputeCapability
(
getDevice
(),
major
,
minor
);
if
(
minor
>=
1
)
depth_end
=
CV_64F
;
else
depth_end
=
CV_32F
;
if
(
cv
::
gpu
::
hasNativeDoubleSupport
(
cv
::
gpu
::
getDevice
()))
depth_end
=
CV_64F
;
else
depth_end
=
CV_32F
;
for
(
int
cn
=
1
;
cn
<=
4
;
++
cn
)
for
(
int
depth
=
CV_8U
;
depth
<=
depth_end
;
++
depth
)
{
...
...
@@ -760,10 +756,7 @@ struct CV_GpuMinMaxLocTest: public CvTest
void
run
(
int
)
{
int
depth_end
;
int
major
,
minor
;
cv
::
gpu
::
getComputeCapability
(
getDevice
(),
major
,
minor
);
if
(
minor
>=
1
)
depth_end
=
CV_64F
;
else
depth_end
=
CV_32F
;
if
(
cv
::
gpu
::
hasNativeDoubleSupport
(
cv
::
gpu
::
getDevice
()))
depth_end
=
CV_64F
;
else
depth_end
=
CV_32F
;
for
(
int
depth
=
CV_8U
;
depth
<=
depth_end
;
++
depth
)
{
int
rows
=
1
,
cols
=
3
;
...
...
@@ -829,11 +822,8 @@ struct CV_GpuCountNonZeroTest: CvTest
{
srand
(
0
);
int
depth_end
;
int
major
,
minor
;
cv
::
gpu
::
getComputeCapability
(
getDevice
(),
major
,
minor
);
if
(
minor
>=
1
)
depth_end
=
CV_64F
;
else
depth_end
=
CV_32F
;
for
(
int
depth
=
CV_8U
;
depth
<=
depth_end
;
++
depth
)
if
(
cv
::
gpu
::
hasNativeDoubleSupport
(
cv
::
gpu
::
getDevice
()))
depth_end
=
CV_64F
;
else
depth_end
=
CV_32F
;
for
(
int
depth
=
CV_8U
;
depth
<=
CV_32F
;
++
depth
)
{
for
(
int
i
=
0
;
i
<
4
;
++
i
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment