Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
31a78143
Commit
31a78143
authored
Aug 26, 2013
by
Vladislav Vinogradov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
used new device layer for cv::gpu::countNonZero
parent
020624c4
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
35 additions
and
159 deletions
+35
-159
countnonzero.cu
modules/cudaarithm/src/cuda/countnonzero.cu
+35
-115
reductions.cpp
modules/cudaarithm/src/reductions.cpp
+0
-44
No files found.
modules/cudaarithm/src/cuda/countnonzero.cu
View file @
31a78143
...
...
@@ -40,137 +40,57 @@
//
//M*/
#i
f !defined CUDA_DISABLER
#i
nclude "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#ifndef HAVE_OPENCV_CUDEV
using namespace cv::cuda;
using namespace cv::cuda::device;
#error "opencv_cudev is required"
namespace countNonZero
{
__device__ unsigned int blocks_finished = 0;
template <int BLOCK_SIZE, typename T>
__global__ void kernel(const PtrStepSz<T> src, unsigned int* count, const int twidth, const int theight)
{
__shared__ unsigned int scount[BLOCK_SIZE];
const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
unsigned int mycount = 0;
for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
{
const T* ptr = src.ptr(y);
for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
{
const T srcVal = ptr[x];
mycount += (srcVal != 0);
}
}
device::reduce<BLOCK_SIZE>(scount, mycount, tid, plus<unsigned int>());
#if __CUDA_ARCH__ >= 200
if (tid == 0)
::atomicAdd(count, mycount);
#else
__shared__ bool is_last;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
if (tid == 0)
{
count[bid] = mycount;
__threadfence();
unsigned int ticket = ::atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
mycount = tid < gridDim.x * gridDim.y ? count[tid] : 0;
#else
device::reduce<BLOCK_SIZE>(scount, mycount, tid, plus<unsigned int>());
#include "opencv2/cudaarithm.hpp"
#include "opencv2/cudev.hpp"
if (tid == 0)
{
count[0] = mycount;
using namespace cv::cudev;
blocks_finished = 0;
}
}
#endif
}
const int threads_x = 32;
const int threads_y = 8;
void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
namespace
{
template <typename T>
int countNonZeroImpl(const GpuMat& _src, GpuMat& _buf)
{
block = dim3(threads_x, threads_y);
grid = dim3(divUp(cols, block.x * block.y),
divUp(rows, block.y * block.x));
const GpuMat_<T>& src = (const GpuMat_<T>&) _src;
GpuMat_<int>& buf = (GpuMat_<int>&) _buf;
grid.x = ::min(grid.x, block.x);
grid.y = ::min(grid.y, block.y);
}
gridCountNonZero(src, buf);
void getBufSize(int cols, int rows, int& bufcols, int& bufrows)
{
dim3 block, grid;
getLaunchCfg(cols, rows, block, grid);
int data;
buf.download(cv::Mat(1, 1, buf.type(), &data));
bufcols = grid.x * grid.y * sizeof(int);
bufrows = 1;
return data;
}
}
template <typename T>
int run(const PtrStepSzb src, PtrStep<unsigned int> buf)
int cv::cuda::countNonZero(InputArray _src, GpuMat& buf)
{
typedef int (*func_t)(const GpuMat& _src, GpuMat& _buf);
static const func_t funcs[] =
{
dim3 block, grid;
getLaunchCfg(src.cols, src.rows, block, grid);
const int twidth = divUp(divUp(src.cols, grid.x), block.x);
const int theight = divUp(divUp(src.rows, grid.y), block.y);
countNonZeroImpl<uchar>,
countNonZeroImpl<schar>,
countNonZeroImpl<ushort>,
countNonZeroImpl<short>,
countNonZeroImpl<int>,
countNonZeroImpl<float>,
countNonZeroImpl<double>
};
unsigned int* count_buf = buf.ptr(0
);
GpuMat src = _src.getGpuMat(
);
cudaSafeCall( cudaMemset(count_buf, 0, sizeof(unsigned int))
);
CV_Assert( src.channels() == 1
);
kernel<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, count_buf, twidth, theight);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
unsigned int count;
cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(unsigned int), cudaMemcpyDeviceToHost));
return count;
}
const func_t func = funcs[src.depth()];
template int run<uchar >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<schar >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<ushort>(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<short >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<int >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<float >(const PtrStepSzb src, PtrStep<unsigned int> buf);
template int run<double>(const PtrStepSzb src, PtrStep<unsigned int> buf);
return func(src, buf);
}
#endif
// CUDA_DISABLER
#endif
modules/cudaarithm/src/reductions.cpp
View file @
31a78143
...
...
@@ -186,50 +186,6 @@ double cv::cuda::norm(InputArray _src1, InputArray _src2, GpuMat& buf, int normT
return
retVal
;
}
//////////////////////////////////////////////////////////////////////////////
// countNonZero
namespace
countNonZero
{
void
getBufSize
(
int
cols
,
int
rows
,
int
&
bufcols
,
int
&
bufrows
);
template
<
typename
T
>
int
run
(
const
PtrStepSzb
src
,
PtrStep
<
unsigned
int
>
buf
);
}
int
cv
::
cuda
::
countNonZero
(
InputArray
_src
,
GpuMat
&
buf
)
{
GpuMat
src
=
_src
.
getGpuMat
();
typedef
int
(
*
func_t
)(
const
PtrStepSzb
src
,
PtrStep
<
unsigned
int
>
buf
);
static
const
func_t
funcs
[]
=
{
::
countNonZero
::
run
<
uchar
>
,
::
countNonZero
::
run
<
schar
>
,
::
countNonZero
::
run
<
ushort
>
,
::
countNonZero
::
run
<
short
>
,
::
countNonZero
::
run
<
int
>
,
::
countNonZero
::
run
<
float
>
,
::
countNonZero
::
run
<
double
>
};
CV_Assert
(
src
.
channels
()
==
1
);
if
(
src
.
depth
()
==
CV_64F
)
{
if
(
!
deviceSupports
(
NATIVE_DOUBLE
))
CV_Error
(
cv
::
Error
::
StsUnsupportedFormat
,
"The device doesn't support double"
);
}
Size
buf_size
;
::
countNonZero
::
getBufSize
(
src
.
cols
,
src
.
rows
,
buf_size
.
width
,
buf_size
.
height
);
ensureSizeIsEnough
(
buf_size
,
CV_8U
,
buf
);
const
func_t
func
=
funcs
[
src
.
depth
()];
return
func
(
src
,
buf
);
}
//////////////////////////////////////////////////////////////////////////////
// reduce
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment