Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in / Register
Toggle navigation
O
opencv
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Packages
Packages
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
submodule
opencv
Commits
020624c4
Commit
020624c4
authored
Aug 26, 2013
by
Vladislav Vinogradov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
used new device layer for cv::gpu::minMaxLoc
parent
e1aa2fd0
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
61 additions
and
220 deletions
+61
-220
minmaxloc.cu
modules/cudaarithm/src/cuda/minmaxloc.cu
+61
-170
reductions.cpp
modules/cudaarithm/src/reductions.cpp
+0
-50
No files found.
modules/cudaarithm/src/cuda/minmaxloc.cu
View file @
020624c4
...
@@ -40,197 +40,88 @@
...
@@ -40,197 +40,88 @@
//
//
//M*/
//M*/
#i
f !defined CUDA_DISABLER
#i
nclude "opencv2/opencv_modules.hpp"
#include "opencv2/core/cuda/common.hpp"
#ifndef HAVE_OPENCV_CUDEV
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/utility.hpp"
using namespace cv::cuda;
#error "opencv_cudev is required"
using namespace cv::cuda::device;
namespace minMaxLoc
#else
{
// To avoid shared bank conflicts we convert each value into value of
// appropriate type (32 bits minimum)
template <typename T> struct MinMaxTypeTraits;
template <> struct MinMaxTypeTraits<unsigned char> { typedef int best_type; };
template <> struct MinMaxTypeTraits<signed char> { typedef int best_type; };
template <> struct MinMaxTypeTraits<unsigned short> { typedef int best_type; };
template <> struct MinMaxTypeTraits<short> { typedef int best_type; };
template <> struct MinMaxTypeTraits<int> { typedef int best_type; };
template <> struct MinMaxTypeTraits<float> { typedef float best_type; };
template <> struct MinMaxTypeTraits<double> { typedef double best_type; };
template <int BLOCK_SIZE, typename T, class Mask>
__global__ void kernel_pass_1(const PtrStepSz<T> src, const Mask mask, T* minval, T* maxval, unsigned int* minloc, unsigned int* maxloc, const int twidth, const int theight)
{
typedef typename MinMaxTypeTraits<T>::best_type work_type;
__shared__ work_type sminval[BLOCK_SIZE];
__shared__ work_type smaxval[BLOCK_SIZE];
__shared__ unsigned int sminloc[BLOCK_SIZE];
__shared__ unsigned int smaxloc[BLOCK_SIZE];
const int x0 = blockIdx.x * blockDim.x * twidth + threadIdx.x;
#include "opencv2/cudaarithm.hpp"
const int y0 = blockIdx.y * blockDim.y * theight + threadIdx.y;
#include "opencv2/cudev.hpp"
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
using namespace cv::cudev;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
work_type mymin = numeric_limits<work_type>::max();
namespace
work_type mymax = -numeric_limits<work_type>::max();
{
unsigned int myminloc = 0;
template <typename T>
unsigned int mymaxloc = 0;
void minMaxLocImpl(const GpuMat& _src, const GpuMat& mask, GpuMat& _valBuf, GpuMat& _locBuf, double* minVal, double* maxVal, cv::Point* minLoc, cv::Point* maxLoc)
for (int i = 0, y = y0; i < theight && y < src.rows; ++i, y += blockDim.y)
{
const T* ptr = src.ptr(y);
for (int j = 0, x = x0; j < twidth && x < src.cols; ++j, x += blockDim.x)
{
if (mask(y, x))
{
const work_type srcVal = ptr[x];
if (srcVal < mymin)
{
mymin = srcVal;
myminloc = y * src.cols + x;
}
if (srcVal > mymax)
{
mymax = srcVal;
mymaxloc = y * src.cols + x;
}
}
}
}
reduceKeyVal<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax),
smem_tuple(sminloc, smaxloc), thrust::tie(myminloc, mymaxloc),
tid,
thrust::make_tuple(less<work_type>(), greater<work_type>()));
if (tid == 0)
{
minval[bid] = (T) mymin;
maxval[bid] = (T) mymax;
minloc[bid] = myminloc;
maxloc[bid] = mymaxloc;
}
}
template <int BLOCK_SIZE, typename T>
__global__ void kernel_pass_2(T* minval, T* maxval, unsigned int* minloc, unsigned int* maxloc, int count)
{
{
typedef typename MinMaxTypeTraits<T>::best_type work_type;
typedef typename SelectIf<
TypesEquals<T, double>::value,
double,
typename SelectIf<TypesEquals<T, float>::value, float, int>::type
>::type work_type;
const GpuMat_<T>& src = (const GpuMat_<T>&) _src;
GpuMat_<work_type>& valBuf = (GpuMat_<work_type>&) _valBuf;
GpuMat_<int>& locBuf = (GpuMat_<int>&) _locBuf;
if (mask.empty())
gridMinMaxLoc(src, valBuf, locBuf);
else
gridMinMaxLoc(src, valBuf, locBuf, globPtr<uchar>(mask));
__shared__ work_type sminval[BLOCK_SIZE];
cv::Mat_<work_type> h_valBuf;
__shared__ work_type smaxval[BLOCK_SIZE];
cv::Mat_<int> h_locBuf;
__shared__ unsigned int sminloc[BLOCK_SIZE];
__shared__ unsigned int smaxloc[BLOCK_SIZE];
unsigned int idx = ::min(threadIdx.x, count - 1);
valBuf.download(h_valBuf);
locBuf.download(h_locBuf);
work_type mymin = minval[idx];
if (minVal)
work_type mymax = maxval[idx];
*minVal = h_valBuf(0, 0);
unsigned int myminloc = minloc[idx];
unsigned int mymaxloc = maxloc[idx];
reduceKeyVal<BLOCK_SIZE>(smem_tuple(sminval, smaxval), thrust::tie(mymin, mymax),
if (maxVal)
smem_tuple(sminloc, smaxloc), thrust::tie(myminloc, mymaxloc),
*maxVal = h_valBuf(1, 0);
threadIdx.x,
thrust::make_tuple(less<work_type>(), greater<work_type>()));
if (
threadIdx.x == 0
)
if (
minLoc
)
{
{
minval[0] = (T) mymin;
const int idx = h_locBuf(0, 0);
maxval[0] = (T) mymax;
*minLoc = cv::Point(idx % src.cols, idx / src.cols);
minloc[0] = myminloc;
maxloc[0] = mymaxloc;
}
}
}
const int threads_x = 32;
const int threads_y = 8;
void getLaunchCfg(int cols, int rows, dim3& block, dim3& grid)
{
block = dim3(threads_x, threads_y);
grid = dim3(divUp(cols, block.x * block.y),
if (maxLoc)
divUp(rows, block.y * block.x));
{
const int idx = h_locBuf(1, 0);
grid.x = ::min(grid.x, block.x);
*maxLoc = cv::Point(idx % src.cols, idx / src.cols);
grid.y = ::min(grid.y, block.y);
}
}
void getBufSize(int cols, int rows, size_t elem_size, int& b1cols, int& b1rows, int& b2cols, int& b2rows)
{
dim3 block, grid;
getLaunchCfg(cols, rows, block, grid);
// For values
b1cols = (int)(grid.x * grid.y * elem_size);
b1rows = 2;
// For locations
b2cols = grid.x * grid.y * sizeof(int);
b2rows = 2;
}
}
}
template <typename T>
void cv::cuda::minMaxLoc(InputArray _src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, InputArray _mask, GpuMat& valBuf, GpuMat& locBuf)
void run(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf)
{
typedef void (*func_t)(const GpuMat& _src, const GpuMat& mask, GpuMat& _valBuf, GpuMat& _locBuf, double* minVal, double* maxVal, cv::Point* minLoc, cv::Point* maxLoc);
static const func_t funcs[] =
{
{
dim3 block, grid;
minMaxLocImpl<uchar>,
getLaunchCfg(src.cols, src.rows, block, grid);
minMaxLocImpl<schar>,
minMaxLocImpl<ushort>,
const int twidth = divUp(divUp(src.cols, grid.x), block.x);
minMaxLocImpl<short>,
const int theight = divUp(divUp(src.rows, grid.y), block.y);
minMaxLocImpl<int>,
minMaxLocImpl<float>,
minMaxLocImpl<double>
};
T* minval_buf = (T*) valbuf.ptr(0);
GpuMat src = _src.getGpuMat();
T* maxval_buf = (T*) valbuf.ptr(1);
GpuMat mask = _mask.getGpuMat();
unsigned int* minloc_buf = locbuf.ptr(0);
unsigned int* maxloc_buf = locbuf.ptr(1);
if (mask.data)
kernel_pass_1<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, SingleMask(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf, twidth, theight);
else
kernel_pass_1<threads_x * threads_y><<<grid, block>>>((PtrStepSz<T>) src, WithOutMask(), minval_buf, maxval_buf, minloc_buf, maxloc_buf, twidth, theight);
cudaSafeCall( cudaGetLastError() );
CV_Assert( src.channels() == 1 );
CV_DbgAssert( mask.empty() || (mask.size() == src.size() && mask.type() == CV_8U) );
kernel_pass_2<threads_x * threads_y><<<1, threads_x * threads_y>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
const func_t func = funcs[src.depth()];
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaDeviceSynchronize() );
T minval_, maxval_;
cudaSafeCall( cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost) );
*minval = minval_;
*maxval = maxval_;
unsigned int minloc_, maxloc_;
cudaSafeCall( cudaMemcpy(&minloc_, minloc_buf, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
cudaSafeCall( cudaMemcpy(&maxloc_, maxloc_buf, sizeof(unsigned int), cudaMemcpyDeviceToHost) );
minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
template void run<unsigned char >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
func(src, mask, valBuf, locBuf, minVal, maxVal, minLoc, maxLoc);
template void run<signed char >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<unsigned short>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<short >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<int >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<float >(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
template void run<double>(const PtrStepSzb src, const PtrStepb mask, double* minval, double* maxval, int* minloc, int* maxloc, PtrStepb valbuf, PtrStep<unsigned int> locbuf);
}
}
#endif
// CUDA_DISABLER
#endif
modules/cudaarithm/src/reductions.cpp
View file @
020624c4
...
@@ -186,56 +186,6 @@ double cv::cuda::norm(InputArray _src1, InputArray _src2, GpuMat& buf, int normT
...
@@ -186,56 +186,6 @@ double cv::cuda::norm(InputArray _src1, InputArray _src2, GpuMat& buf, int normT
return
retVal
;
return
retVal
;
}
}
////////////////////////////////////////////////////////////////////////
// minMaxLoc
namespace
minMaxLoc
{
void
getBufSize
(
int
cols
,
int
rows
,
size_t
elem_size
,
int
&
b1cols
,
int
&
b1rows
,
int
&
b2cols
,
int
&
b2rows
);
template
<
typename
T
>
void
run
(
const
PtrStepSzb
src
,
const
PtrStepb
mask
,
double
*
minval
,
double
*
maxval
,
int
*
minloc
,
int
*
maxloc
,
PtrStepb
valbuf
,
PtrStep
<
unsigned
int
>
locbuf
);
}
void
cv
::
cuda
::
minMaxLoc
(
InputArray
_src
,
double
*
minVal
,
double
*
maxVal
,
Point
*
minLoc
,
Point
*
maxLoc
,
InputArray
_mask
,
GpuMat
&
valBuf
,
GpuMat
&
locBuf
)
{
GpuMat
src
=
_src
.
getGpuMat
();
GpuMat
mask
=
_mask
.
getGpuMat
();
typedef
void
(
*
func_t
)(
const
PtrStepSzb
src
,
const
PtrStepb
mask
,
double
*
minval
,
double
*
maxval
,
int
*
minloc
,
int
*
maxloc
,
PtrStepb
valbuf
,
PtrStep
<
unsigned
int
>
locbuf
);
static
const
func_t
funcs
[]
=
{
::
minMaxLoc
::
run
<
uchar
>
,
::
minMaxLoc
::
run
<
schar
>
,
::
minMaxLoc
::
run
<
ushort
>
,
::
minMaxLoc
::
run
<
short
>
,
::
minMaxLoc
::
run
<
int
>
,
::
minMaxLoc
::
run
<
float
>
,
::
minMaxLoc
::
run
<
double
>
};
CV_Assert
(
src
.
channels
()
==
1
);
CV_Assert
(
mask
.
empty
()
||
(
mask
.
size
()
==
src
.
size
()
&&
mask
.
type
()
==
CV_8U
)
);
if
(
src
.
depth
()
==
CV_64F
)
{
if
(
!
deviceSupports
(
NATIVE_DOUBLE
))
CV_Error
(
cv
::
Error
::
StsUnsupportedFormat
,
"The device doesn't support double"
);
}
Size
valbuf_size
,
locbuf_size
;
::
minMaxLoc
::
getBufSize
(
src
.
cols
,
src
.
rows
,
src
.
elemSize
(),
valbuf_size
.
width
,
valbuf_size
.
height
,
locbuf_size
.
width
,
locbuf_size
.
height
);
ensureSizeIsEnough
(
valbuf_size
,
CV_8U
,
valBuf
);
ensureSizeIsEnough
(
locbuf_size
,
CV_8U
,
locBuf
);
const
func_t
func
=
funcs
[
src
.
depth
()];
double
temp1
,
temp2
;
Point
temp3
,
temp4
;
func
(
src
,
mask
,
minVal
?
minVal
:
&
temp1
,
maxVal
?
maxVal
:
&
temp2
,
minLoc
?
&
minLoc
->
x
:
&
temp3
.
x
,
maxLoc
?
&
maxLoc
->
x
:
&
temp4
.
x
,
valBuf
,
locBuf
);
}
//////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////
// countNonZero
// countNonZero
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment