submodule / opencv / Commits / df852937

Commit df852937 authored Dec 20, 2010 by Alexey Spizhevoy
refactoring: moved gpu reduction-based functions into separated file
parent 1922e50f
Showing 7 changed files with 2398 additions and 2279 deletions
gpu.hpp                 modules/gpu/include/opencv2/gpu/gpu.hpp        +74    -71
arithm.cpp              modules/gpu/src/arithm.cpp                      +0   -508
element_operations.cu   modules/gpu/src/cuda/element_operations.cu    +123     -0
mathfunc.cu             modules/gpu/src/cuda/mathfunc.cu               +19  -1696
matrix_reductions.cu    modules/gpu/src/cuda/matrix_reductions.cu    +1610     -0
element_operations.cpp  modules/gpu/src/element_operations.cpp        +149     -4
matrix_reductions.cpp   modules/gpu/src/matrix_reductions.cpp         +423     -0
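For orientation, here is a minimal usage sketch of the reduction API this commit relocates. The signatures come from the gpu.hpp declarations in the diff below; the input file name and the host-side setup are assumptions, not part of the commit:

#include <opencv2/gpu/gpu.hpp>
#include <opencv2/highgui/highgui.hpp>

int main()
{
    cv::Mat host = cv::imread("image.png", 0);   // hypothetical 8-bit, single-channel input
    cv::gpu::GpuMat src(host);                   // constructor uploads to the device

    double minVal = 0, maxVal = 0;
    cv::gpu::minMax(src, &minVal, &maxVal);      // global reduction on the GPU

    cv::Scalar total = cv::gpu::sum(src);        // sum of all elements
    int nonZero = cv::gpu::countNonZero(src);

    // Every reduction also has an overload taking a caller-owned buffer,
    // so the intermediate storage can be reused across repeated calls:
    cv::gpu::GpuMat buf;
    cv::Scalar total2 = cv::gpu::sum(src, buf);

    (void)total; (void)nonZero; (void)total2;
    return 0;
}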
modules/gpu/include/opencv2/gpu/gpu.hpp
...
@@ -360,66 +360,17 @@ namespace cv
        friend struct StreamAccessor;
    };

    ////////////////////////////// Arithmetics ///////////////////////////////////

    //! transposes the matrix
    //! supports CV_8UC1, CV_8SC1, CV_8UC4, CV_8SC4, CV_16UC2, CV_16SC2, CV_32SC1, CV_32FC1 type
    CV_EXPORTS void transpose(const GpuMat& src1, GpuMat& dst);

    //! computes mean value and standard deviation of all or selected array elements
    //! supports only CV_8UC1 type
    CV_EXPORTS void meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev);

    //! computes norm of array
    //! supports NORM_INF, NORM_L1, NORM_L2
    //! supports only CV_8UC1 type
    CV_EXPORTS double norm(const GpuMat& src1, int normType=NORM_L2);

    //! computes norm of the difference between two arrays
    //! supports NORM_INF, NORM_L1, NORM_L2
    //! supports only CV_8UC1 type
    CV_EXPORTS double norm(const GpuMat& src1, const GpuMat& src2, int normType=NORM_L2);

    //! reverses the order of the rows, columns or both in a matrix
    //! supports CV_8UC1, CV_8UC4 types
    CV_EXPORTS void flip(const GpuMat& a, GpuMat& b, int flipCode);

    //! computes sum of array elements
    //! supports only single channel images
    CV_EXPORTS Scalar sum(const GpuMat& src);

    //! computes sum of array elements
    //! supports only single channel images
    CV_EXPORTS Scalar sum(const GpuMat& src, GpuMat& buf);

    //! computes squared sum of array elements
    //! supports only single channel images
    CV_EXPORTS Scalar sqrSum(const GpuMat& src);

    //! computes squared sum of array elements
    //! supports only single channel images
    CV_EXPORTS Scalar sqrSum(const GpuMat& src, GpuMat& buf);

    //! finds global minimum and maximum array elements and returns their values
    CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal=0, const GpuMat& mask=GpuMat());

    //! finds global minimum and maximum array elements and returns their values
    CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf);

    //! finds global minimum and maximum array elements and returns their values with locations
    CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0, const GpuMat& mask=GpuMat());

    //! finds global minimum and maximum array elements and returns their values with locations
    CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, const GpuMat& mask, GpuMat& valbuf, GpuMat& locbuf);

    //! counts non-zero array elements
    CV_EXPORTS int countNonZero(const GpuMat& src);

    //! counts non-zero array elements
    CV_EXPORTS int countNonZero(const GpuMat& src, GpuMat& buf);

    //! transforms 8-bit unsigned integers using lookup table: dst(i)=lut(src(i))
    //! destination array will have the depth type as lut and the same channels number as source
    //! supports CV_8UC1, CV_8UC3 types
...
@@ -487,25 +438,6 @@ namespace cv
    //! async version
    CV_EXPORTS void polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat& x, GpuMat& y, bool angleInDegrees, const Stream& stream);

    //! computes per-element minimum of two arrays (dst = min(src1, src2))
    CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst);
    //! Async version
    CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream);

    //! computes per-element minimum of array and scalar (dst = min(src1, src2))
    CV_EXPORTS void min(const GpuMat& src1, double src2, GpuMat& dst);
    //! Async version
    CV_EXPORTS void min(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream);

    //! computes per-element maximum of two arrays (dst = max(src1, src2))
    CV_EXPORTS void max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst);
    //! Async version
    CV_EXPORTS void max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream);

    //! computes per-element maximum of array and scalar (dst = max(src1, src2))
    CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst);
    //! Async version
    CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream);

    //////////////////////////// Per-element operations ////////////////////////////////////
...
@@ -576,6 +508,26 @@ namespace cv
    //! async version
    CV_EXPORTS void bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const GpuMat& mask, const Stream& stream);

    //! computes per-element minimum of two arrays (dst = min(src1, src2))
    CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst);
    //! Async version
    CV_EXPORTS void min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream);

    //! computes per-element minimum of array and scalar (dst = min(src1, src2))
    CV_EXPORTS void min(const GpuMat& src1, double src2, GpuMat& dst);
    //! Async version
    CV_EXPORTS void min(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream);

    //! computes per-element maximum of two arrays (dst = max(src1, src2))
    CV_EXPORTS void max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst);
    //! Async version
    CV_EXPORTS void max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream);

    //! computes per-element maximum of array and scalar (dst = max(src1, src2))
    CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst);
    //! Async version
    CV_EXPORTS void max(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream);

    ////////////////////////////// Image processing //////////////////////////////
...
@@ -663,15 +615,66 @@ namespace cv
    //! computes Harris cornerness criteria at each image pixel
    CV_EXPORTS void cornerHarris(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, double k, int borderType=BORDER_REFLECT101);

    //! computes minimum eigen value of 2x2 derivative covariation matrix at each pixel - the cornerness criteria
    CV_EXPORTS void cornerMinEigenVal(const GpuMat& src, GpuMat& dst, int blockSize, int ksize, int borderType=BORDER_REFLECT101);

    //! computes the proximity map for the raster template and the image where the template is searched for
    CV_EXPORTS void matchTemplate(const GpuMat& image, const GpuMat& templ, GpuMat& result, int method);

    ////////////////////////////// Matrix reductions //////////////////////////////

    //! computes mean value and standard deviation of all or selected array elements
    //! supports only CV_8UC1 type
    CV_EXPORTS void meanStdDev(const GpuMat& mtx, Scalar& mean, Scalar& stddev);

    //! computes norm of array
    //! supports NORM_INF, NORM_L1, NORM_L2
    //! supports only CV_8UC1 type
    CV_EXPORTS double norm(const GpuMat& src1, int normType=NORM_L2);

    //! computes norm of the difference between two arrays
    //! supports NORM_INF, NORM_L1, NORM_L2
    //! supports only CV_8UC1 type
    CV_EXPORTS double norm(const GpuMat& src1, const GpuMat& src2, int normType=NORM_L2);

    //! computes sum of array elements
    //! supports only single channel images
    CV_EXPORTS Scalar sum(const GpuMat& src);

    //! computes sum of array elements
    //! supports only single channel images
    CV_EXPORTS Scalar sum(const GpuMat& src, GpuMat& buf);

    //! computes squared sum of array elements
    //! supports only single channel images
    CV_EXPORTS Scalar sqrSum(const GpuMat& src);

    //! computes squared sum of array elements
    //! supports only single channel images
    CV_EXPORTS Scalar sqrSum(const GpuMat& src, GpuMat& buf);

    //! finds global minimum and maximum array elements and returns their values
    CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal=0, const GpuMat& mask=GpuMat());

    //! finds global minimum and maximum array elements and returns their values
    CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf);

    //! finds global minimum and maximum array elements and returns their values with locations
    CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0, const GpuMat& mask=GpuMat());

    //! finds global minimum and maximum array elements and returns their values with locations
    CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, const GpuMat& mask, GpuMat& valbuf, GpuMat& locbuf);

    //! counts non-zero array elements
    CV_EXPORTS int countNonZero(const GpuMat& src);

    //! counts non-zero array elements
    CV_EXPORTS int countNonZero(const GpuMat& src, GpuMat& buf);

    //////////////////////////////// Filter Engine ////////////////////////////////

    /*!
...
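One detail of the header layout above deserves a note: every reduction comes in two flavors, one that allocates its intermediate buffer internally and one that takes a GpuMat& the caller owns. A small sketch of the intended reuse pattern in a per-frame loop; nextFrame() is a hypothetical frame source, not part of the diff:

// valbuf/locbuf are created on first use; GpuMat::create is a no-op when the
// requested size already matches, so later iterations allocate nothing.
cv::gpu::GpuMat frame, valbuf, locbuf;
for (int i = 0; i < 100; ++i)
{
    frame.upload(nextFrame(i));                  // nextFrame() is hypothetical
    double mn, mx;
    cv::Point pmin, pmax;
    cv::gpu::minMaxLoc(frame, &mn, &mx, &pmin, &pmax, cv::gpu::GpuMat(), valbuf, locbuf);
}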
modules/gpu/src/arithm.cpp
...
@@ -49,20 +49,7 @@ using namespace std;
#if !defined (HAVE_CUDA)
void cv::gpu::transpose(const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::meanStdDev(const GpuMat&, Scalar&, Scalar&) { throw_nogpu(); }
double cv::gpu::norm(const GpuMat&, int) { throw_nogpu(); return 0.0; }
double cv::gpu::norm(const GpuMat&, const GpuMat&, int) { throw_nogpu(); return 0.0; }
void cv::gpu::flip(const GpuMat&, GpuMat&, int) { throw_nogpu(); }
Scalar cv::gpu::sum(const GpuMat&) { throw_nogpu(); return Scalar(); }
Scalar cv::gpu::sum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); }
Scalar cv::gpu::sqrSum(const GpuMat&) { throw_nogpu(); return Scalar(); }
Scalar cv::gpu::sqrSum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); }
void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&) { throw_nogpu(); }
void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&) { throw_nogpu(); }
void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&, GpuMat&, GpuMat&) { throw_nogpu(); }
int cv::gpu::countNonZero(const GpuMat&) { throw_nogpu(); return 0; }
int cv::gpu::countNonZero(const GpuMat&, GpuMat&) { throw_nogpu(); return 0; }
void cv::gpu::LUT(const GpuMat&, const Mat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::exp(const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::log(const GpuMat&, GpuMat&) { throw_nogpu(); }
...
@@ -78,14 +65,6 @@ void cv::gpu::cartToPolar(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool)
void cv::gpu::cartToPolar(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, const Stream&) { throw_nogpu(); }
void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool) { throw_nogpu(); }
void cv::gpu::polarToCart(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, bool, const Stream&) { throw_nogpu(); }
void cv::gpu::min(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::min(const GpuMat&, const GpuMat&, GpuMat&, const Stream&) { throw_nogpu(); }
void cv::gpu::min(const GpuMat&, double, GpuMat&) { throw_nogpu(); }
void cv::gpu::min(const GpuMat&, double, GpuMat&, const Stream&) { throw_nogpu(); }
void cv::gpu::max(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::max(const GpuMat&, const GpuMat&, GpuMat&, const Stream&) { throw_nogpu(); }
void cv::gpu::max(const GpuMat&, double, GpuMat&) { throw_nogpu(); }
void cv::gpu::max(const GpuMat&, double, GpuMat&, const Stream&) { throw_nogpu(); }
#else /* !defined (HAVE_CUDA) */
...
@@ -118,54 +97,6 @@ void cv::gpu::transpose(const GpuMat& src, GpuMat& dst)
}
}
////////////////////////////////////////////////////////////////////////
// meanStdDev

void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev)
{
    CV_Assert(src.type() == CV_8UC1);

    NppiSize sz;
    sz.width  = src.cols;
    sz.height = src.rows;

    nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), src.step, sz, mean.val, stddev.val) );
}

////////////////////////////////////////////////////////////////////////
// norm

double cv::gpu::norm(const GpuMat& src1, int normType)
{
    return norm(src1, GpuMat(src1.size(), src1.type(), Scalar::all(0.0)), normType);
}

double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
{
    CV_DbgAssert(src1.size() == src2.size() && src1.type() == src2.type());
    CV_Assert(src1.type() == CV_8UC1);
    CV_Assert(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2);

    typedef NppStatus (*npp_norm_diff_func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2,
                                              NppiSize oSizeROI, Npp64f* pRetVal);

    static const npp_norm_diff_func_t npp_norm_diff_func[] = {nppiNormDiff_Inf_8u_C1R, nppiNormDiff_L1_8u_C1R, nppiNormDiff_L2_8u_C1R};

    NppiSize sz;
    sz.width  = src1.cols;
    sz.height = src1.rows;

    int funcIdx = normType >> 1;  // NORM_INF (1), NORM_L1 (2), NORM_L2 (4) map to table indices 0, 1, 2
    double retVal;

    nppSafeCall( npp_norm_diff_func[funcIdx](src1.ptr<Npp8u>(), src1.step, src2.ptr<Npp8u>(), src2.step, sz, &retVal) );

    return retVal;
}

////////////////////////////////////////////////////////////////////////
// flip
...
@@ -193,305 +124,6 @@ void cv::gpu::flip(const GpuMat& src, GpuMat& dst, int flipCode)
}
}
////////////////////////////////////////////////////////////////////////
// sum

namespace cv { namespace gpu { namespace mathfunc
{
    template <typename T>
    void sum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);

    template <typename T>
    void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);

    template <typename T>
    void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);

    template <typename T>
    void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);

    namespace sum
    {
        void get_buf_size_required(int cols, int rows, int cn, int& bufcols, int& bufrows);
    }
}}}

Scalar cv::gpu::sum(const GpuMat& src)
{
    GpuMat buf;
    return sum(src, buf);
}

Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
{
    using namespace mathfunc;

    typedef void (*Caller)(const DevMem2D, PtrStep, double*, int);

    static const Caller callers[2][7] =
        { { sum_multipass_caller<unsigned char>, sum_multipass_caller<char>,
            sum_multipass_caller<unsigned short>, sum_multipass_caller<short>,
            sum_multipass_caller<int>, sum_multipass_caller<float>, 0 },
          { sum_caller<unsigned char>, sum_caller<char>,
            sum_caller<unsigned short>, sum_caller<short>,
            sum_caller<int>, sum_caller<float>, 0 } };

    Size bufSize;
    sum::get_buf_size_required(src.cols, src.rows, src.channels(), bufSize.width, bufSize.height);
    buf.create(bufSize, CV_8U);

    Caller caller = callers[hasAtomicsSupport(getDevice())][src.depth()];
    if (!caller) CV_Error(CV_StsBadArg, "sum: unsupported type");

    double result[4];
    caller(src, buf, result, src.channels());
    return Scalar(result[0], result[1], result[2], result[3]);
}

Scalar cv::gpu::sqrSum(const GpuMat& src)
{
    GpuMat buf;
    return sqrSum(src, buf);
}

Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
{
    using namespace mathfunc;

    typedef void (*Caller)(const DevMem2D, PtrStep, double*, int);

    static const Caller callers[2][7] =
        { { sqsum_multipass_caller<unsigned char>, sqsum_multipass_caller<char>,
            sqsum_multipass_caller<unsigned short>, sqsum_multipass_caller<short>,
            sqsum_multipass_caller<int>, sqsum_multipass_caller<float>, 0 },
          { sqsum_caller<unsigned char>, sqsum_caller<char>,
            sqsum_caller<unsigned short>, sqsum_caller<short>,
            sqsum_caller<int>, sqsum_caller<float>, 0 } };

    Size bufSize;
    sum::get_buf_size_required(src.cols, src.rows, src.channels(), bufSize.width, bufSize.height);
    buf.create(bufSize, CV_8U);

    Caller caller = callers[hasAtomicsSupport(getDevice())][src.depth()];
    if (!caller) CV_Error(CV_StsBadArg, "sqrSum: unsupported type");

    double result[4];
    caller(src, buf, result, src.channels());
    return Scalar(result[0], result[1], result[2], result[3]);
}

////////////////////////////////////////////////////////////////////////
// minMax

namespace cv { namespace gpu { namespace mathfunc { namespace minmax
{
    void get_buf_size_required(int cols, int rows, int elem_size, int& bufcols, int& bufrows);

    template <typename T>
    void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf);

    template <typename T>
    void min_max_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf);

    template <typename T>
    void min_max_multipass_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf);

    template <typename T>
    void min_max_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf);
}}}}

void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask)
{
    GpuMat buf;
    minMax(src, minVal, maxVal, mask, buf);
}

void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf)
{
    using namespace mathfunc::minmax;

    typedef void (*Caller)(const DevMem2D, double*, double*, PtrStep);
    typedef void (*MaskedCaller)(const DevMem2D, const PtrStep, double*, double*, PtrStep);

    static const Caller callers[2][7] =
        { { min_max_multipass_caller<unsigned char>, min_max_multipass_caller<char>,
            min_max_multipass_caller<unsigned short>, min_max_multipass_caller<short>,
            min_max_multipass_caller<int>, min_max_multipass_caller<float>, 0 },
          { min_max_caller<unsigned char>, min_max_caller<char>,
            min_max_caller<unsigned short>, min_max_caller<short>,
            min_max_caller<int>, min_max_caller<float>, min_max_caller<double> } };

    static const MaskedCaller masked_callers[2][7] =
        { { min_max_mask_multipass_caller<unsigned char>, min_max_mask_multipass_caller<char>,
            min_max_mask_multipass_caller<unsigned short>, min_max_mask_multipass_caller<short>,
            min_max_mask_multipass_caller<int>, min_max_mask_multipass_caller<float>, 0 },
          { min_max_mask_caller<unsigned char>, min_max_mask_caller<char>,
            min_max_mask_caller<unsigned short>, min_max_mask_caller<short>,
            min_max_mask_caller<int>, min_max_mask_caller<float>, min_max_mask_caller<double> } };

    CV_Assert(src.channels() == 1);
    CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));
    CV_Assert(src.type() != CV_64F || hasNativeDoubleSupport(getDevice()));

    double minVal_; if (!minVal) minVal = &minVal_;
    double maxVal_; if (!maxVal) maxVal = &maxVal_;

    Size bufSize;
    get_buf_size_required(src.cols, src.rows, src.elemSize(), bufSize.width, bufSize.height);
    buf.create(bufSize, CV_8U);

    if (mask.empty())
    {
        Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];
        if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type");
        caller(src, minVal, maxVal, buf);
    }
    else
    {
        MaskedCaller caller = masked_callers[hasAtomicsSupport(getDevice())][src.type()];
        if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type");
        caller(src, mask, minVal, maxVal, buf);
    }
}

////////////////////////////////////////////////////////////////////////
// minMaxLoc

namespace cv { namespace gpu { namespace mathfunc { namespace minmaxloc
{
    void get_buf_size_required(int cols, int rows, int elem_size, int& b1cols,
                               int& b1rows, int& b2cols, int& b2rows);

    template <typename T>
    void min_max_loc_caller(const DevMem2D src, double* minval, double* maxval,
                            int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf);

    template <typename T>
    void min_max_loc_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
                                 int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf);

    template <typename T>
    void min_max_loc_multipass_caller(const DevMem2D src, double* minval, double* maxval,
                                      int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf);

    template <typename T>
    void min_max_loc_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
                                           int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf);
}}}}

void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, const GpuMat& mask)
{
    GpuMat valbuf, locbuf;
    minMaxLoc(src, minVal, maxVal, minLoc, maxLoc, mask, valbuf, locbuf);
}

void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
                        const GpuMat& mask, GpuMat& valbuf, GpuMat& locbuf)
{
    using namespace mathfunc::minmaxloc;

    typedef void (*Caller)(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
    typedef void (*MaskedCaller)(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);

    static const Caller callers[2][7] =
        { { min_max_loc_multipass_caller<unsigned char>, min_max_loc_multipass_caller<char>,
            min_max_loc_multipass_caller<unsigned short>, min_max_loc_multipass_caller<short>,
            min_max_loc_multipass_caller<int>, min_max_loc_multipass_caller<float>, 0 },
          { min_max_loc_caller<unsigned char>, min_max_loc_caller<char>,
            min_max_loc_caller<unsigned short>, min_max_loc_caller<short>,
            min_max_loc_caller<int>, min_max_loc_caller<float>, min_max_loc_caller<double> } };

    static const MaskedCaller masked_callers[2][7] =
        { { min_max_loc_mask_multipass_caller<unsigned char>, min_max_loc_mask_multipass_caller<char>,
            min_max_loc_mask_multipass_caller<unsigned short>, min_max_loc_mask_multipass_caller<short>,
            min_max_loc_mask_multipass_caller<int>, min_max_loc_mask_multipass_caller<float>, 0 },
          { min_max_loc_mask_caller<unsigned char>, min_max_loc_mask_caller<char>,
            min_max_loc_mask_caller<unsigned short>, min_max_loc_mask_caller<short>,
            min_max_loc_mask_caller<int>, min_max_loc_mask_caller<float>, min_max_loc_mask_caller<double> } };

    CV_Assert(src.channels() == 1);
    CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));
    CV_Assert(src.type() != CV_64F || hasNativeDoubleSupport(getDevice()));

    double minVal_; if (!minVal) minVal = &minVal_;
    double maxVal_; if (!maxVal) maxVal = &maxVal_;
    int minLoc_[2];
    int maxLoc_[2];

    Size valbuf_size, locbuf_size;
    get_buf_size_required(src.cols, src.rows, src.elemSize(), valbuf_size.width, valbuf_size.height,
                          locbuf_size.width, locbuf_size.height);
    valbuf.create(valbuf_size, CV_8U);
    locbuf.create(locbuf_size, CV_8U);

    if (mask.empty())
    {
        Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];
        if (!caller) CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type");
        caller(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf);
    }
    else
    {
        MaskedCaller caller = masked_callers[hasAtomicsSupport(getDevice())][src.type()];
        if (!caller) CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type");
        caller(src, mask, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf);
    }

    if (minLoc) { minLoc->x = minLoc_[0]; minLoc->y = minLoc_[1]; }
    if (maxLoc) { maxLoc->x = maxLoc_[0]; maxLoc->y = maxLoc_[1]; }
}

////////////////////////////////////////////////////////////////////////
// Count non zero

namespace cv { namespace gpu { namespace mathfunc { namespace countnonzero
{
    void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows);

    template <typename T>
    int count_non_zero_caller(const DevMem2D src, PtrStep buf);

    template <typename T>
    int count_non_zero_multipass_caller(const DevMem2D src, PtrStep buf);
}}}}

int cv::gpu::countNonZero(const GpuMat& src)
{
    GpuMat buf;
    return countNonZero(src, buf);
}

int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
{
    using namespace mathfunc::countnonzero;

    typedef int (*Caller)(const DevMem2D src, PtrStep buf);

    static const Caller callers[2][7] =
        { { count_non_zero_multipass_caller<unsigned char>, count_non_zero_multipass_caller<char>,
            count_non_zero_multipass_caller<unsigned short>, count_non_zero_multipass_caller<short>,
            count_non_zero_multipass_caller<int>, count_non_zero_multipass_caller<float>, 0 },
          { count_non_zero_caller<unsigned char>, count_non_zero_caller<char>,
            count_non_zero_caller<unsigned short>, count_non_zero_caller<short>,
            count_non_zero_caller<int>, count_non_zero_caller<float>, count_non_zero_caller<double> } };

    CV_Assert(src.channels() == 1);
    CV_Assert(src.type() != CV_64F || hasNativeDoubleSupport(getDevice()));

    Size buf_size;
    get_buf_size_required(src.cols, src.rows, buf_size.width, buf_size.height);
    buf.create(buf_size, CV_8U);

    Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];
    if (!caller) CV_Error(CV_StsBadArg, "countNonZero: unsupported type");
    return caller(src, buf);
}

////////////////////////////////////////////////////////////////////////
// LUT
...
@@ -711,144 +343,4 @@ void cv::gpu::polarToCart(const GpuMat& magnitude, const GpuMat& angle, GpuMat&
}
//////////////////////////////////////////////////////////////////////////////
// min/max

namespace cv { namespace gpu { namespace mathfunc
{
    template <typename T>
    void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);

    template <typename T>
    void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);

    template <typename T>
    void min_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream);

    template <typename T>
    void max_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream);
}}}

namespace
{
    template <typename T>
    void min_caller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream)
    {
        CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
        dst.create(src1.size(), src1.type());
        mathfunc::min_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
    }

    template <typename T>
    void min_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)
    {
        dst.create(src1.size(), src1.type());
        mathfunc::min_gpu<T>(src1.reshape(1), src2, dst.reshape(1), stream);
    }

    template <typename T>
    void max_caller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream)
    {
        CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
        dst.create(src1.size(), src1.type());
        mathfunc::max_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
    }

    template <typename T>
    void max_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)
    {
        dst.create(src1.size(), src1.type());
        mathfunc::max_gpu<T>(src1.reshape(1), src2, dst.reshape(1), stream);
    }
}

void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
{
    typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] =
    {
        min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>, min_caller<int>,
        min_caller<float>, min_caller<double>
    };
    funcs[src1.depth()](src1, src2, dst, 0);
}

void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream)
{
    typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] =
    {
        min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>, min_caller<int>,
        min_caller<float>, min_caller<double>
    };
    funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
}

void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst)
{
    typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] =
    {
        min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>, min_caller<int>,
        min_caller<float>, min_caller<double>
    };
    funcs[src1.depth()](src1, src2, dst, 0);
}

void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream)
{
    typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] =
    {
        min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>, min_caller<int>,
        min_caller<float>, min_caller<double>
    };
    funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
}

void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
{
    typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] =
    {
        max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>, max_caller<int>,
        max_caller<float>, max_caller<double>
    };
    funcs[src1.depth()](src1, src2, dst, 0);
}

void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream)
{
    typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] =
    {
        max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>, max_caller<int>,
        max_caller<float>, max_caller<double>
    };
    funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
}

void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst)
{
    typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] =
    {
        max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>, max_caller<int>,
        max_caller<float>, max_caller<double>
    };
    funcs[src1.depth()](src1, src2, dst, 0);
}

void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream)
{
    typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] =
    {
        max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>, max_caller<int>,
        max_caller<float>, max_caller<double>
    };
    funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
}
#endif /* !defined (HAVE_CUDA) */
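All of the host-side reduction wrappers in arithm.cpp above share one dispatch idiom: a [2][7] table of function pointers indexed first by hasAtomicsSupport(getDevice()) (row 0 holds multipass kernels for devices without global atomics, row 1 single-pass kernels) and then by element depth, with 0 marking unsupported combinations. A stripped-down, self-contained sketch of that idiom; the names here are illustrative, not from the diff:

#include <cstdio>

typedef void (*Kernel)(void);

template <typename T> void single_pass() { std::printf("single-pass\n"); }
template <typename T> void multipass()   { std::printf("multipass\n"); }

// Row 0: device lacks atomics -> multipass kernels; row 1: single-pass kernels.
// Columns follow the CV_8U..CV_64F depth order; 0 marks an unsupported combination.
static const Kernel table[2][7] =
{
    { multipass<unsigned char>, multipass<char>, multipass<unsigned short>,
      multipass<short>, multipass<int>, multipass<float>, 0 },
    { single_pass<unsigned char>, single_pass<char>, single_pass<unsigned short>,
      single_pass<short>, single_pass<int>, single_pass<float>, single_pass<double> }
};

void dispatch(bool hasAtomics, int depth)
{
    Kernel k = table[hasAtomics][depth];
    if (!k) { std::fprintf(stderr, "unsupported type\n"); return; }
    k();
}

int main()
{
    dispatch(true, 6);    // depth 6 (double) with atomics -> single-pass
    dispatch(false, 6);   // depth 6 without atomics       -> unsupported
    return 0;
}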
modules/gpu/src/cuda/element_operations.cu
...
@@ -345,4 +345,127 @@ namespace cv { namespace gpu { namespace mathfunc
template void bitwiseMaskXorCaller<ushort>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
template void bitwiseMaskXorCaller<uint>(int, int, int, const PtrStep, const PtrStep, const PtrStep, PtrStep, cudaStream_t);
//////////////////////////////////////////////////////////////////////////
// min/max
struct MinOp
{
template <typename T>
__device__ T operator()(T a, T b)
{
return min(a, b);
}
__device__ float operator()(float a, float b)
{
return fmin(a, b);
}
__device__ double operator()(double a, double b)
{
return fmin(a, b);
}
};
struct MaxOp
{
template <typename T>
__device__ T operator()(T a, T b)
{
return max(a, b);
}
__device__ float operator()(float a, float b)
{
return fmax(a, b);
}
__device__ double operator()(double a, double b)
{
return fmax(a, b);
}
};
struct ScalarMinOp
{
double s;
explicit ScalarMinOp(double s_) : s(s_) {}
template <typename T>
__device__ T operator()(T a)
{
return saturate_cast<T>(fmin((double)a, s));
}
};
struct ScalarMaxOp
{
double s;
explicit ScalarMaxOp(double s_) : s(s_) {}
template <typename T>
__device__ T operator()(T a)
{
return saturate_cast<T>(fmax((double)a, s));
}
};
template <typename T>
void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)
{
MinOp op;
transform(src1, src2, dst, op, stream);
}
template void min_gpu<uchar >(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);
template void min_gpu<char >(const DevMem2D_<char>& src1, const DevMem2D_<char>& src2, const DevMem2D_<char>& dst, cudaStream_t stream);
template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, const DevMem2D_<ushort>& src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
template void min_gpu<short >(const DevMem2D_<short>& src1, const DevMem2D_<short>& src2, const DevMem2D_<short>& dst, cudaStream_t stream);
template void min_gpu<int >(const DevMem2D_<int>& src1, const DevMem2D_<int>& src2, const DevMem2D_<int>& dst, cudaStream_t stream);
template void min_gpu<float >(const DevMem2D_<float>& src1, const DevMem2D_<float>& src2, const DevMem2D_<float>& dst, cudaStream_t stream);
template void min_gpu<double>(const DevMem2D_<double>& src1, const DevMem2D_<double>& src2, const DevMem2D_<double>& dst, cudaStream_t stream);
template <typename T>
void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)
{
MaxOp op;
transform(src1, src2, dst, op, stream);
}
template void max_gpu<uchar >(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);
template void max_gpu<char >(const DevMem2D_<char>& src1, const DevMem2D_<char>& src2, const DevMem2D_<char>& dst, cudaStream_t stream);
template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, const DevMem2D_<ushort>& src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
template void max_gpu<short >(const DevMem2D_<short>& src1, const DevMem2D_<short>& src2, const DevMem2D_<short>& dst, cudaStream_t stream);
template void max_gpu<int >(const DevMem2D_<int>& src1, const DevMem2D_<int>& src2, const DevMem2D_<int>& dst, cudaStream_t stream);
template void max_gpu<float >(const DevMem2D_<float>& src1, const DevMem2D_<float>& src2, const DevMem2D_<float>& dst, cudaStream_t stream);
template void max_gpu<double>(const DevMem2D_<double>& src1, const DevMem2D_<double>& src2, const DevMem2D_<double>& dst, cudaStream_t stream);
template <typename T>
void min_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream)
{
ScalarMinOp op(src2);
transform(src1, dst, op, stream);
}
template void min_gpu<uchar >(const DevMem2D& src1, double src2, const DevMem2D& dst, cudaStream_t stream);
template void min_gpu<char >(const DevMem2D_<char>& src1, double src2, const DevMem2D_<char>& dst, cudaStream_t stream);
template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, double src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
template void min_gpu<short >(const DevMem2D_<short>& src1, double src2, const DevMem2D_<short>& dst, cudaStream_t stream);
template void min_gpu<int >(const DevMem2D_<int>& src1, double src2, const DevMem2D_<int>& dst, cudaStream_t stream);
template void min_gpu<float >(const DevMem2D_<float>& src1, double src2, const DevMem2D_<float>& dst, cudaStream_t stream);
template void min_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);
template <typename T>
void max_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream)
{
ScalarMaxOp op(src2);
transform(src1, dst, op, stream);
}
template void max_gpu<uchar >(const DevMem2D& src1, double src2, const DevMem2D& dst, cudaStream_t stream);
template void max_gpu<char >(const DevMem2D_<char>& src1, double src2, const DevMem2D_<char>& dst, cudaStream_t stream);
template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, double src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
template void max_gpu<short >(const DevMem2D_<short>& src1, double src2, const DevMem2D_<short>& dst, cudaStream_t stream);
template void max_gpu<int >(const DevMem2D_<int>& src1, double src2, const DevMem2D_<int>& dst, cudaStream_t stream);
template void max_gpu<float >(const DevMem2D_<float>& src1, double src2, const DevMem2D_<float>& dst, cudaStream_t stream);
template void max_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);
}}}
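The min_gpu/max_gpu instantiations above funnel every element type through a single transform utility parameterized by a functor, so one kernel template serves all seven depths. A minimal self-contained CUDA sketch of that shape; the hand-rolled transformKernel below stands in for the cv::gpu::device transform used in the real code:

#include <cstdio>

struct MinOp
{
    template <typename T>
    __device__ T operator()(T a, T b) const { return a < b ? a : b; }
};

// One kernel template, parameterized by element type and functor.
template <typename T, typename Op>
__global__ void transformKernel(const T* a, const T* b, T* dst, int n, Op op)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        dst[i] = op(a[i], b[i]);
}

int main()
{
    const int n = 4;
    int ha[n] = {3, 1, 4, 1}, hb[n] = {2, 7, 1, 8}, hd[n];
    int *da, *db, *dd;
    cudaMalloc(&da, n * sizeof(int));
    cudaMalloc(&db, n * sizeof(int));
    cudaMalloc(&dd, n * sizeof(int));
    cudaMemcpy(da, ha, n * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(db, hb, n * sizeof(int), cudaMemcpyHostToDevice);

    transformKernel<<<1, 64>>>(da, db, dd, n, MinOp());

    cudaMemcpy(hd, dd, n * sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < n; ++i)
        std::printf("%d ", hd[i]);   // prints: 2 1 1 1
    std::printf("\n");

    cudaFree(da); cudaFree(db); cudaFree(dd);
    return 0;
}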
modules/gpu/src/cuda/mathfunc.cu
...
@@ -58,49 +58,6 @@ using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace mathfunc
{
template <int size, typename T>
__device__ void sum_in_smem(volatile T* data, const uint tid)
{
T sum = data[tid];
if (size >= 512) { if (tid < 256) { data[tid] = sum = sum + data[tid + 256]; } __syncthreads(); }
if (size >= 256) { if (tid < 128) { data[tid] = sum = sum + data[tid + 128]; } __syncthreads(); }
if (size >= 128) { if (tid < 64) { data[tid] = sum = sum + data[tid + 64]; } __syncthreads(); }
if (tid < 32)
{
if (size >= 64) data[tid] = sum = sum + data[tid + 32];
if (size >= 32) data[tid] = sum = sum + data[tid + 16];
if (size >= 16) data[tid] = sum = sum + data[tid + 8];
if (size >= 8) data[tid] = sum = sum + data[tid + 4];
if (size >= 4) data[tid] = sum = sum + data[tid + 2];
if (size >= 2) data[tid] = sum = sum + data[tid + 1];
}
}
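sum_in_smem above is the classic shared-memory tree reduction: each round halves the number of active threads, and the steps at offset 32 and below skip __syncthreads() because only the first warp is still active, relying on the warp-synchronous execution of the hardware this code targeted. A worked trace for a block of 8 values:

// Tree reduction of 8 values (offsets 4, 2, 1):
// [5 2 7 1 | 3 8 6 4]   initial contents of data[]
// [8 10 13 5]           after data[tid] += data[tid + 4]
// [21 15]               after data[tid] += data[tid + 2]
// [36]                  after data[tid] += data[tid + 1]; data[0] holds the sum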
struct Mask8U
{
explicit Mask8U(PtrStep mask): mask(mask) {}
__device__ bool operator()(int y, int x) const
{
return mask.ptr(y)[x];
}
PtrStep mask;
};
struct MaskTrue
{
__device__ bool operator()(int y, int x) const
{
return true;
}
};
struct Nothing
{
static __device__ void calc(int, int, float, float, float*, size_t, float)
...
@@ -259,1676 +216,42 @@ namespace cv { namespace gpu { namespace mathfunc
}
//////////////////////////////////////////////////////////////////////////////
// Min max
// To avoid shared bank conflicts we convert each value into value of
// appropriate type (32 bits minimum)
template <typename T> struct MinMaxTypeTraits {};
template <> struct MinMaxTypeTraits<uchar> { typedef int best_type; };
template <> struct MinMaxTypeTraits<char> { typedef int best_type; };
template <> struct MinMaxTypeTraits<ushort> { typedef int best_type; };
template <> struct MinMaxTypeTraits<short> { typedef int best_type; };
template <> struct MinMaxTypeTraits<int> { typedef int best_type; };
template <> struct MinMaxTypeTraits<float> { typedef float best_type; };
template <> struct MinMaxTypeTraits<double> { typedef double best_type; };
namespace minmax
{
__constant__ int ctwidth;
__constant__ int ctheight;
// Global counter of blocks that have finished their work
__device__ uint blocks_finished = 0;
// Estimates good thread configuration
// - the threads variable satisfies threads.x * threads.y == 256
void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
{
threads = dim3(32, 8);
grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
grid.x = min(grid.x, threads.x);
grid.y = min(grid.y, threads.y);
}
// Returns required buffer sizes
void get_buf_size_required(int cols, int rows, int elem_size, int& bufcols, int& bufrows)
{
dim3 threads, grid;
estimate_thread_cfg(cols, rows, threads, grid);
bufcols = grid.x * grid.y * elem_size;
bufrows = 2;
}
// Computes the device constants used in the kernels from the specified thread configuration
void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(ctwidth)));
cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(ctheight)));
}
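To make the sizing above concrete, here is the arithmetic for a 1920x1080 single-channel 8-bit image (a worked example, not from the source):

// estimate_thread_cfg: threads = (32, 8), i.e. 256 threads per block
//   grid.x = divUp(1920, 32 * 8) = 8   (capped at min(8, 32) = 8)
//   grid.y = divUp(1080, 8 * 32) = 5   (capped at min(5, 8)  = 5)  -> 40 blocks
// get_buf_size_required: bufcols = 8 * 5 * 1 = 40 bytes, bufrows = 2
//   (row 0 holds the per-block minima, row 1 the per-block maxima)
// set_kernel_consts: ctwidth  = divUp(divUp(1920, 8), 32) = 8
//                    ctheight = divUp(divUp(1080, 5), 8)  = 27
//   so each thread strides over up to an 8 x 27 patch of candidate elements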
// Does min and max in shared memory
template <typename T>
__device__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval)
{
minval[tid] = min(minval[tid], minval[tid + offset]);
maxval[tid] = max(maxval[tid], maxval[tid + offset]);
}
template <int size, typename T>
__device__ void find_min_max_in_smem(volatile T* minval, volatile T* maxval, const uint tid)
{
if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval); } __syncthreads(); }
if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval); } __syncthreads(); }
if (size >= 128) { if (tid < 64) { merge(tid, 64, minval, maxval); } __syncthreads(); }
if (tid < 32)
{
if (size >= 64) merge(tid, 32, minval, maxval);
if (size >= 32) merge(tid, 16, minval, maxval);
if (size >= 16) merge(tid, 8, minval, maxval);
if (size >= 8) merge(tid, 4, minval, maxval);
if (size >= 4) merge(tid, 2, minval, maxval);
if (size >= 2) merge(tid, 1, minval, maxval);
}
}
template <int nthreads, typename T, typename Mask>
__global__ void min_max_kernel(const DevMem2D src, Mask mask, T* minval, T* maxval)
{
typedef typename MinMaxTypeTraits<T>::best_type best_type;
__shared__ best_type sminval[nthreads];
__shared__ best_type smaxval[nthreads];
uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
T mymin = numeric_limits_gpu<T>::max();
T mymax = numeric_limits_gpu<T>::is_signed ? -numeric_limits_gpu<T>::max() : numeric_limits_gpu<T>::min();
uint y_end = min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);
uint x_end = min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols);
for (uint y = y0; y < y_end; y += blockDim.y)
{
const T* src_row = (const T*)src.ptr(y);
for (uint x = x0; x < x_end; x += blockDim.x)
{
T val = src_row[x];
if (mask(y, x))
{
mymin = min(mymin, val);
mymax = max(mymax, val);
}
}
}
sminval[tid] = mymin;
smaxval[tid] = mymax;
__syncthreads();
find_min_max_in_smem<nthreads, best_type>(sminval, smaxval, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = ticket == gridDim.x * gridDim.y - 1;
}
__syncthreads();
if (is_last)
{
uint idx = min(tid, gridDim.x * gridDim.y - 1);
sminval[tid] = minval[idx];
smaxval[tid] = maxval[idx];
__syncthreads();
find_min_max_in_smem<nthreads, best_type>(sminval, smaxval, tid);
if (tid == 0)
{
minval[0] = (T)sminval[0];
maxval[0] = (T)smaxval[0];
blocks_finished = 0;
}
}
#else
if (tid == 0)
{
minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
}
#endif
}
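The __CUDA_ARCH__ >= 110 branch above is a single-pass finalization: every block publishes its partial result, draws a ticket from a global counter, and the block holding the last ticket re-reduces all partials, saving the second kernel launch the multipass path needs. A self-contained sketch of just that protocol; the serial loops keep it short and are not how the real kernels reduce:

#include <cstdio>

__device__ unsigned int blocksDone = 0;

// Each block writes one partial, then the last block to finish combines them.
__global__ void blockSumThenFinalize(const float* data, float* partials, int perBlock)
{
    if (threadIdx.x == 0)
    {
        float s = 0.f;
        for (int i = 0; i < perBlock; ++i) s += data[blockIdx.x * perBlock + i];
        partials[blockIdx.x] = s;
    }

    __shared__ bool isLast;
    if (threadIdx.x == 0)
    {
        __threadfence();                                  // publish this block's partial
        unsigned int ticket = atomicInc(&blocksDone, gridDim.x);
        isLast = (ticket == gridDim.x - 1);               // exactly one block sees true
    }
    __syncthreads();

    if (isLast && threadIdx.x == 0)
    {
        float total = 0.f;
        for (unsigned int i = 0; i < gridDim.x; ++i) total += partials[i];
        partials[0] = total;
        blocksDone = 0;                                   // reset for the next launch
    }
}

int main()
{
    const int blocks = 4, perBlock = 8, n = blocks * perBlock;
    float h[n], *d, *p;
    for (int i = 0; i < n; ++i) h[i] = 1.f;               // total should be 32
    cudaMalloc(&d, n * sizeof(float));
    cudaMalloc(&p, blocks * sizeof(float));
    cudaMemcpy(d, h, n * sizeof(float), cudaMemcpyHostToDevice);
    blockSumThenFinalize<<<blocks, 32>>>(d, p, perBlock);
    float total;
    cudaMemcpy(&total, p, sizeof(float), cudaMemcpyDeviceToHost);
    std::printf("%f\n", total);                           // 32.000000
    cudaFree(d); cudaFree(p);
    return 0;
}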
template <typename T>
void min_max_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1);
min_max_kernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
}
template void min_max_mask_caller<uchar>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<char>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<ushort>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<short>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<int>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<float>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<double>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template <typename T>
void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1);
min_max_kernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
}
template void min_max_caller<uchar>(const DevMem2D, double*, double*, PtrStep);
template void min_max_caller<char>(const DevMem2D, double*, double*, PtrStep);
template void min_max_caller<ushort>(const DevMem2D, double*, double*, PtrStep);
template void min_max_caller<short>(const DevMem2D, double*, double*, PtrStep);
template void min_max_caller<int>(const DevMem2D, double*, double*, PtrStep);
template void min_max_caller<float>(const DevMem2D, double*, double*, PtrStep);
template void min_max_caller<double>(const DevMem2D, double*, double*, PtrStep);
template <int nthreads, typename T>
__global__ void min_max_pass2_kernel(T* minval, T* maxval, int size)
{
typedef typename MinMaxTypeTraits<T>::best_type best_type;
__shared__ best_type sminval[nthreads];
__shared__ best_type smaxval[nthreads];
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
uint idx = min(tid, gridDim.x * gridDim.y - 1);
sminval[tid] = minval[idx];
smaxval[tid] = maxval[idx];
__syncthreads();
find_min_max_in_smem<nthreads, best_type>(sminval, smaxval, tid);
if (tid == 0)
{
minval[0] = (T)sminval[0];
maxval[0] = (T)smaxval[0];
}
}
template <typename T>
void min_max_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1);
min_max_kernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);
min_max_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
}
template void min_max_mask_multipass_caller<uchar>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_multipass_caller<char>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_multipass_caller<ushort>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_multipass_caller<short>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_multipass_caller<int>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_multipass_caller<float>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template <typename T>
void min_max_multipass_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1);
min_max_kernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);
min_max_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
}
template void min_max_multipass_caller<uchar>(const DevMem2D, double*, double*, PtrStep);
template void min_max_multipass_caller<char>(const DevMem2D, double*, double*, PtrStep);
template void min_max_multipass_caller<ushort>(const DevMem2D, double*, double*, PtrStep);
template void min_max_multipass_caller<short>(const DevMem2D, double*, double*, PtrStep);
template void min_max_multipass_caller<int>(const DevMem2D, double*, double*, PtrStep);
template void min_max_multipass_caller<float>(const DevMem2D, double*, double*, PtrStep);
} // namespace minmax
///////////////////////////////////////////////////////////////////////////////
// minMaxLoc
namespace minmaxloc {
__constant__ int ctwidth;
__constant__ int ctheight;
// Global counter of blocks that have finished their work
__device__ uint blocks_finished = 0;
// Estimates good thread configuration
// - the threads variable satisfies threads.x * threads.y == 256
void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
{
threads = dim3(32, 8);
grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
grid.x = min(grid.x, threads.x);
grid.y = min(grid.y, threads.y);
}
// Returns required buffer sizes
void get_buf_size_required(int cols, int rows, int elem_size, int& b1cols,
int& b1rows, int& b2cols, int& b2rows)
{
dim3 threads, grid;
estimate_thread_cfg(cols, rows, threads, grid);
b1cols = grid.x * grid.y * elem_size; // For values
b1rows = 2;
b2cols = grid.x * grid.y * sizeof(int); // For locations
b2rows = 2;
}
// Computes the device constants used in the kernels from the specified thread configuration
void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(ctwidth)));
cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(ctheight)));
}
template <typename T>
__device__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval,
volatile uint* minloc, volatile uint* maxloc)
{
T val = minval[tid + offset];
if (val < minval[tid])
{
minval[tid] = val;
minloc[tid] = minloc[tid + offset];
}
val = maxval[tid + offset];
if (val > maxval[tid])
{
maxval[tid] = val;
maxloc[tid] = maxloc[tid + offset];
}
}
template <int size, typename T>
__device__ void find_min_max_loc_in_smem(volatile T* minval, volatile T* maxval, volatile uint* minloc,
volatile uint* maxloc, const uint tid)
{
if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval, minloc, maxloc); } __syncthreads(); }
if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval, minloc, maxloc); } __syncthreads(); }
if (size >= 128) { if (tid < 64) { merge(tid, 64, minval, maxval, minloc, maxloc); } __syncthreads(); }
if (tid < 32)
{
if (size >= 64) merge(tid, 32, minval, maxval, minloc, maxloc);
if (size >= 32) merge(tid, 16, minval, maxval, minloc, maxloc);
if (size >= 16) merge(tid, 8, minval, maxval, minloc, maxloc);
if (size >= 8) merge(tid, 4, minval, maxval, minloc, maxloc);
if (size >= 4) merge(tid, 2, minval, maxval, minloc, maxloc);
if (size >= 2) merge(tid, 1, minval, maxval, minloc, maxloc);
}
}
template <int nthreads, typename T, typename Mask>
__global__ void min_max_loc_kernel(const DevMem2D src, Mask mask, T* minval, T* maxval,
uint* minloc, uint* maxloc)
{
typedef typename MinMaxTypeTraits<T>::best_type best_type;
__shared__ best_type sminval[nthreads];
__shared__ best_type smaxval[nthreads];
__shared__ uint sminloc[nthreads];
__shared__ uint smaxloc[nthreads];
uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
T mymin = numeric_limits_gpu<T>::max();
T mymax = numeric_limits_gpu<T>::is_signed ? -numeric_limits_gpu<T>::max() : numeric_limits_gpu<T>::min();
uint myminloc = 0;
uint mymaxloc = 0;
uint y_end = min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);
uint x_end = min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols);
for (uint y = y0; y < y_end; y += blockDim.y)
{
    const T* ptr = (const T*)src.ptr(y);
    for (uint x = x0; x < x_end; x += blockDim.x)
    {
        if (mask(y, x))
        {
            T val = ptr[x];
            if (val <= mymin) { mymin = val; myminloc = y * src.cols + x; }
            if (val >= mymax) { mymax = val; mymaxloc = y * src.cols + x; }
        }
    }
}

sminval[tid] = mymin;
smaxval[tid] = mymax;
sminloc[tid] = myminloc;
smaxloc[tid] = mymaxloc;
__syncthreads();

find_min_max_loc_in_smem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;

if (tid == 0)
{
    minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
    maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
    minloc[blockIdx.y * gridDim.x + blockIdx.x] = sminloc[0];
    maxloc[blockIdx.y * gridDim.x + blockIdx.x] = smaxloc[0];
    __threadfence();

    uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
    is_last = ticket == gridDim.x * gridDim.y - 1;
}

__syncthreads();

if (is_last)
{
    uint idx = min(tid, gridDim.x * gridDim.y - 1);

    sminval[tid] = minval[idx];
    smaxval[tid] = maxval[idx];
    sminloc[tid] = minloc[idx];
    smaxloc[tid] = maxloc[idx];
    __syncthreads();

    find_min_max_loc_in_smem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);

    if (tid == 0)
    {
        minval[0] = (T)sminval[0];
        maxval[0] = (T)smaxval[0];
        minloc[0] = sminloc[0];
        maxloc[0] = smaxloc[0];
        blocks_finished = 0;
    }
}
#else
if (tid == 0)
{
    minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
    maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
    minloc[blockIdx.y * gridDim.x + blockIdx.x] = sminloc[0];
    maxloc[blockIdx.y * gridDim.x + blockIdx.x] = smaxloc[0];
}
#endif
}

//////////////////////////////////////////////////////////////////////////////
// transpose

__global__ void transpose(const DevMem2Di src, PtrStepi dst)
{
    __shared__ int s_mem[16 * 17];

    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int smem_idx = threadIdx.y * blockDim.x + threadIdx.x + threadIdx.y;

    if (y < src.rows && x < src.cols)
    {
        s_mem[smem_idx] = src.ptr(y)[x];
    }
    __syncthreads();
template <typename T>
void min_max_loc_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
uint* minloc_buf = (uint*)locbuf.ptr(0);
uint* maxloc_buf = (uint*)locbuf.ptr(1);
min_max_loc_kernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
uint minloc_, maxloc_;
cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
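// Editorial note: the kernels record each extremum location as a flattened
// row-major index (idx = y * src.cols + x); the host code above decodes it as
//   y = idx / cols;  x = idx - y * cols;
// storing x in minloc[0]/maxloc[0] and y in minloc[1]/maxloc[1].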
template void min_max_loc_mask_caller<uchar>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<char>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<ushort>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<short>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<int>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<float>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<double>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template <typename T>
void min_max_loc_caller(const DevMem2D src, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
uint* minloc_buf = (uint*)locbuf.ptr(0);
uint* maxloc_buf = (uint*)locbuf.ptr(1);
min_max_loc_kernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
uint minloc_, maxloc_;
cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
template void min_max_loc_caller<uchar>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<char>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<ushort>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<short>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<int>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<float>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<double>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
// This kernel is used only when the compute capability is 1.0 (no global memory atomics)
template <int nthreads, typename T>
__global__ void min_max_loc_pass2_kernel(T* minval, T* maxval, uint* minloc, uint* maxloc, int size)
{
typedef typename MinMaxTypeTraits<T>::best_type best_type;
__shared__ best_type sminval[nthreads];
__shared__ best_type smaxval[nthreads];
__shared__ uint sminloc[nthreads];
__shared__ uint smaxloc[nthreads];
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
uint idx = min(tid, (uint)size - 1);
sminval[tid] = minval[idx];
smaxval[tid] = maxval[idx];
sminloc[tid] = minloc[idx];
smaxloc[tid] = maxloc[idx];
__syncthreads();
find_min_max_loc_in_smem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
if (tid == 0)
{
minval[0] = (T)sminval[0];
maxval[0] = (T)smaxval[0];
minloc[0] = sminloc[0];
maxloc[0] = smaxloc[0];
}
}
template <typename T>
void min_max_loc_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
uint* minloc_buf = (uint*)locbuf.ptr(0);
uint* maxloc_buf = (uint*)locbuf.ptr(1);
min_max_loc_kernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
min_max_loc_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
uint minloc_, maxloc_;
cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
template void min_max_loc_mask_multipass_caller<uchar>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_multipass_caller<char>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_multipass_caller<ushort>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_multipass_caller<short>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_multipass_caller<int>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_multipass_caller<float>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template <typename T>
void min_max_loc_multipass_caller(const DevMem2D src, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
uint* minloc_buf = (uint*)locbuf.ptr(0);
uint* maxloc_buf = (uint*)locbuf.ptr(1);
min_max_loc_kernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
min_max_loc_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
uint minloc_, maxloc_;
cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
template void min_max_loc_multipass_caller<uchar>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_multipass_caller<char>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_multipass_caller<ushort>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_multipass_caller<short>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_multipass_caller<int>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_multipass_caller<float>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
} // namespace minmaxloc
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// countNonZero
namespace countnonzero
{
__constant__ int ctwidth;
__constant__ int ctheight;
__device__ uint blocks_finished = 0;
void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
{
threads = dim3(32, 8);
grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
grid.x = min(grid.x, threads.x);
grid.y = min(grid.y, threads.y);
}
void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows)
{
dim3 threads, grid;
estimate_thread_cfg(cols, rows, threads, grid);
bufcols = grid.x * grid.y * sizeof(int);
bufrows = 1;
}
void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth)));
cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight)));
}
template <int nthreads, typename T>
__global__ void count_non_zero_kernel(const DevMem2D src, volatile uint* count)
{
__shared__ uint scount[nthreads];
uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
uint cnt = 0;
for (uint y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
{
const T* ptr = (const T*)src.ptr(y0 + y * blockDim.y);
for (uint x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
cnt += ptr[x0 + x * blockDim.x] != 0;
}
scount[tid] = cnt;
__syncthreads();
sum_in_smem<nthreads, uint>(scount, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0];
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = ticket == gridDim.x * gridDim.y - 1;
}
__syncthreads();
if (is_last)
{
scount[tid] = tid < gridDim.x * gridDim.y ? count[tid] : 0;
__syncthreads();
sum_in_smem<nthreads, uint>(scount, tid);
if (tid == 0)
{
count[0] = scount[0];
blocks_finished = 0;
}
}
#else
if (tid == 0) count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0];
#endif
}
template <typename T>
int count_non_zero_caller(const DevMem2D src, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
uint* count_buf = (uint*)buf.ptr(0);
count_non_zero_kernel<256, T><<<grid, threads>>>(src, count_buf);
cudaSafeCall(cudaThreadSynchronize());
uint count;
cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));
return count;
}
template int count_non_zero_caller<uchar>(const DevMem2D, PtrStep);
template int count_non_zero_caller<char>(const DevMem2D, PtrStep);
template int count_non_zero_caller<ushort>(const DevMem2D, PtrStep);
template int count_non_zero_caller<short>(const DevMem2D, PtrStep);
template int count_non_zero_caller<int>(const DevMem2D, PtrStep);
template int count_non_zero_caller<float>(const DevMem2D, PtrStep);
template int count_non_zero_caller<double>(const DevMem2D, PtrStep);
template <int nthreads, typename T>
__global__ void count_non_zero_pass2_kernel(uint* count, int size)
{
__shared__ uint scount[nthreads];
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
scount[tid] = tid < size ? count[tid] : 0;
__syncthreads();
sum_in_smem<nthreads, uint>(scount, tid);
if (tid == 0)
count[0] = scount[0];
}
template <typename T>
int count_non_zero_multipass_caller(const DevMem2D src, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
uint* count_buf = (uint*)buf.ptr(0);
count_non_zero_kernel<256, T><<<grid, threads>>>(src, count_buf);
count_non_zero_pass2_kernel<256, T><<<1, 256>>>(count_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
uint count;
cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));
return count;
}
template int count_non_zero_multipass_caller<uchar>(const DevMem2D, PtrStep);
template int count_non_zero_multipass_caller<char>(const DevMem2D, PtrStep);
template int count_non_zero_multipass_caller<ushort>(const DevMem2D, PtrStep);
template int count_non_zero_multipass_caller<short>(const DevMem2D, PtrStep);
template int count_non_zero_multipass_caller<int>(const DevMem2D, PtrStep);
template int count_non_zero_multipass_caller<float>(const DevMem2D, PtrStep);
} // namespace countnonzero
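// Editorial usage sketch for this namespace (buffer handling here is
// hypothetical; the real caller sizes its buffer from get_buf_size_required):
//   int bufcols, bufrows;
//   countnonzero::get_buf_size_required(src.cols, src.rows, bufcols, bufrows);
//   GpuMat buf(bufrows, bufcols, CV_8U);
//   int n = countnonzero::count_non_zero_caller<uchar>(src, buf);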
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// transpose
__global__ void transpose(const DevMem2Di src, PtrStepi dst)
{
__shared__ int s_mem[16 * 17];
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
int smem_idx = threadIdx.y * blockDim.x + threadIdx.x + threadIdx.y;
if (y < src.rows && x < src.cols)
{
s_mem[smem_idx] = src.ptr(y)[x];
}
__syncthreads();
smem_idx = threadIdx.x * blockDim.x + threadIdx.y + threadIdx.x;
x = blockIdx.y * blockDim.x + threadIdx.x;
y = blockIdx.x * blockDim.y + threadIdx.y;
if (y < src.cols && x < src.rows)
{
dst.ptr(y)[x] = s_mem[smem_idx];
}
}
void transpose_gpu(const DevMem2Di& src, const DevMem2Di& dst)
{
dim3 threads(16, 16, 1);
dim3 grid(divUp(src.cols, 16), divUp(src.rows, 16), 1);
transpose<<<grid, threads>>>(src, dst);
cudaSafeCall( cudaThreadSynchronize() );
}
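// Editorial note: s_mem is 16 x 17 rather than 16 x 16. The extra column
// (the "+ threadIdx.y" / "+ threadIdx.x" terms in smem_idx) skews each tile
// row by one word, so the column-wise reads after the barrier fall into
// different shared-memory banks instead of serializing on one.
// A minimal host-side sketch (matrix names are hypothetical):
//   GpuMat src(rows, cols, CV_32SC1), dst(cols, rows, CV_32SC1);
//   transpose_gpu(src, dst);   // dst(j, i) == src(i, j)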
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// min/max
struct MinOp
{
template <typename T>
__device__ T operator()(T a, T b)
{
return min(a, b);
}
__device__ float operator()(float a, float b)
{
return fmin(a, b);
}
__device__ double operator()(double a, double b)
{
return fmin(a, b);
}
};
struct MaxOp
{
template <typename T>
__device__ T operator()(T a, T b)
{
return max(a, b);
}
__device__ float operator()(float a, float b)
{
return fmax(a, b);
}
__device__ double operator()(double a, double b)
{
return fmax(a, b);
}
};
struct ScalarMinOp
{
double s;
explicit ScalarMinOp(double s_) : s(s_) {}
template <typename T>
__device__ T operator()(T a)
{
return saturate_cast<T>(fmin((double)a, s));
}
};
struct ScalarMaxOp
{
double s;
explicit ScalarMaxOp(double s_) : s(s_) {}
template <typename T>
__device__ T operator()(T a)
{
return saturate_cast<T>(fmax((double)a, s));
}
};
template <typename T>
void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)
{
MinOp op;
transform(src1, src2, dst, op, stream);
}
template void min_gpu<uchar >(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);
template void min_gpu<char >(const DevMem2D_<char>& src1, const DevMem2D_<char>& src2, const DevMem2D_<char>& dst, cudaStream_t stream);
template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, const DevMem2D_<ushort>& src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
template void min_gpu<short >(const DevMem2D_<short>& src1, const DevMem2D_<short>& src2, const DevMem2D_<short>& dst, cudaStream_t stream);
template void min_gpu<int >(const DevMem2D_<int>& src1, const DevMem2D_<int>& src2, const DevMem2D_<int>& dst, cudaStream_t stream);
template void min_gpu<float >(const DevMem2D_<float>& src1, const DevMem2D_<float>& src2, const DevMem2D_<float>& dst, cudaStream_t stream);
template void min_gpu<double>(const DevMem2D_<double>& src1, const DevMem2D_<double>& src2, const DevMem2D_<double>& dst, cudaStream_t stream);
template <typename T>
void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream)
{
MaxOp op;
transform(src1, src2, dst, op, stream);
}
template void max_gpu<uchar >(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);
template void max_gpu<char >(const DevMem2D_<char>& src1, const DevMem2D_<char>& src2, const DevMem2D_<char>& dst, cudaStream_t stream);
template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, const DevMem2D_<ushort>& src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
template void max_gpu<short >(const DevMem2D_<short>& src1, const DevMem2D_<short>& src2, const DevMem2D_<short>& dst, cudaStream_t stream);
template void max_gpu<int >(const DevMem2D_<int>& src1, const DevMem2D_<int>& src2, const DevMem2D_<int>& dst, cudaStream_t stream);
template void max_gpu<float >(const DevMem2D_<float>& src1, const DevMem2D_<float>& src2, const DevMem2D_<float>& dst, cudaStream_t stream);
template void max_gpu<double>(const DevMem2D_<double>& src1, const DevMem2D_<double>& src2, const DevMem2D_<double>& dst, cudaStream_t stream);
template <typename T>
void min_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream)
{
ScalarMinOp op(src2);
transform(src1, dst, op, stream);
}
template void min_gpu<uchar >(const DevMem2D& src1, double src2, const DevMem2D& dst, cudaStream_t stream);
template void min_gpu<char >(const DevMem2D_<char>& src1, double src2, const DevMem2D_<char>& dst, cudaStream_t stream);
template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, double src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
template void min_gpu<short >(const DevMem2D_<short>& src1, double src2, const DevMem2D_<short>& dst, cudaStream_t stream);
template void min_gpu<int >(const DevMem2D_<int>& src1, double src2, const DevMem2D_<int>& dst, cudaStream_t stream);
template void min_gpu<float >(const DevMem2D_<float>& src1, double src2, const DevMem2D_<float>& dst, cudaStream_t stream);
template void min_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);
template <typename T>
void max_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream)
{
ScalarMaxOp op(src2);
transform(src1, dst, op, stream);
}
template void max_gpu<uchar >(const DevMem2D& src1, double src2, const DevMem2D& dst, cudaStream_t stream);
template void max_gpu<char >(const DevMem2D_<char>& src1, double src2, const DevMem2D_<char>& dst, cudaStream_t stream);
template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, double src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
template void max_gpu<short >(const DevMem2D_<short>& src1, double src2, const DevMem2D_<short>& dst, cudaStream_t stream);
template void max_gpu<int >(const DevMem2D_<int>& src1, double src2, const DevMem2D_<int>& dst, cudaStream_t stream);
template void max_gpu<float >(const DevMem2D_<float>& src1, double src2, const DevMem2D_<float>& dst, cudaStream_t stream);
template void max_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);
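// Editorial usage sketch (mats are hypothetical; stream 0 is the default
// stream):
//   min_gpu<float>(a, b, dst, 0);     // dst(y,x) = fmin(a(y,x), b(y,x))
//   max_gpu<uchar>(a, 128.0, dst, 0); // dst(y,x) = saturate_cast<uchar>(fmax((double)a(y,x), 128.0))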
//////////////////////////////////////////////////////////////////////////////
// Sum
namespace sum
{
template <typename T> struct SumType {};
template <> struct SumType<uchar> { typedef uint R; };
template <> struct SumType<char> { typedef int R; };
template <> struct SumType<ushort> { typedef uint R; };
template <> struct SumType<short> { typedef int R; };
template <> struct SumType<int> { typedef int R; };
template <> struct SumType<float> { typedef float R; };
template <> struct SumType<double> { typedef double R; };
template <typename R>
struct IdentityOp { static __device__ R call(R x) { return x; } };
template <typename R>
struct SqrOp { static __device__ R call(R x) { return x * x; } };
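// Editorial note: R is the per-block accumulator type -- a wider type for
// the small integer sources (uchar/ushort -> uint, char/short -> int) so a
// block's partial sum does not overflow the element type, and the source
// type itself for int/float/double. Op is applied per element before
// accumulation: IdentityOp yields the plain sums, SqrOp the squared sums
// computed by the sqsum_* callers below.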
__constant__ int ctwidth;
__constant__ int ctheight;
__device__ uint blocks_finished = 0;
const int threads_x = 32;
const int threads_y = 8;
void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
{
threads = dim3(threads_x, threads_y);
grid = dim3(divUp(cols, threads.x * threads.y),
divUp(rows, threads.y * threads.x));
grid.x = min(grid.x, threads.x);
grid.y = min(grid.y, threads.y);
}
void get_buf_size_required(int cols, int rows, int cn, int& bufcols, int& bufrows)
{
dim3 threads, grid;
estimate_thread_cfg(cols, rows, threads, grid);
bufcols = grid.x * grid.y * sizeof(double) * cn;
bufrows = 1;
}
void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth)));
cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight)));
}
template <typename T, typename R, typename Op, int nthreads>
__global__ void sum_kernel(const DevMem2D src, R* result)
{
__shared__ R smem[nthreads];
const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
R sum = 0;
for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
{
const T* ptr = (const T*)src.ptr(y0 + y * blockDim.y);
for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
sum += Op::call(ptr[x0 + x * blockDim.x]);
}
smem[tid] = sum;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
result[bid] = smem[0];
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
smem[tid] = tid < gridDim.x * gridDim.y ? result[tid] : 0;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
if (tid == 0)
{
result[0] = smem[0];
blocks_finished = 0;
}
}
#else
if (tid == 0) result[bid] = smem[0];
#endif
}
template <typename T, typename R, int nthreads>
__global__ void sum_pass2_kernel(R* result, int size)
{
__shared__ R smem[nthreads];
int tid = threadIdx.y * blockDim.x + threadIdx.x;
smem[tid] = tid < size ? result[tid] : 0;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
if (tid == 0)
result[0] = smem[0];
}
template <typename T, typename R, typename Op, int nthreads>
__global__ void sum_kernel_C2(const DevMem2D src, typename TypeVec<R, 2>::vec_t* result)
{
typedef typename TypeVec<T, 2>::vec_t SrcType;
typedef typename TypeVec<R, 2>::vec_t DstType;
__shared__ R smem[nthreads * 2];
const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
SrcType val;
DstType sum = VecTraits<DstType>::all(0);
for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
{
const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y);
for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
{
val = ptr[x0 + x * blockDim.x];
sum = sum + VecTraits<DstType>::make(Op::call(val.x), Op::call(val.y));
}
}
smem[tid] = sum.x;
smem[tid + nthreads] = sum.y;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
DstType res;
res.x = smem[0];
res.y = smem[nthreads];
result[bid] = res;
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<DstType>::all(0);
smem[tid] = res.x;
smem[tid + nthreads] = res.y;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
result[0] = res;
blocks_finished = 0;
}
}
#else
if (tid == 0)
{
DstType res;
res.x = smem[0];
res.y = smem[nthreads];
result[bid] = res;
}
#endif
}
template <typename T, typename R, int nthreads>
__global__ void sum_pass2_kernel_C2(typename TypeVec<R, 2>::vec_t* result, int size)
{
typedef typename TypeVec<R, 2>::vec_t DstType;
__shared__ R smem[nthreads * 2];
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
DstType res = tid < size ? result[tid] : VecTraits<DstType>::all(0);
smem[tid] = res.x;
smem[tid + nthreads] = res.y;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
result[0] = res;
}
}
template <typename T, typename R, typename Op, int nthreads>
__global__ void sum_kernel_C3(const DevMem2D src, typename TypeVec<R, 3>::vec_t* result)
{
typedef typename TypeVec<T, 3>::vec_t SrcType;
typedef typename TypeVec<R, 3>::vec_t DstType;
__shared__ R smem[nthreads * 3];
const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
SrcType val;
DstType sum = VecTraits<DstType>::all(0);
for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
{
const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y);
for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
{
val = ptr[x0 + x * blockDim.x];
sum = sum + VecTraits<DstType>::make(Op::call(val.x), Op::call(val.y), Op::call(val.z));
}
}
smem[tid] = sum.x;
smem[tid + nthreads] = sum.y;
smem[tid + 2 * nthreads] = sum.z;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
DstType res;
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
result[bid] = res;
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<DstType>::all(0);
smem[tid] = res.x;
smem[tid + nthreads] = res.y;
smem[tid + 2 * nthreads] = res.z;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
result[0] = res;
blocks_finished = 0;
}
}
#else
if (tid == 0)
{
DstType res;
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
result[bid] = res;
}
#endif
}
template <typename T, typename R, int nthreads>
__global__ void sum_pass2_kernel_C3(typename TypeVec<R, 3>::vec_t* result, int size)
{
typedef typename TypeVec<R, 3>::vec_t DstType;
__shared__ R smem[nthreads * 3];
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
DstType res = tid < size ? result[tid] : VecTraits<DstType>::all(0);
smem[tid] = res.x;
smem[tid + nthreads] = res.y;
smem[tid + 2 * nthreads] = res.z;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
result[0] = res;
}
}
template <typename T, typename R, typename Op, int nthreads>
__global__ void sum_kernel_C4(const DevMem2D src, typename TypeVec<R, 4>::vec_t* result)
{
typedef typename TypeVec<T, 4>::vec_t SrcType;
typedef typename TypeVec<R, 4>::vec_t DstType;
__shared__ R smem[nthreads * 4];
const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
SrcType val;
DstType sum = VecTraits<DstType>::all(0);
for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
{
const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y);
for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
{
val = ptr[x0 + x * blockDim.x];
sum = sum + VecTraits<DstType>::make(Op::call(val.x), Op::call(val.y),
Op::call(val.z), Op::call(val.w));
}
}
smem[tid] = sum.x;
smem[tid + nthreads] = sum.y;
smem[tid + 2 * nthreads] = sum.z;
smem[tid + 3 * nthreads] = sum.w;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
sum_in_smem<nthreads, R>(smem + 3 * nthreads, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
DstType res;
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
res.w = smem[3 * nthreads];
result[bid] = res;
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<DstType>::all(0);
smem[tid] = res.x;
smem[tid + nthreads] = res.y;
smem[tid + 2 * nthreads] = res.z;
smem[tid + 3 * nthreads] = res.w;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
sum_in_smem<nthreads, R>(smem + 3 * nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
res.w = smem[3 * nthreads];
result[0] = res;
blocks_finished = 0;
}
}
#else
if (tid == 0)
{
DstType res;
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
res.w = smem[3 * nthreads];
result[bid] = res;
}
#endif
}
template <typename T, typename R, int nthreads>
__global__ void sum_pass2_kernel_C4(typename TypeVec<R, 4>::vec_t* result, int size)
{
typedef typename TypeVec<R, 4>::vec_t DstType;
__shared__ R smem[nthreads * 4];
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
DstType res = tid < size ? result[tid] : VecTraits<DstType>::all(0);
smem[tid] = res.x;
smem[tid + nthreads] = res.y;
smem[tid + 2 * nthreads] = res.z;
smem[tid + 3 * nthreads] = res.w;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
sum_in_smem<nthreads, R>(smem + 3 * nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
res.w = smem[3 * nthreads];
result[0] = res;
}
}
} // namespace sum
template <typename T>
void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{
using namespace sum;
typedef typename SumType<T>::R R;
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
switch (cn)
{
case 1:
sum_kernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
sum_pass2_kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 2:
sum_kernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 3:
sum_kernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 4:
sum_kernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
}
cudaSafeCall(cudaThreadSynchronize());
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
sum[0] = result[0];
sum[1] = result[1];
sum[2] = result[2];
sum[3] = result[3];
}
template void sum_multipass_caller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<char>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<short>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<int>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<float>(const DevMem2D, PtrStep, double*, int);
template <typename T>
void sum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{
using namespace sum;
typedef typename SumType<T>::R R;
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
switch (cn)
{
case 1:
sum_kernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
break;
case 2:
sum_kernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
break;
case 3:
sum_kernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
break;
case 4:
sum_kernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
break;
}
cudaSafeCall(cudaThreadSynchronize());
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
sum[0] = result[0];
sum[1] = result[1];
sum[2] = result[2];
sum[3] = result[3];
}
template void sum_caller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<char>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<short>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<int>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<float>(const DevMem2D, PtrStep, double*, int);
template <typename T>
void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{
using namespace sum;
typedef typename SumType<T>::R R;
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
switch (cn)
{
case 1:
sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
sum_pass2_kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 2:
sum_kernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 3:
sum_kernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 4:
sum_kernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
}
cudaSafeCall(cudaThreadSynchronize());
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
sum[0] = result[0];
sum[1] = result[1];
sum[2] = result[2];
sum[3] = result[3];
}
template void sqsum_multipass_caller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<char>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<short>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<int>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<float>(const DevMem2D, PtrStep, double*, int);
template <typename T>
void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{
using namespace sum;
typedef typename SumType<T>::R R;
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
switch (cn)
{
case 1:
sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
break;
case 2:
sum_kernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
break;
case 3:
sum_kernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
break;
case 4:
sum_kernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
break;
}
cudaSafeCall(cudaThreadSynchronize());
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
sum[0] = result[0];
sum[1] = result[1];
sum[2] = result[2];
sum[3] = result[3];
}
template void sqsum_caller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<char>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<short>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<int>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<float>(const DevMem2D, PtrStep, double*, int);
}}}
...
...
modules/gpu/src/cuda/matrix_reductions.cu
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "opencv2/gpu/device/limits_gpu.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vecmath.hpp"
#include "transform.hpp"
#include "internal_shared.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace mathfunc
{
// Performs reduction in shared memory
template <int size, typename T>
__device__ void sum_in_smem(volatile T* data, const uint tid)
{
T sum = data[tid];
if (size >= 512) { if (tid < 256) { data[tid] = sum = sum + data[tid + 256]; } __syncthreads(); }
if (size >= 256) { if (tid < 128) { data[tid] = sum = sum + data[tid + 128]; } __syncthreads(); }
if (size >= 128) { if (tid < 64) { data[tid] = sum = sum + data[tid + 64]; } __syncthreads(); }
if (tid < 32)
{
if (size >= 64) data[tid] = sum = sum + data[tid + 32];
if (size >= 32) data[tid] = sum = sum + data[tid + 16];
if (size >= 16) data[tid] = sum = sum + data[tid + 8];
if (size >= 8) data[tid] = sum = sum + data[tid + 4];
if (size >= 4) data[tid] = sum = sum + data[tid + 2];
if (size >= 2) data[tid] = sum = sum + data[tid + 1];
}
}
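// Editorial sketch of how the reduction above unrolls for nthreads == 256
// (the only size instantiated in this file):
//   tid < 128: data[tid] += data[tid + 128];  __syncthreads();
//   tid <  64: data[tid] += data[tid +  64];  __syncthreads();
//   tid <  32: data[tid] += data[tid + 32], + 16, + 8, + 4, + 2, + 1
// (warp-synchronous, so no barriers needed), leaving the block total in
// data[0].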
struct Mask8U
{
explicit Mask8U(PtrStep mask): mask(mask) {}
__device__ bool operator()(int y, int x) const
{
return mask.ptr(y)[x];
}
PtrStep mask;
};
struct MaskTrue
{
__device__ bool operator()(int y, int x) const
{
return true;
}
};
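// Editorial note: Mask8U reads the CV_8UC1 mask and treats any non-zero byte
// as "selected"; MaskTrue is the unmasked variant, and since both are passed
// as a template parameter the check inlines away in the unmasked kernels.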
//////////////////////////////////////////////////////////////////////////////
// Min max
// To avoid shared memory bank conflicts we convert each value into a value of
// an appropriate type (32 bits minimum)
template <typename T> struct MinMaxTypeTraits {};
template <> struct MinMaxTypeTraits<uchar> { typedef int best_type; };
template <> struct MinMaxTypeTraits<char> { typedef int best_type; };
template <> struct MinMaxTypeTraits<ushort> { typedef int best_type; };
template <> struct MinMaxTypeTraits<short> { typedef int best_type; };
template <> struct MinMaxTypeTraits<int> { typedef int best_type; };
template <> struct MinMaxTypeTraits<float> { typedef float best_type; };
template <> struct MinMaxTypeTraits<double> { typedef double best_type; };
namespace minmax
{
__constant__ int ctwidth;
__constant__ int ctheight;
// Global counter of blocks that have finished their work
__device__ uint blocks_finished = 0;
// Estimates a good thread configuration
// - the threads variable satisfies threads.x * threads.y == 256
void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
{
threads = dim3(32, 8);
grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
grid.x = min(grid.x, threads.x);
grid.y = min(grid.y, threads.y);
}
// Returns required buffer sizes
void get_buf_size_required(int cols, int rows, int elem_size, int& bufcols, int& bufrows)
{
dim3 threads, grid;
estimate_thread_cfg(cols, rows, threads, grid);
bufcols = grid.x * grid.y * elem_size;
bufrows = 2;
}
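// Editorial note: the two buffer rows map to row 0 = per-block minima and
// row 1 = per-block maxima (see minval_buf/maxval_buf in the callers below).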
// Computes the device constants used by the kernels for the specified thread configuration
void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(ctwidth)));
cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(ctheight)));
}
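// Editorial worked example, assuming a 1920x1080 single-channel image:
//   estimate_thread_cfg: threads = (32, 8),
//     grid.x = min(divUp(1920, 32 * 8), 32) = min(8, 32) = 8
//     grid.y = min(divUp(1080, 8 * 32), 8)  = min(5, 8)  = 5
//   set_kernel_consts:
//     ctwidth  = divUp(divUp(1920, 8), 32) = 8
//     ctheight = divUp(divUp(1080, 5), 8)  = 27
// so the launch uses 40 blocks and each thread strides over up to an
// 8 x 27 patch of elements.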
// Does min and max in shared memory
template <typename T>
__device__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval)
{
minval[tid] = min(minval[tid], minval[tid + offset]);
maxval[tid] = max(maxval[tid], maxval[tid + offset]);
}
template <int size, typename T>
__device__ void find_min_max_in_smem(volatile T* minval, volatile T* maxval, const uint tid)
{
if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval); } __syncthreads(); }
if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval); } __syncthreads(); }
if (size >= 128) { if (tid < 64) { merge(tid, 64, minval, maxval); } __syncthreads(); }
if (tid < 32)
{
if (size >= 64) merge(tid, 32, minval, maxval);
if (size >= 32) merge(tid, 16, minval, maxval);
if (size >= 16) merge(tid, 8, minval, maxval);
if (size >= 8) merge(tid, 4, minval, maxval);
if (size >= 4) merge(tid, 2, minval, maxval);
if (size >= 2) merge(tid, 1, minval, maxval);
}
}
template <int nthreads, typename T, typename Mask>
__global__ void min_max_kernel(const DevMem2D src, Mask mask, T* minval, T* maxval)
{
typedef typename MinMaxTypeTraits<T>::best_type best_type;
__shared__ best_type sminval[nthreads];
__shared__ best_type smaxval[nthreads];
uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
T mymin = numeric_limits_gpu<T>::max();
T mymax = numeric_limits_gpu<T>::is_signed ? -numeric_limits_gpu<T>::max() : numeric_limits_gpu<T>::min();
uint y_end = min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);
uint x_end = min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols);
for (uint y = y0; y < y_end; y += blockDim.y)
{
const T* src_row = (const T*)src.ptr(y);
for (uint x = x0; x < x_end; x += blockDim.x)
{
T val = src_row[x];
if (mask(y, x))
{
mymin = min(mymin, val);
mymax = max(mymax, val);
}
}
}
sminval[tid] = mymin;
smaxval[tid] = mymax;
__syncthreads();
find_min_max_in_smem<nthreads, best_type>(sminval, smaxval, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = ticket == gridDim.x * gridDim.y - 1;
}
__syncthreads();
if (is_last)
{
uint idx = min(tid, gridDim.x * gridDim.y - 1);
sminval[tid] = minval[idx];
smaxval[tid] = maxval[idx];
__syncthreads();
find_min_max_in_smem<nthreads, best_type>(sminval, smaxval, tid);
if (tid == 0)
{
minval[0] = (T)sminval[0];
maxval[0] = (T)smaxval[0];
blocks_finished = 0;
}
}
#else
if (tid == 0)
{
minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
}
#endif
}
template <typename T>
void min_max_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1);
min_max_kernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
}
template void min_max_mask_caller<uchar>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<char>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<ushort>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<short>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<int>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<float>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_caller<double>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template <typename T>
void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1);
min_max_kernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
}
template void min_max_caller<uchar>(const DevMem2D, double*, double*, PtrStep);
template void min_max_caller<char>(const DevMem2D, double*, double*, PtrStep);
template void min_max_caller<ushort>(const DevMem2D, double*, double*, PtrStep);
template void min_max_caller<short>(const DevMem2D, double*, double*, PtrStep);
template void min_max_caller<int>(const DevMem2D, double*, double*, PtrStep);
template void min_max_caller<float>(const DevMem2D, double*, double*, PtrStep);
template void min_max_caller<double>(const DevMem2D, double*, double*, PtrStep);
template <int nthreads, typename T>
__global__ void min_max_pass2_kernel(T* minval, T* maxval, int size)
{
typedef typename MinMaxTypeTraits<T>::best_type best_type;
__shared__ best_type sminval[nthreads];
__shared__ best_type smaxval[nthreads];
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
uint idx = min(tid, (uint)size - 1);
sminval[tid] = minval[idx];
smaxval[tid] = maxval[idx];
__syncthreads();
find_min_max_in_smem<nthreads, best_type>(sminval, smaxval, tid);
if (tid == 0)
{
minval[0] = (T)sminval[0];
maxval[0] = (T)smaxval[0];
}
}
template <typename T>
void min_max_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1);
min_max_kernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);
min_max_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
}
template void min_max_mask_multipass_caller<uchar>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_multipass_caller<char>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_multipass_caller<ushort>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_multipass_caller<short>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_multipass_caller<int>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template void min_max_mask_multipass_caller<float>(const DevMem2D, const PtrStep, double*, double*, PtrStep);
template <typename T>
void min_max_multipass_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)buf.ptr(0);
T* maxval_buf = (T*)buf.ptr(1);
min_max_kernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);
min_max_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
}
template void min_max_multipass_caller<uchar>(const DevMem2D, double*, double*, PtrStep);
template void min_max_multipass_caller<char>(const DevMem2D, double*, double*, PtrStep);
template void min_max_multipass_caller<ushort>(const DevMem2D, double*, double*, PtrStep);
template void min_max_multipass_caller<short>(const DevMem2D, double*, double*, PtrStep);
template void min_max_multipass_caller<int>(const DevMem2D, double*, double*, PtrStep);
template void min_max_multipass_caller<float>(const DevMem2D, double*, double*, PtrStep);
} // namespace minmax
///////////////////////////////////////////////////////////////////////////////
// minMaxLoc
namespace minmaxloc {
__constant__ int ctwidth;
__constant__ int ctheight;
// Global counter of blocks that have finished their work
__device__ uint blocks_finished = 0;
// Estimates a good thread configuration
// - the threads variable satisfies threads.x * threads.y == 256
void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
{
threads = dim3(32, 8);
grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
grid.x = min(grid.x, threads.x);
grid.y = min(grid.y, threads.y);
}
// Returns required buffer sizes
void get_buf_size_required(int cols, int rows, int elem_size, int& b1cols,
int& b1rows, int& b2cols, int& b2rows)
{
dim3 threads, grid;
estimate_thread_cfg(cols, rows, threads, grid);
b1cols = grid.x * grid.y * elem_size; // For values
b1rows = 2;
b2cols = grid.x * grid.y * sizeof(int); // For locations
b2rows = 2;
}
// Computes the device constants used by the kernels for the specified thread configuration
void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(ctwidth)));
cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(ctheight)));
}
template <typename T>
__device__ void merge(uint tid, uint offset, volatile T* minval, volatile T* maxval,
volatile uint* minloc, volatile uint* maxloc)
{
T val = minval[tid + offset];
if (val < minval[tid])
{
minval[tid] = val;
minloc[tid] = minloc[tid + offset];
}
val = maxval[tid + offset];
if (val > maxval[tid])
{
maxval[tid] = val;
maxloc[tid] = maxloc[tid + offset];
}
}
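// Editorial note: unlike the value-only merge() in the minmax namespace,
// this merge must carry the location along with the value, so it uses
// explicit comparisons instead of min()/max().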
template <int size, typename T>
__device__ void find_min_max_loc_in_smem(volatile T* minval, volatile T* maxval, volatile uint* minloc,
volatile uint* maxloc, const uint tid)
{
if (size >= 512) { if (tid < 256) { merge(tid, 256, minval, maxval, minloc, maxloc); } __syncthreads(); }
if (size >= 256) { if (tid < 128) { merge(tid, 128, minval, maxval, minloc, maxloc); } __syncthreads(); }
if (size >= 128) { if (tid < 64) { merge(tid, 64, minval, maxval, minloc, maxloc); } __syncthreads(); }
if (tid < 32)
{
if (size >= 64) merge(tid, 32, minval, maxval, minloc, maxloc);
if (size >= 32) merge(tid, 16, minval, maxval, minloc, maxloc);
if (size >= 16) merge(tid, 8, minval, maxval, minloc, maxloc);
if (size >= 8) merge(tid, 4, minval, maxval, minloc, maxloc);
if (size >= 4) merge(tid, 2, minval, maxval, minloc, maxloc);
if (size >= 2) merge(tid, 1, minval, maxval, minloc, maxloc);
}
}
template <int nthreads, typename T, typename Mask>
__global__ void min_max_loc_kernel(const DevMem2D src, Mask mask, T* minval, T* maxval,
uint* minloc, uint* maxloc)
{
typedef typename MinMaxTypeTraits<T>::best_type best_type;
__shared__ best_type sminval[nthreads];
__shared__ best_type smaxval[nthreads];
__shared__ uint sminloc[nthreads];
__shared__ uint smaxloc[nthreads];
uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
T mymin = numeric_limits_gpu<T>::max();
T mymax = numeric_limits_gpu<T>::is_signed ? -numeric_limits_gpu<T>::max() : numeric_limits_gpu<T>::min();
uint myminloc = 0;
uint mymaxloc = 0;
uint y_end = min(y0 + (ctheight - 1) * blockDim.y + 1, src.rows);
uint x_end = min(x0 + (ctwidth - 1) * blockDim.x + 1, src.cols);
for (uint y = y0; y < y_end; y += blockDim.y)
{
const T* ptr = (const T*)src.ptr(y);
for (uint x = x0; x < x_end; x += blockDim.x)
{
if (mask(y, x))
{
T val = ptr[x];
if (val <= mymin) { mymin = val; myminloc = y * src.cols + x; }
if (val >= mymax) { mymax = val; mymaxloc = y * src.cols + x; }
}
}
}
sminval[tid] = mymin;
smaxval[tid] = mymax;
sminloc[tid] = myminloc;
smaxloc[tid] = mymaxloc;
__syncthreads();
find_min_max_loc_in_smem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
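// Single-pass finish: every block publishes its partial result, then the last
// block through the atomic ticket gate reduces all partials in place.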
__shared__ bool is_last;
if (tid == 0)
{
minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
minloc[blockIdx.y * gridDim.x + blockIdx.x] = sminloc[0];
maxloc[blockIdx.y * gridDim.x + blockIdx.x] = smaxloc[0];
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = ticket == gridDim.x * gridDim.y - 1;
}
__syncthreads();
if (is_last)
{
uint idx = min(tid, gridDim.x * gridDim.y - 1);
sminval[tid] = minval[idx];
smaxval[tid] = maxval[idx];
sminloc[tid] = minloc[idx];
smaxloc[tid] = maxloc[idx];
__syncthreads();
find_min_max_loc_in_smem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
if (tid == 0)
{
minval[0] = (T)sminval[0];
maxval[0] = (T)smaxval[0];
minloc[0] = sminloc[0];
maxloc[0] = smaxloc[0];
blocks_finished = 0;
}
}
#else
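// No global atomics: each block leaves one partial result in global memory and
// min_max_loc_pass2_kernel (below) finishes the reduction in a second launch.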
if (tid == 0)
{
minval[blockIdx.y * gridDim.x + blockIdx.x] = (T)sminval[0];
maxval[blockIdx.y * gridDim.x + blockIdx.x] = (T)smaxval[0];
minloc[blockIdx.y * gridDim.x + blockIdx.x] = sminloc[0];
maxloc[blockIdx.y * gridDim.x + blockIdx.x] = smaxloc[0];
}
#endif
}
template <typename T>
void min_max_loc_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
uint* minloc_buf = (uint*)locbuf.ptr(0);
uint* maxloc_buf = (uint*)locbuf.ptr(1);
min_max_loc_kernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
uint minloc_, maxloc_;
cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
template void min_max_loc_mask_caller<uchar>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<char>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<ushort>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<short>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<int>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<float>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_caller<double>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template <typename T>
void min_max_loc_caller(const DevMem2D src, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
uint* minloc_buf = (uint*)locbuf.ptr(0);
uint* maxloc_buf = (uint*)locbuf.ptr(1);
min_max_loc_kernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
uint minloc_, maxloc_;
cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
template void min_max_loc_caller<uchar>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<char>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<ushort>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<short>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<int>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<float>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_caller<double>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
// This kernel is used only on devices without global atomics (i.e. compute capability 1.0)
template <int nthreads, typename T>
__global__ void min_max_loc_pass2_kernel(T* minval, T* maxval, uint* minloc, uint* maxloc, int size)
{
typedef typename MinMaxTypeTraits<T>::best_type best_type;
__shared__ best_type sminval[nthreads];
__shared__ best_type smaxval[nthreads];
__shared__ uint sminloc[nthreads];
__shared__ uint smaxloc[nthreads];
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
uint idx = min(tid, (uint)size - 1); // clamp by the number of partial results; gridDim is 1x1 in this single-block launch
sminval[tid] = minval[idx];
smaxval[tid] = maxval[idx];
sminloc[tid] = minloc[idx];
smaxloc[tid] = maxloc[idx];
__syncthreads();
find_min_max_loc_in_smem<nthreads, best_type>(sminval, smaxval, sminloc, smaxloc, tid);
if (tid == 0)
{
minval[0] = (T)sminval[0];
maxval[0] = (T)smaxval[0];
minloc[0] = sminloc[0];
maxloc[0] = smaxloc[0];
}
}
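// The multipass callers launch the main kernel followed by a single 256-thread
// block of the pass-2 kernel to fold the per-block partials.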
template <typename T>
void min_max_loc_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
uint* minloc_buf = (uint*)locbuf.ptr(0);
uint* maxloc_buf = (uint*)locbuf.ptr(1);
min_max_loc_kernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
min_max_loc_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
uint minloc_, maxloc_;
cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
template void min_max_loc_mask_multipass_caller<uchar>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_multipass_caller<char>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_multipass_caller<ushort>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_multipass_caller<short>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_multipass_caller<int>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_mask_multipass_caller<float>(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);
template <typename T>
void min_max_loc_multipass_caller(const DevMem2D src, double* minval, double* maxval,
int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
T* minval_buf = (T*)valbuf.ptr(0);
T* maxval_buf = (T*)valbuf.ptr(1);
uint* minloc_buf = (uint*)locbuf.ptr(0);
uint* maxloc_buf = (uint*)locbuf.ptr(1);
min_max_loc_kernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf, minloc_buf, maxloc_buf);
min_max_loc_pass2_kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
cudaSafeCall(cudaMemcpy(&minval_, minval_buf, sizeof(T), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
*minval = minval_;
*maxval = maxval_;
uint minloc_, maxloc_;
cudaSafeCall(cudaMemcpy(&minloc_, minloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
cudaSafeCall(cudaMemcpy(&maxloc_, maxloc_buf, sizeof(int), cudaMemcpyDeviceToHost));
minloc[1] = minloc_ / src.cols; minloc[0] = minloc_ - minloc[1] * src.cols;
maxloc[1] = maxloc_ / src.cols; maxloc[0] = maxloc_ - maxloc[1] * src.cols;
}
template void min_max_loc_multipass_caller<uchar>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_multipass_caller<char>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_multipass_caller<ushort>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_multipass_caller<short>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_multipass_caller<int>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
template void min_max_loc_multipass_caller<float>(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
} // namespace minmaxloc
//////////////////////////////////////////////////////////////////////////////////////////////////////////
// countNonZero
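// Same two-level scheme as minMaxLoc above: per-thread counting over a patch,
// shared-memory reduction per block, then either the atomic ticket trick or a
// second kernel pass depending on device capability.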
namespace countnonzero
{
__constant__ int ctwidth;
__constant__ int ctheight;
__device__ uint blocks_finished = 0;
void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
{
threads = dim3(32, 8);
grid = dim3(divUp(cols, threads.x * 8), divUp(rows, threads.y * 32));
grid.x = min(grid.x, threads.x);
grid.y = min(grid.y, threads.y);
}
void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows)
{
dim3 threads, grid;
estimate_thread_cfg(cols, rows, threads, grid);
bufcols = grid.x * grid.y * sizeof(int);
bufrows = 1;
}
void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth)));
cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight)));
}
template <int nthreads, typename T>
__global__ void count_non_zero_kernel(const DevMem2D src, volatile uint* count)
{
__shared__ uint scount[nthreads];
uint x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
uint y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
uint cnt = 0;
for (uint y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
{
const T* ptr = (const T*)src.ptr(y0 + y * blockDim.y);
for (uint x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
cnt += ptr[x0 + x * blockDim.x] != 0;
}
scount[tid] = cnt;
__syncthreads();
sum_in_smem<nthreads, uint>(scount, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0];
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = ticket == gridDim.x * gridDim.y - 1;
}
__syncthreads();
if (is_last)
{
scount[tid] = tid < gridDim.x * gridDim.y ? count[tid] : 0;
__syncthreads();
sum_in_smem<nthreads, uint>(scount, tid);
if (tid == 0)
{
count[0] = scount[0];
blocks_finished = 0;
}
}
#else
if (tid == 0) count[blockIdx.y * gridDim.x + blockIdx.x] = scount[0];
#endif
}
template <typename T>
int count_non_zero_caller(const DevMem2D src, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
uint* count_buf = (uint*)buf.ptr(0);
count_non_zero_kernel<256, T><<<grid, threads>>>(src, count_buf);
cudaSafeCall(cudaThreadSynchronize());
uint count;
cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));
return count;
}
template int count_non_zero_caller<uchar>(const DevMem2D, PtrStep);
template int count_non_zero_caller<char>(const DevMem2D, PtrStep);
template int count_non_zero_caller<ushort>(const DevMem2D, PtrStep);
template int count_non_zero_caller<short>(const DevMem2D, PtrStep);
template int count_non_zero_caller<int>(const DevMem2D, PtrStep);
template int count_non_zero_caller<float>(const DevMem2D, PtrStep);
template int count_non_zero_caller<double>(const DevMem2D, PtrStep);
template <int nthreads, typename T>
__global__ void count_non_zero_pass2_kernel(uint* count, int size)
{
__shared__ uint scount[nthreads];
uint tid = threadIdx.y * blockDim.x + threadIdx.x;
scount[tid] = tid < size ? count[tid] : 0;
__syncthreads();
sum_in_smem<nthreads, uint>(scount, tid);
if (tid == 0)
count[0] = scount[0];
}
template <typename T>
int count_non_zero_multipass_caller(const DevMem2D src, PtrStep buf)
{
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
uint* count_buf = (uint*)buf.ptr(0);
count_non_zero_kernel<256, T><<<grid, threads>>>(src, count_buf);
count_non_zero_pass2_kernel<256, T><<<1, 256>>>(count_buf, grid.x * grid.y);
cudaSafeCall(cudaThreadSynchronize());
uint count;
cudaSafeCall(cudaMemcpy(&count, count_buf, sizeof(int), cudaMemcpyDeviceToHost));
return count;
}
template int count_non_zero_multipass_caller<uchar>(const DevMem2D, PtrStep);
template int count_non_zero_multipass_caller<char>(const DevMem2D, PtrStep);
template int count_non_zero_multipass_caller<ushort>(const DevMem2D, PtrStep);
template int count_non_zero_multipass_caller<short>(const DevMem2D, PtrStep);
template int count_non_zero_multipass_caller<int>(const DevMem2D, PtrStep);
template int count_non_zero_multipass_caller<float>(const DevMem2D, PtrStep);
} // namespace countnonzero
//////////////////////////////////////////////////////////////////////////
// Sum
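// The kernels below come in C1/C2/C3/C4 flavors; host callers dispatch on the
// channel count. Accumulation happens in SumType<T>::R rather than T so that
// narrow integer types do not overflow.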
namespace sum
{
template <typename T> struct SumType {};
template <> struct SumType<uchar> { typedef uint R; };
template <> struct SumType<char> { typedef int R; };
template <> struct SumType<ushort> { typedef uint R; };
template <> struct SumType<short> { typedef int R; };
template <> struct SumType<int> { typedef int R; };
template <> struct SumType<float> { typedef float R; };
template <> struct SumType<double> { typedef double R; };
template <typename R>
struct IdentityOp { static __device__ R call(R x) { return x; } };
template <typename R>
struct SqrOp { static __device__ R call(R x) { return x * x; } };
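// IdentityOp and SqrOp are the per-element transforms: the same kernels compute
// sum() when instantiated with IdentityOp and sqrSum() with SqrOp.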
__constant__ int ctwidth;
__constant__ int ctheight;
__device__ uint blocks_finished = 0;
const int threads_x = 32;
const int threads_y = 8;
void estimate_thread_cfg(int cols, int rows, dim3& threads, dim3& grid)
{
threads = dim3(threads_x, threads_y);
grid = dim3(divUp(cols, threads.x * threads.y),
divUp(rows, threads.y * threads.x));
grid.x = min(grid.x, threads.x);
grid.y = min(grid.y, threads.y);
}
void get_buf_size_required(int cols, int rows, int cn, int& bufcols, int& bufrows)
{
dim3 threads, grid;
estimate_thread_cfg(cols, rows, threads, grid);
bufcols = grid.x * grid.y * sizeof(double) * cn;
bufrows = 1;
}
void set_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
{
int twidth = divUp(divUp(cols, grid.x), threads.x);
int theight = divUp(divUp(rows, grid.y), threads.y);
cudaSafeCall(cudaMemcpyToSymbol(ctwidth, &twidth, sizeof(twidth)));
cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(theight)));
}
template <typename T, typename R, typename Op, int nthreads>
__global__ void sum_kernel(const DevMem2D src, R* result)
{
__shared__ R smem[nthreads];
const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
R sum = 0;
for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
{
const T* ptr = (const T*)src.ptr(y0 + y * blockDim.y);
for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
sum += Op::call(ptr[x0 + x * blockDim.x]);
}
smem[tid] = sum;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
result[bid] = smem[0];
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
smem[tid] = tid < gridDim.x * gridDim.y ? result[tid] : 0;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
if (tid == 0)
{
result[0] = smem[0];
blocks_finished = 0;
}
}
#else
if (tid == 0) result[bid] = smem[0];
#endif
}
template <typename T, typename R, int nthreads>
__global__ void sum_pass2_kernel(R* result, int size)
{
__shared__ R smem[nthreads];
int tid = threadIdx.y * blockDim.x + threadIdx.x;
smem[tid] = tid < size ? result[tid] : 0;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
if (tid == 0)
result[0] = smem[0];
}
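// Pass-2 kernels run as one 256-thread block on devices without global atomics;
// `size` is the number of per-block partials produced by the first pass.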
template <typename T, typename R, typename Op, int nthreads>
__global__ void sum_kernel_C2(const DevMem2D src, typename TypeVec<R, 2>::vec_t* result)
{
typedef typename TypeVec<T, 2>::vec_t SrcType;
typedef typename TypeVec<R, 2>::vec_t DstType;
__shared__ R smem[nthreads * 2];
const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
SrcType val;
DstType sum = VecTraits<DstType>::all(0);
for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
{
const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y);
for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
{
val = ptr[x0 + x * blockDim.x];
sum = sum + VecTraits<DstType>::make(Op::call(val.x), Op::call(val.y));
}
}
smem[tid] = sum.x;
smem[tid + nthreads] = sum.y;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
DstType res;
res.x = smem[0];
res.y = smem[nthreads];
result[bid] = res;
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<DstType>::all(0);
smem[tid] = res.x;
smem[tid + nthreads] = res.y;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
result[0] = res;
blocks_finished = 0;
}
}
#else
if (tid == 0)
{
DstType res;
res.x = smem[0];
res.y = smem[nthreads];
result[bid] = res;
}
#endif
}
template <typename T, typename R, int nthreads>
__global__ void sum_pass2_kernel_C2(typename TypeVec<R, 2>::vec_t* result, int size)
{
typedef typename TypeVec<R, 2>::vec_t DstType;
__shared__ R smem[nthreads * 2];
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
DstType res = tid < size ? result[tid] : VecTraits<DstType>::all(0); // guard against `size`, not gridDim: this pass-2 kernel runs as a single block
smem[tid] = res.x;
smem[tid + nthreads] = res.y;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
result[0] = res;
}
}
template <typename T, typename R, typename Op, int nthreads>
__global__ void sum_kernel_C3(const DevMem2D src, typename TypeVec<R, 3>::vec_t* result)
{
typedef typename TypeVec<T, 3>::vec_t SrcType;
typedef typename TypeVec<R, 3>::vec_t DstType;
__shared__ R smem[nthreads * 3];
const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
SrcType val;
DstType sum = VecTraits<DstType>::all(0);
for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
{
const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y);
for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
{
val = ptr[x0 + x * blockDim.x];
sum = sum + VecTraits<DstType>::make(Op::call(val.x), Op::call(val.y), Op::call(val.z));
}
}
smem[tid] = sum.x;
smem[tid + nthreads] = sum.y;
smem[tid + 2 * nthreads] = sum.z;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
DstType res;
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
result[bid] = res;
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<DstType>::all(0);
smem[tid] = res.x;
smem[tid + nthreads] = res.y;
smem[tid + 2 * nthreads] = res.z;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
result[0] = res;
blocks_finished = 0;
}
}
#else
if (tid == 0)
{
DstType res;
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
result[bid] = res;
}
#endif
}
template <typename T, typename R, int nthreads>
__global__ void sum_pass2_kernel_C3(typename TypeVec<R, 3>::vec_t* result, int size)
{
typedef typename TypeVec<R, 3>::vec_t DstType;
__shared__ R smem[nthreads * 3];
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
DstType res = tid < size ? result[tid] : VecTraits<DstType>::all(0); // guard against `size`, not gridDim: this pass-2 kernel runs as a single block
smem[tid] = res.x;
smem[tid + nthreads] = res.y;
smem[tid + 2 * nthreads] = res.z;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
result[0] = res;
}
}
template <typename T, typename R, typename Op, int nthreads>
__global__ void sum_kernel_C4(const DevMem2D src, typename TypeVec<R, 4>::vec_t* result)
{
typedef typename TypeVec<T, 4>::vec_t SrcType;
typedef typename TypeVec<R, 4>::vec_t DstType;
__shared__ R smem[nthreads * 4];
const int x0 = blockIdx.x * blockDim.x * ctwidth + threadIdx.x;
const int y0 = blockIdx.y * blockDim.y * ctheight + threadIdx.y;
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
const int bid = blockIdx.y * gridDim.x + blockIdx.x;
SrcType val;
DstType sum = VecTraits<DstType>::all(0);
for (int y = 0; y < ctheight && y0 + y * blockDim.y < src.rows; ++y)
{
const SrcType* ptr = (const SrcType*)src.ptr(y0 + y * blockDim.y);
for (int x = 0; x < ctwidth && x0 + x * blockDim.x < src.cols; ++x)
{
val = ptr[x0 + x * blockDim.x];
sum = sum + VecTraits<DstType>::make(Op::call(val.x), Op::call(val.y),
Op::call(val.z), Op::call(val.w));
}
}
smem[tid] = sum.x;
smem[tid + nthreads] = sum.y;
smem[tid + 2 * nthreads] = sum.z;
smem[tid + 3 * nthreads] = sum.w;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
sum_in_smem<nthreads, R>(smem + 3 * nthreads, tid);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 110
__shared__ bool is_last;
if (tid == 0)
{
DstType res;
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
res.w = smem[3 * nthreads];
result[bid] = res;
__threadfence();
uint ticket = atomicInc(&blocks_finished, gridDim.x * gridDim.y);
is_last = (ticket == gridDim.x * gridDim.y - 1);
}
__syncthreads();
if (is_last)
{
DstType res = tid < gridDim.x * gridDim.y ? result[tid] : VecTraits<DstType>::all(0);
smem[tid] = res.x;
smem[tid + nthreads] = res.y;
smem[tid + 2 * nthreads] = res.z;
smem[tid + 3 * nthreads] = res.w;
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
sum_in_smem<nthreads, R>(smem + 3 * nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
res.w = smem[3 * nthreads];
result[0] = res;
blocks_finished = 0;
}
}
#else
if (tid == 0)
{
DstType res;
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
res.w = smem[3 * nthreads];
result[bid] = res;
}
#endif
}
template <typename T, typename R, int nthreads>
__global__ void sum_pass2_kernel_C4(typename TypeVec<R, 4>::vec_t* result, int size)
{
typedef typename TypeVec<R, 4>::vec_t DstType;
__shared__ R smem[nthreads * 4];
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
DstType res = tid < size ? result[tid] : VecTraits<DstType>::all(0); // guard against `size`, not gridDim: this pass-2 kernel runs as a single block
smem[tid] = res.x;
smem[tid + nthreads] = res.y;
smem[tid + 2 * nthreads] = res.z;
smem[tid + 3 * nthreads] = res.w; // was res.z: the w channel was being dropped
__syncthreads();
sum_in_smem<nthreads, R>(smem, tid);
sum_in_smem<nthreads, R>(smem + nthreads, tid);
sum_in_smem<nthreads, R>(smem + 2 * nthreads, tid);
sum_in_smem<nthreads, R>(smem + 3 * nthreads, tid);
if (tid == 0)
{
res.x = smem[0];
res.y = smem[nthreads];
res.z = smem[2 * nthreads];
res.w = smem[3 * nthreads];
result[0] = res;
}
}
} // namespace sum
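// Host-side callers: configure the launch, pick the kernel variant matching the
// channel count, and copy the up-to-four accumulated values back to the host.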
template <typename T>
void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{
using namespace sum;
typedef typename SumType<T>::R R;
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
switch (cn)
{
case 1:
    sum_kernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
            src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
    sum_pass2_kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
            (typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);
    break; // break was missing: each case fell through and launched every wider variant
case 2:
    sum_kernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
            src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
    sum_pass2_kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
            (typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);
    break;
case 3:
    sum_kernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
            src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
    sum_pass2_kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
            (typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);
    break;
case 4:
    sum_kernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
            src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
    sum_pass2_kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
            (typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);
    break;
}
cudaSafeCall(cudaThreadSynchronize());
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
sum[0] = result[0];
sum[1] = result[1];
sum[2] = result[2];
sum[3] = result[3];
}
template void sum_multipass_caller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<char>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<short>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<int>(const DevMem2D, PtrStep, double*, int);
template void sum_multipass_caller<float>(const DevMem2D, PtrStep, double*, int);
template <typename T>
void sum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{
using namespace sum;
typedef typename SumType<T>::R R;
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
switch (cn)
{
case 1:
sum_kernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
break;
case 2:
sum_kernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
break;
case 3:
sum_kernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
break;
case 4:
sum_kernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
break;
}
cudaSafeCall(cudaThreadSynchronize());
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
sum[0] = result[0];
sum[1] = result[1];
sum[2] = result[2];
sum[3] = result[3];
}
template void sum_caller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<char>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<short>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<int>(const DevMem2D, PtrStep, double*, int);
template void sum_caller<float>(const DevMem2D, PtrStep, double*, int);
template <typename T>
void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{
using namespace sum;
typedef typename SumType<T>::R R;
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
switch (cn)
{
case 1:
sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
sum_pass2_kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 2:
sum_kernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 3:
sum_kernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
case 4:
sum_kernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
sum_pass2_kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);
break;
}
cudaSafeCall(cudaThreadSynchronize());
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
sum[0] = result[0];
sum[1] = result[1];
sum[2] = result[2];
sum[3] = result[3];
}
template void sqsum_multipass_caller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<char>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<short>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<int>(const DevMem2D, PtrStep, double*, int);
template void sqsum_multipass_caller<float>(const DevMem2D, PtrStep, double*, int);
template <typename T>
void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn)
{
using namespace sum;
typedef typename SumType<T>::R R;
dim3 threads, grid;
estimate_thread_cfg(src.cols, src.rows, threads, grid);
set_kernel_consts(src.cols, src.rows, threads, grid);
switch (cn)
{
case 1:
sum_kernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
break;
case 2:
sum_kernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
break;
case 3:
sum_kernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
break;
case 4:
sum_kernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
break;
}
cudaSafeCall(cudaThreadSynchronize());
R result[4] = {0, 0, 0, 0};
cudaSafeCall(cudaMemcpy(result, buf.ptr(0), sizeof(R) * cn, cudaMemcpyDeviceToHost));
sum[0] = result[0];
sum[1] = result[1];
sum[2] = result[2];
sum[3] = result[3];
}
template void sqsum_caller<uchar>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<char>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<ushort>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<short>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<int>(const DevMem2D, PtrStep, double*, int);
template void sqsum_caller<float>(const DevMem2D, PtrStep, double*, int);
}}}
\ No newline at end of file
modules/gpu/src/element_operations.cpp
...
...
@@ -66,10 +66,14 @@ void cv::gpu::bitwise_and(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&)
void cv::gpu::bitwise_and(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, const Stream&) { throw_nogpu(); }
void cv::gpu::bitwise_xor(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&) { throw_nogpu(); }
void cv::gpu::bitwise_xor(const GpuMat&, const GpuMat&, GpuMat&, const GpuMat&, const Stream&) { throw_nogpu(); }
cv::gpu::GpuMat cv::gpu::operator ~ (const GpuMat&) { throw_nogpu(); return GpuMat(); }
cv::gpu::GpuMat cv::gpu::operator | (const GpuMat&, const GpuMat&) { throw_nogpu(); return GpuMat(); }
cv::gpu::GpuMat cv::gpu::operator & (const GpuMat&, const GpuMat&) { throw_nogpu(); return GpuMat(); }
cv::gpu::GpuMat cv::gpu::operator ^ (const GpuMat&, const GpuMat&) { throw_nogpu(); return GpuMat(); }

void cv::gpu::min(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::min(const GpuMat&, const GpuMat&, GpuMat&, const Stream&) { throw_nogpu(); }
void cv::gpu::min(const GpuMat&, double, GpuMat&) { throw_nogpu(); }
void cv::gpu::min(const GpuMat&, double, GpuMat&, const Stream&) { throw_nogpu(); }
void cv::gpu::max(const GpuMat&, const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::max(const GpuMat&, const GpuMat&, GpuMat&, const Stream&) { throw_nogpu(); }
void cv::gpu::max(const GpuMat&, double, GpuMat&) { throw_nogpu(); }
void cv::gpu::max(const GpuMat&, double, GpuMat&, const Stream&) { throw_nogpu(); }
#else
...
...
@@ -574,4 +578,144 @@ void cv::gpu::bitwise_xor(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, c
    ::bitwiseXorCaller(src1, src2, dst, mask, StreamAccessor::getStream(stream));
}
//////////////////////////////////////////////////////////////////////////////
// Minimum and maximum operations
namespace cv { namespace gpu { namespace mathfunc
{
    template <typename T>
    void min_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);

    template <typename T>
    void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);

    template <typename T>
    void min_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream);

    template <typename T>
    void max_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream);
}}}
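// The helpers below flatten multi-channel matrices with reshape(1), so a single
// kernel instantiation per depth covers every channel count.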
namespace
{
    template <typename T>
    void min_caller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream)
    {
        CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
        dst.create(src1.size(), src1.type());
        mathfunc::min_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
    }

    template <typename T>
    void min_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)
    {
        dst.create(src1.size(), src1.type());
        mathfunc::min_gpu<T>(src1.reshape(1), src2, dst.reshape(1), stream);
    }

    template <typename T>
    void max_caller(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream)
    {
        CV_Assert(src1.size() == src2.size() && src1.type() == src2.type());
        dst.create(src1.size(), src1.type());
        mathfunc::max_gpu<T>(src1.reshape(1), src2.reshape(1), dst.reshape(1), stream);
    }

    template <typename T>
    void max_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)
    {
        dst.create(src1.size(), src1.type());
        mathfunc::max_gpu<T>(src1.reshape(1), src2, dst.reshape(1), stream);
    }
}
void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
{
    typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] =
    {
        min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>,
        min_caller<int>, min_caller<float>, min_caller<double>
    };
    funcs[src1.depth()](src1, src2, dst, 0);
}

void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream)
{
    typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] =
    {
        min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>,
        min_caller<int>, min_caller<float>, min_caller<double>
    };
    funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
}

void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst)
{
    typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] =
    {
        min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>,
        min_caller<int>, min_caller<float>, min_caller<double>
    };
    funcs[src1.depth()](src1, src2, dst, 0);
}

void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream)
{
    typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] =
    {
        min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>,
        min_caller<int>, min_caller<float>, min_caller<double>
    };
    funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
}

void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
{
    typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] =
    {
        max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>,
        max_caller<int>, max_caller<float>, max_caller<double>
    };
    funcs[src1.depth()](src1, src2, dst, 0);
}

void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Stream& stream)
{
    typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] =
    {
        max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>,
        max_caller<int>, max_caller<float>, max_caller<double>
    };
    funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
}

void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst)
{
    typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] =
    {
        max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>,
        max_caller<int>, max_caller<float>, max_caller<double>
    };
    funcs[src1.depth()](src1, src2, dst, 0);
}

void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst, const Stream& stream)
{
    typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] =
    {
        max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>,
        max_caller<int>, max_caller<float>, max_caller<double>
    };
    funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
}
#endif
\ No newline at end of file
modules/gpu/src/matrix_reductions.cpp
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::gpu;
#if !defined (HAVE_CUDA)
void cv::gpu::meanStdDev(const GpuMat&, Scalar&, Scalar&) { throw_nogpu(); }
double cv::gpu::norm(const GpuMat&, int) { throw_nogpu(); return 0.0; }
double cv::gpu::norm(const GpuMat&, const GpuMat&, int) { throw_nogpu(); return 0.0; }
Scalar cv::gpu::sum(const GpuMat&) { throw_nogpu(); return Scalar(); }
Scalar cv::gpu::sum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); }
Scalar cv::gpu::sqrSum(const GpuMat&) { throw_nogpu(); return Scalar(); }
Scalar cv::gpu::sqrSum(const GpuMat&, GpuMat&) { throw_nogpu(); return Scalar(); }
void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&) { throw_nogpu(); }
void cv::gpu::minMax(const GpuMat&, double*, double*, const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&) { throw_nogpu(); }
void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*, const GpuMat&, GpuMat&, GpuMat&) { throw_nogpu(); }
int cv::gpu::countNonZero(const GpuMat&) { throw_nogpu(); return 0; }
int cv::gpu::countNonZero(const GpuMat&, GpuMat&) { throw_nogpu(); return 0; }
#else
////////////////////////////////////////////////////////////////////////
// meanStdDev
void cv::gpu::meanStdDev(const GpuMat& src, Scalar& mean, Scalar& stddev)
{
    CV_Assert(src.type() == CV_8UC1);

    NppiSize sz;
    sz.width  = src.cols;
    sz.height = src.rows;

    nppSafeCall( nppiMean_StdDev_8u_C1R(src.ptr<Npp8u>(), src.step, sz, mean.val, stddev.val) );
}
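// meanStdDev is a thin wrapper over NPP's nppiMean_StdDev_8u_C1R, hence the
// CV_8UC1-only restriction asserted above.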
////////////////////////////////////////////////////////////////////////
// norm
double cv::gpu::norm(const GpuMat& src1, int normType)
{
    return norm(src1, GpuMat(src1.size(), src1.type(), Scalar::all(0.0)), normType);
}

double cv::gpu::norm(const GpuMat& src1, const GpuMat& src2, int normType)
{
    CV_DbgAssert(src1.size() == src2.size() && src1.type() == src2.type());
    CV_Assert(src1.type() == CV_8UC1);
    CV_Assert(normType == NORM_INF || normType == NORM_L1 || normType == NORM_L2);

    typedef NppStatus (*npp_norm_diff_func_t)(const Npp8u* pSrc1, int nSrcStep1, const Npp8u* pSrc2, int nSrcStep2,
                                              NppiSize oSizeROI, Npp64f* pRetVal);

    static const npp_norm_diff_func_t npp_norm_diff_func[] = {nppiNormDiff_Inf_8u_C1R, nppiNormDiff_L1_8u_C1R, nppiNormDiff_L2_8u_C1R};

    NppiSize sz;
    sz.width  = src1.cols;
    sz.height = src1.rows;

    int funcIdx = normType >> 1;
    double retVal;

    nppSafeCall( npp_norm_diff_func[funcIdx](src1.ptr<Npp8u>(), src1.step, src2.ptr<Npp8u>(), src2.step, sz, &retVal) );

    return retVal;
}
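// Note: funcIdx = normType >> 1 maps NORM_INF (1), NORM_L1 (2) and NORM_L2 (4)
// to indices 0, 1 and 2 of npp_norm_diff_func.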
////////////////////////////////////////////////////////////////////////
// Sum
namespace cv { namespace gpu { namespace mathfunc
{
    template <typename T>
    void sum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);

    template <typename T>
    void sum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);

    template <typename T>
    void sqsum_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);

    template <typename T>
    void sqsum_multipass_caller(const DevMem2D src, PtrStep buf, double* sum, int cn);

    namespace sum
    {
        void get_buf_size_required(int cols, int rows, int cn, int& bufcols, int& bufrows);
    }
}}}

Scalar cv::gpu::sum(const GpuMat& src)
{
    GpuMat buf;
    return sum(src, buf);
}

Scalar cv::gpu::sum(const GpuMat& src, GpuMat& buf)
{
    using namespace mathfunc;

    typedef void (*Caller)(const DevMem2D, PtrStep, double*, int);
    static const Caller callers[2][7] =
        { { sum_multipass_caller<unsigned char>, sum_multipass_caller<char>,
            sum_multipass_caller<unsigned short>, sum_multipass_caller<short>,
            sum_multipass_caller<int>, sum_multipass_caller<float>, 0 },
          { sum_caller<unsigned char>, sum_caller<char>,
            sum_caller<unsigned short>, sum_caller<short>,
            sum_caller<int>, sum_caller<float>, 0 } };

    Size bufSize;
    sum::get_buf_size_required(src.cols, src.rows, src.channels(), bufSize.width, bufSize.height);
    buf.create(bufSize, CV_8U);

    Caller caller = callers[hasAtomicsSupport(getDevice())][src.depth()];
    if (!caller) CV_Error(CV_StsBadArg, "sum: unsupported type");

    double result[4];
    caller(src, buf, result, src.channels());
    return Scalar(result[0], result[1], result[2], result[3]);
}

Scalar cv::gpu::sqrSum(const GpuMat& src)
{
    GpuMat buf;
    return sqrSum(src, buf);
}

Scalar cv::gpu::sqrSum(const GpuMat& src, GpuMat& buf)
{
    using namespace mathfunc;

    typedef void (*Caller)(const DevMem2D, PtrStep, double*, int);
    static const Caller callers[2][7] =
        { { sqsum_multipass_caller<unsigned char>, sqsum_multipass_caller<char>,
            sqsum_multipass_caller<unsigned short>, sqsum_multipass_caller<short>,
            sqsum_multipass_caller<int>, sqsum_multipass_caller<float>, 0 },
          { sqsum_caller<unsigned char>, sqsum_caller<char>,
            sqsum_caller<unsigned short>, sqsum_caller<short>,
            sqsum_caller<int>, sqsum_caller<float>, 0 } };

    Size bufSize;
    sum::get_buf_size_required(src.cols, src.rows, src.channels(), bufSize.width, bufSize.height);
    buf.create(bufSize, CV_8U);

    Caller caller = callers[hasAtomicsSupport(getDevice())][src.depth()];
    if (!caller) CV_Error(CV_StsBadArg, "sqrSum: unsupported type");

    double result[4];
    caller(src, buf, result, src.channels());
    return Scalar(result[0], result[1], result[2], result[3]);
}
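// In the caller tables above, row 0 holds the multipass variants used on
// devices without global atomics and row 1 the single-pass ones; null entries
// mark unsupported depths (CV_64F has no sum implementation here).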
////////////////////////////////////////////////////////////////////////
// Find min or max
namespace cv { namespace gpu { namespace mathfunc { namespace minmax
{
    void get_buf_size_required(int cols, int rows, int elem_size, int& bufcols, int& bufrows);

    template <typename T>
    void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf);

    template <typename T>
    void min_max_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf);

    template <typename T>
    void min_max_multipass_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf);

    template <typename T>
    void min_max_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval, PtrStep buf);
}}}}

void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask)
{
    GpuMat buf;
    minMax(src, minVal, maxVal, mask, buf);
}

void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, const GpuMat& mask, GpuMat& buf)
{
    using namespace mathfunc::minmax;

    typedef void (*Caller)(const DevMem2D, double*, double*, PtrStep);
    typedef void (*MaskedCaller)(const DevMem2D, const PtrStep, double*, double*, PtrStep);

    static const Caller callers[2][7] =
        { { min_max_multipass_caller<unsigned char>, min_max_multipass_caller<char>,
            min_max_multipass_caller<unsigned short>, min_max_multipass_caller<short>,
            min_max_multipass_caller<int>, min_max_multipass_caller<float>, 0 },
          { min_max_caller<unsigned char>, min_max_caller<char>,
            min_max_caller<unsigned short>, min_max_caller<short>,
            min_max_caller<int>, min_max_caller<float>, min_max_caller<double> } };

    static const MaskedCaller masked_callers[2][7] =
        { { min_max_mask_multipass_caller<unsigned char>, min_max_mask_multipass_caller<char>,
            min_max_mask_multipass_caller<unsigned short>, min_max_mask_multipass_caller<short>,
            min_max_mask_multipass_caller<int>, min_max_mask_multipass_caller<float>, 0 },
          { min_max_mask_caller<unsigned char>, min_max_mask_caller<char>,
            min_max_mask_caller<unsigned short>, min_max_mask_caller<short>,
            min_max_mask_caller<int>, min_max_mask_caller<float>, min_max_mask_caller<double> } };

    CV_Assert(src.channels() == 1);
    CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));
    CV_Assert(src.type() != CV_64F || hasNativeDoubleSupport(getDevice()));

    double minVal_; if (!minVal) minVal = &minVal_;
    double maxVal_; if (!maxVal) maxVal = &maxVal_;

    Size bufSize;
    get_buf_size_required(src.cols, src.rows, src.elemSize(), bufSize.width, bufSize.height);
    buf.create(bufSize, CV_8U);

    if (mask.empty())
    {
        Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];
        if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type");
        caller(src, minVal, maxVal, buf);
    }
    else
    {
        MaskedCaller caller = masked_callers[hasAtomicsSupport(getDevice())][src.type()];
        if (!caller) CV_Error(CV_StsBadArg, "minMax: unsupported type");
        caller(src, mask, minVal, maxVal, buf);
    }
}
////////////////////////////////////////////////////////////////////////
// Locate min and max
namespace cv { namespace gpu { namespace mathfunc { namespace minmaxloc
{
    void get_buf_size_required(int cols, int rows, int elem_size, int& b1cols,
                               int& b1rows, int& b2cols, int& b2rows);

    template <typename T>
    void min_max_loc_caller(const DevMem2D src, double* minval, double* maxval,
                            int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf);

    template <typename T>
    void min_max_loc_mask_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
                                 int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf);

    template <typename T>
    void min_max_loc_multipass_caller(const DevMem2D src, double* minval, double* maxval,
                                      int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf);

    template <typename T>
    void min_max_loc_mask_multipass_caller(const DevMem2D src, const PtrStep mask, double* minval, double* maxval,
                                           int minloc[2], int maxloc[2], PtrStep valbuf, PtrStep locbuf);
}}}}

void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc, const GpuMat& mask)
{
    GpuMat valbuf, locbuf;
    minMaxLoc(src, minVal, maxVal, minLoc, maxLoc, mask, valbuf, locbuf);
}

void cv::gpu::minMaxLoc(const GpuMat& src, double* minVal, double* maxVal, Point* minLoc, Point* maxLoc,
                        const GpuMat& mask, GpuMat& valbuf, GpuMat& locbuf)
{
    using namespace mathfunc::minmaxloc;

    typedef void (*Caller)(const DevMem2D, double*, double*, int[2], int[2], PtrStep, PtrStep);
    typedef void (*MaskedCaller)(const DevMem2D, const PtrStep, double*, double*, int[2], int[2], PtrStep, PtrStep);

    static const Caller callers[2][7] =
        { { min_max_loc_multipass_caller<unsigned char>, min_max_loc_multipass_caller<char>,
            min_max_loc_multipass_caller<unsigned short>, min_max_loc_multipass_caller<short>,
            min_max_loc_multipass_caller<int>, min_max_loc_multipass_caller<float>, 0 },
          { min_max_loc_caller<unsigned char>, min_max_loc_caller<char>,
            min_max_loc_caller<unsigned short>, min_max_loc_caller<short>,
            min_max_loc_caller<int>, min_max_loc_caller<float>, min_max_loc_caller<double> } };

    static const MaskedCaller masked_callers[2][7] =
        { { min_max_loc_mask_multipass_caller<unsigned char>, min_max_loc_mask_multipass_caller<char>,
            min_max_loc_mask_multipass_caller<unsigned short>, min_max_loc_mask_multipass_caller<short>,
            min_max_loc_mask_multipass_caller<int>, min_max_loc_mask_multipass_caller<float>, 0 },
          { min_max_loc_mask_caller<unsigned char>, min_max_loc_mask_caller<char>,
            min_max_loc_mask_caller<unsigned short>, min_max_loc_mask_caller<short>,
            min_max_loc_mask_caller<int>, min_max_loc_mask_caller<float>, min_max_loc_mask_caller<double> } };

    CV_Assert(src.channels() == 1);
    CV_Assert(mask.empty() || (mask.type() == CV_8U && src.size() == mask.size()));
    CV_Assert(src.type() != CV_64F || hasNativeDoubleSupport(getDevice()));

    double minVal_; if (!minVal) minVal = &minVal_;
    double maxVal_; if (!maxVal) maxVal = &maxVal_;
    int minLoc_[2];
    int maxLoc_[2];

    Size valbuf_size, locbuf_size;
    get_buf_size_required(src.cols, src.rows, src.elemSize(), valbuf_size.width, valbuf_size.height,
                          locbuf_size.width, locbuf_size.height);
    valbuf.create(valbuf_size, CV_8U);
    locbuf.create(locbuf_size, CV_8U);

    if (mask.empty())
    {
        Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];
        if (!caller) CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type");
        caller(src, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf);
    }
    else
    {
        MaskedCaller caller = masked_callers[hasAtomicsSupport(getDevice())][src.type()];
        if (!caller) CV_Error(CV_StsBadArg, "minMaxLoc: unsupported type");
        caller(src, mask, minVal, maxVal, minLoc_, maxLoc_, valbuf, locbuf);
    }

    if (minLoc) { minLoc->x = minLoc_[0]; minLoc->y = minLoc_[1]; }
    if (maxLoc) { maxLoc->x = maxLoc_[0]; maxLoc->y = maxLoc_[1]; }
}
//////////////////////////////////////////////////////////////////////////////
// Count non-zero elements
namespace cv { namespace gpu { namespace mathfunc { namespace countnonzero
{
    void get_buf_size_required(int cols, int rows, int& bufcols, int& bufrows);

    template <typename T>
    int count_non_zero_caller(const DevMem2D src, PtrStep buf);

    template <typename T>
    int count_non_zero_multipass_caller(const DevMem2D src, PtrStep buf);
}}}}

int cv::gpu::countNonZero(const GpuMat& src)
{
    GpuMat buf;
    return countNonZero(src, buf);
}

int cv::gpu::countNonZero(const GpuMat& src, GpuMat& buf)
{
    using namespace mathfunc::countnonzero;

    typedef int (*Caller)(const DevMem2D src, PtrStep buf);
    static const Caller callers[2][7] =
        { { count_non_zero_multipass_caller<unsigned char>, count_non_zero_multipass_caller<char>,
            count_non_zero_multipass_caller<unsigned short>, count_non_zero_multipass_caller<short>,
            count_non_zero_multipass_caller<int>, count_non_zero_multipass_caller<float>, 0 },
          { count_non_zero_caller<unsigned char>, count_non_zero_caller<char>,
            count_non_zero_caller<unsigned short>, count_non_zero_caller<short>,
            count_non_zero_caller<int>, count_non_zero_caller<float>, count_non_zero_caller<double> } };

    CV_Assert(src.channels() == 1);
    CV_Assert(src.type() != CV_64F || hasNativeDoubleSupport(getDevice()));

    Size buf_size;
    get_buf_size_required(src.cols, src.rows, buf_size.width, buf_size.height);
    buf.create(buf_size, CV_8U);

    Caller caller = callers[hasAtomicsSupport(getDevice())][src.type()];
    if (!caller) CV_Error(CV_StsBadArg, "countNonZero: unsupported type");
    return caller(src, buf);
}
#endif