Commit deac5d97, authored Feb 14, 2011 by Vladislav Vinogradov
fixed errors in the gpu module on old video cards (SURF_GPU, BruteForceMatcher_GPU, min/max, setTo, convertTo)
added assertions after all kernel calls
Parent: 5f175f95
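The "assertions after all kernel calls" in the message refer to a pattern that repeats through the .cu hunks below: every kernel launch is now followed by a check of the launch status before the usual synchronization. A minimal sketch of the idiom, not taken from the diff itself; the kernel and its argument are placeholders, cudaSafeCall is the gpu module's error-checking wrapper from safe_call.hpp, and cudaThreadSynchronize is the pre-CUDA-4 name of cudaDeviceSynchronize:

    __global__ void dummy_kernel(int* data) { data[threadIdx.x] = threadIdx.x; }   // placeholder kernel

    void launch_checked(int* d_data)
    {
        dim3 threads(32, 1);
        dim3 grid(1, 1);
        dummy_kernel<<<grid, threads>>>(d_data);     // launch is asynchronous
        cudaSafeCall( cudaGetLastError() );          // catches bad launch configuration immediately
        cudaSafeCall( cudaThreadSynchronize() );     // catches errors raised while the kernel runs
    }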
Showing 31 changed files with 1363 additions and 720 deletions.
modules/gpu/include/opencv2/gpu/gpu.hpp                     +2    -2
modules/gpu/src/brute_force_matcher.cpp                     +29   -18
modules/gpu/src/cuda/brute_force_matcher.cu                 +74   -62
modules/gpu/src/cuda/color.cu                               +22   -8
modules/gpu/src/cuda/element_operations.cu                  +137  -67
modules/gpu/src/cuda/filters.cu                             +7    -186
modules/gpu/src/cuda/hog.cu                                 +17   -0
modules/gpu/src/cuda/imgproc.cu                             +25   -0
modules/gpu/src/cuda/match_template.cu                      +32   -0
modules/gpu/src/cuda/mathfunc.cu                            +2    -0
modules/gpu/src/cuda/matrix_operations.cu                   +274  -255
modules/gpu/src/cuda/matrix_reductions.cu                   +79   -0
modules/gpu/src/cuda/split_merge.cu                         +12   -0
modules/gpu/src/cuda/stereobm.cu                            +4    -0
modules/gpu/src/cuda/stereobp.cu                            +13   -0
modules/gpu/src/cuda/stereocsbp.cu                          +15   -5
modules/gpu/src/cuda/surf.cu                                +298  -0
modules/gpu/src/cudastream.cpp                              +36   -8
modules/gpu/src/element_operations.cpp                      +59   -38
modules/gpu/src/imgproc_gpu.cpp                             +4    -0
modules/gpu/src/matrix_operations.cpp                       +24   -18
modules/gpu/src/mssegmentation.cpp                          +2    -0
modules/gpu/src/opencv2/gpu/device/border_interpolate.hpp   +97   -26
modules/gpu/src/opencv2/gpu/device/transform.hpp            +4    -0
modules/gpu/src/surf.cpp                                    +20   -5
tests/gpu/src/brute_force_matcher.cpp                       +26   -9
tests/gpu/src/features2d.cpp                                +13   -4
tests/gpu/src/gputest_main.cpp                              +6    -6
tests/gpu/src/meanshift.cpp                                 +19   -0
tests/gpu/src/mssegmentation.cpp                            +8    -0
tests/gpu/src/operator_convert_to.cpp                       +3    -3
modules/gpu/include/opencv2/gpu/gpu.hpp
...
...
@@ -435,8 +435,8 @@ namespace cv
        void enqueueCopy(const GpuMat& src, GpuMat& dst);

-       void enqueueMemSet(const GpuMat& src, Scalar val);
-       void enqueueMemSet(const GpuMat& src, Scalar val, const GpuMat& mask);
+       void enqueueMemSet(GpuMat& src, Scalar val);
+       void enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask);

        // converts matrix type, ex from float to uchar depending on type
        void enqueueConvert(const GpuMat& src, GpuMat& dst, int type, double a = 1, double b = 0);
...
...
modules/gpu/src/brute_force_matcher.cpp
...
...
@@ -76,18 +76,22 @@ namespace cv { namespace gpu { namespace bfmatcher
{
    template <typename T>
    void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
-       const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
+       const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+       bool cc_12);

    template <typename T>
    void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
-       const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
+       const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+       bool cc_12);

    template <typename T>
    void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
-       const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
+       const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx,
+       const DevMem2Df& distance, bool cc_12);

    template <typename T>
    void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
-       const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
+       const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx,
+       const DevMem2Df& distance, bool cc_12);

    template <typename T>
    void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
...
...
@@ -160,17 +164,20 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs,
    using namespace cv::gpu::bfmatcher;

    typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
-       const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
+       const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+       bool cc_12);

    static const match_caller_t match_callers[2][8] =
    {
        {
-           matchSingleL1_gpu<unsigned char>, matchSingleL1_gpu<char>, matchSingleL1_gpu<unsigned short>,
-           matchSingleL1_gpu<short>, matchSingleL1_gpu<int>, matchSingleL1_gpu<float>, 0, 0
+           matchSingleL1_gpu<unsigned char>, matchSingleL1_gpu<signed char>, matchSingleL1_gpu<unsigned short>,
+           matchSingleL1_gpu<short>, matchSingleL1_gpu<int>, matchSingleL1_gpu<float>, 0, 0
        },
        {
-           matchSingleL2_gpu<unsigned char>, matchSingleL2_gpu<char>, matchSingleL2_gpu<unsigned short>,
-           matchSingleL2_gpu<short>, matchSingleL2_gpu<int>, matchSingleL2_gpu<float>, 0, 0
+           matchSingleL2_gpu<unsigned char>, matchSingleL2_gpu<signed char>, matchSingleL2_gpu<unsigned short>,
+           matchSingleL2_gpu<short>, matchSingleL2_gpu<int>, matchSingleL2_gpu<float>, 0, 0
        }
    };
...
...
@@ -185,9 +192,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs,
    match_caller_t func = match_callers[distType][queryDescs.depth()];
    CV_Assert(func != 0);

+   bool cc_12 = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12);
+
    // For single train there is no need to save imgIdx, so we just save imgIdx to trainIdx.
    // trainIdx store after imgIdx, so we doesn't lose it value.
-   func(queryDescs, trainDescs, mask, trainIdx, trainIdx, distance);
+   func(queryDescs, trainDescs, mask, trainIdx, trainIdx, distance, cc_12);
}

void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& distance,
...
...
@@ -284,17 +293,17 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDes
    typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
        const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx,
-       const DevMem2Df& distance);
+       const DevMem2Df& distance, bool cc_12);

    static const match_caller_t match_callers[2][8] =
    {
        {
-           matchCollectionL1_gpu<unsigned char>, matchCollectionL1_gpu<char>,
+           matchCollectionL1_gpu<unsigned char>, matchCollectionL1_gpu<signed char>,
            matchCollectionL1_gpu<unsigned short>, matchCollectionL1_gpu<short>, matchCollectionL1_gpu<int>, matchCollectionL1_gpu<float>, 0, 0
        },
        {
-           matchCollectionL2_gpu<unsigned char>, matchCollectionL2_gpu<char>,
+           matchCollectionL2_gpu<unsigned char>, matchCollectionL2_gpu<signed char>,
            matchCollectionL2_gpu<unsigned short>, matchCollectionL2_gpu<short>, matchCollectionL2_gpu<int>, matchCollectionL2_gpu<float>, 0, 0
        }
...
...
@@ -311,7 +320,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDes
    match_caller_t func = match_callers[distType][queryDescs.depth()];
    CV_Assert(func != 0);

-   func(queryDescs, trainCollection, maskCollection, trainIdx, imgIdx, distance);
+   bool cc_12 = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12);
+
+   func(queryDescs, trainCollection, maskCollection, trainIdx, imgIdx, distance, cc_12);
}

void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx,
...
@@ -383,11 +394,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con
    static const match_caller_t match_callers[2][8] =
    {
        {
-           knnMatchL1_gpu<unsigned char>, knnMatchL1_gpu<char>, knnMatchL1_gpu<unsigned short>,
+           knnMatchL1_gpu<unsigned char>, knnMatchL1_gpu<signed char>, knnMatchL1_gpu<unsigned short>,
            knnMatchL1_gpu<short>, knnMatchL1_gpu<int>, knnMatchL1_gpu<float>, 0, 0
        },
        {
-           knnMatchL2_gpu<unsigned char>, knnMatchL2_gpu<char>, knnMatchL2_gpu<unsigned short>,
+           knnMatchL2_gpu<unsigned char>, knnMatchL2_gpu<signed char>, knnMatchL2_gpu<unsigned short>,
            knnMatchL2_gpu<short>, knnMatchL2_gpu<int>, knnMatchL2_gpu<float>, 0, 0
        }
    };
...
...
@@ -522,11 +533,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs,
    static const radiusMatch_caller_t radiusMatch_callers[2][8] =
    {
        {
-           radiusMatchL1_gpu<unsigned char>, radiusMatchL1_gpu<char>, radiusMatchL1_gpu<unsigned short>,
+           radiusMatchL1_gpu<unsigned char>, radiusMatchL1_gpu<signed char>, radiusMatchL1_gpu<unsigned short>,
            radiusMatchL1_gpu<short>, radiusMatchL1_gpu<int>, radiusMatchL1_gpu<float>, 0, 0
        },
        {
-           radiusMatchL2_gpu<unsigned char>, radiusMatchL2_gpu<char>, radiusMatchL2_gpu<unsigned short>,
+           radiusMatchL2_gpu<unsigned char>, radiusMatchL2_gpu<signed char>, radiusMatchL2_gpu<unsigned short>,
            radiusMatchL2_gpu<short>, radiusMatchL2_gpu<int>, radiusMatchL2_gpu<float>, 0, 0
        }
    };
...
...
modules/gpu/src/cuda/brute_force_matcher.cu
...
...
@@ -555,6 +555,7 @@ namespace cv { namespace gpu { namespace bfmatcher
match<BLOCK_DIM_X, BLOCK_DIM_Y, ReduceDescCalculatorSimple<BLOCK_DIM_X, T>, Dist, T>
<<<grid, threads>>>(queryDescs, train, mask, trainIdx.data,
imgIdx.data, distance.data);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
}
...
...
@@ -575,6 +576,7 @@ namespace cv { namespace gpu { namespace bfmatcher
Dist, T>
<<<grid, threads>>>(queryDescs, train, mask, trainIdx.data,
imgIdx.data, distance.data);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
}
...
...
@@ -584,7 +586,8 @@ namespace cv { namespace gpu { namespace bfmatcher
template <typename Dist, typename T, typename Train, typename Mask>
void match_chooser(const DevMem2D_<T>& queryDescs, const Train& train,
const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)
const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
bool cc_12)
{
if (queryDescs.cols < 64)
matchCached_caller<16, 16, 64, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
...
...
@@ -596,7 +599,7 @@ namespace cv { namespace gpu { namespace bfmatcher
matchCached_caller<16, 16, 128, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
else if (queryDescs.cols < 256)
matchCached_caller<16, 16, 256, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
else if (queryDescs.cols == 256)
else if (queryDescs.cols == 256 && cc_12)
matchCached_caller<16, 16, 256, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
else
matchSimple_caller<16, 16, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
...
...
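The cc_12 flag that now gates the 256-column cached path above is computed once on the host side, as the brute_force_matcher.cpp hunks earlier show. A minimal restatement of that check with comments (TargetArchs, DeviceInfo and COMPUTE_12 are the cv::gpu names used in this commit; the interpretation in the last comment is an inference, not a statement from the commit):

    bool cc_12 = TargetArchs::builtWith(COMPUTE_12)      // code for compute capability >= 1.2 was compiled in
              && DeviceInfo().supports(COMPUTE_12);      // and the current device reports >= 1.2
    // When either test fails, match_chooser falls through to matchSimple_caller instead of the
    // 256-descriptor cached kernel, which appears to be the BruteForceMatcher_GPU failure on old cards.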
@@ -606,95 +609,99 @@ namespace cv { namespace gpu { namespace bfmatcher
template <typename T>
void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
bool cc_12)
{
SingleTrain<T> train((DevMem2D_<T>)trainDescs);
if (mask.data)
{
SingleMask m(mask);
match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance);
match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12);
}
else
{
match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance);
match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
}
}
template void matchSingleL1_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchSingleL1_gpu<char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchSingleL1_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchSingleL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchSingleL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchSingleL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchSingleL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchSingleL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchSingleL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchSingleL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchSingleL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchSingleL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template <typename T>
void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
bool cc_12)
{
SingleTrain<T> train((DevMem2D_<T>)trainDescs);
if (mask.data)
{
SingleMask m(mask);
match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance);
match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance, cc_12);
}
else
{
match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance);
match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
}
}
template void matchSingleL2_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchSingleL2_gpu<char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchSingleL2_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchSingleL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchSingleL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchSingleL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchSingleL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchSingleL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchSingleL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchSingleL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchSingleL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchSingleL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template <typename T>
void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)
const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx,
const DevMem2Df& distance, bool cc_12)
{
TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, queryDescs.cols);
if (maskCollection.data)
{
MaskCollection mask(maskCollection.data);
match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance);
match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12);
}
else
{
match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance);
match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
}
}
template void matchCollectionL1_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchCollectionL1_gpu<char >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchCollectionL1_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchCollectionL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchCollectionL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchCollectionL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchCollectionL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchCollectionL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchCollectionL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchCollectionL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchCollectionL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchCollectionL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template <typename T>
void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)
const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx,
const DevMem2Df& distance, bool cc_12)
{
TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, queryDescs.cols);
if (maskCollection.data)
{
MaskCollection mask(maskCollection.data);
match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance);
match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance, cc_12);
}
else
{
match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance);
match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance, cc_12);
}
}
template void matchCollectionL2_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchCollectionL2_gpu<char >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchCollectionL2_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchCollectionL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchCollectionL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchCollectionL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchCollectionL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchCollectionL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchCollectionL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchCollectionL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchCollectionL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
template void matchCollectionL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, bool cc_12);
///////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////// Knn Match ////////////////////////////////////
...
...
@@ -748,6 +755,7 @@ namespace cv { namespace gpu { namespace bfmatcher
calcDistance<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads>>>(
queryDescs, trainDescs, mask, distance);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
}
...
...
@@ -923,7 +931,10 @@ namespace cv { namespace gpu { namespace bfmatcher
dim3 grid(trainIdx.rows, 1, 1);
for (int i = 0; i < knn; ++i)
{
findBestMatch<BLOCK_SIZE><<<grid, threads>>>(allDist, i, trainIdx, distance);
cudaSafeCall( cudaGetLastError() );
}
cudaSafeCall( cudaThreadSynchronize() );
}
...
...
@@ -949,12 +960,12 @@ namespace cv { namespace gpu { namespace bfmatcher
findKnnMatch_caller<256>(knn, trainIdx, distance, allDist);
}
template void knnMatchL1_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL1_gpu<char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL1_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template <typename T>
void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
...
...
@@ -974,12 +985,12 @@ namespace cv { namespace gpu { namespace bfmatcher
findKnnMatch_caller<256>(knn, trainIdx, distance, allDist);
}
template void knnMatchL2_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL2_gpu<char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL2_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
///////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////// Radius Match //////////////////////////////////
...
...
@@ -1044,6 +1055,7 @@ namespace cv { namespace gpu { namespace bfmatcher
radiusMatch<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads>>>(
queryDescs, trainDescs, maxDistance, mask, trainIdx, nMatches, distance);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
}
...
...
@@ -1067,12 +1079,12 @@ namespace cv { namespace gpu { namespace bfmatcher
}
}
template void radiusMatchL1_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL1_gpu<char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL1_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template <typename T>
void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
...
...
@@ -1090,10 +1102,10 @@ namespace cv { namespace gpu { namespace bfmatcher
}
}
template void radiusMatchL2_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL2_gpu<char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL2_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
}}}
modules/gpu/src/cuda/color.cu
...
...
@@ -43,6 +43,7 @@
#include "internal_shared.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vecmath.hpp"
#include "opencv2/gpu/device/limits_gpu.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
...
...
@@ -51,13 +52,9 @@ using namespace cv::gpu::device;
#define CV_DESCALE(x, n) (((x) + (1 << ((n)-1))) >> (n))
#endif
#ifndef FLT_EPSILON
#define FLT_EPSILON 1.192092896e-07F
#endif
namespace cv { namespace gpu { namespace color
{
template<typename T> struct ColorChannel
{};
template<typename T> struct ColorChannel;
template<> struct ColorChannel<uchar>
{
typedef float worktype_f;
...
...
@@ -133,6 +130,7 @@ namespace cv { namespace gpu { namespace color
RGB2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols, bidx);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -276,6 +274,7 @@ namespace cv { namespace gpu { namespace color
RGB5x52RGB<GREEN_BITS, DSTCN><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols, bidx);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -304,6 +303,7 @@ namespace cv { namespace gpu { namespace color
RGB2RGB5x5<SRCCN, GREEN_BITS><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols, bidx);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -385,6 +385,7 @@ namespace cv { namespace gpu { namespace color
Gray2RGB<DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -425,6 +426,7 @@ namespace cv { namespace gpu { namespace color
Gray2RGB5x5<GREEN_BITS><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -533,6 +535,7 @@ namespace cv { namespace gpu { namespace color
RGB2Gray<SRCCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols, bidx);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -573,6 +576,7 @@ namespace cv { namespace gpu { namespace color
RGB5x52Gray<GREEN_BITS><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -698,6 +702,7 @@ namespace cv { namespace gpu { namespace color
RGB2YCrCb<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols, bidx);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -756,6 +761,7 @@ namespace cv { namespace gpu { namespace color
YCrCb2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols, bidx);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -902,6 +908,7 @@ namespace cv { namespace gpu { namespace color
RGB2XYZ<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -960,6 +967,7 @@ namespace cv { namespace gpu { namespace color
XYZ2RGB<SRCCN, DSTCN, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -1063,8 +1071,8 @@ namespace cv { namespace gpu { namespace color
vmin = fmin(vmin, b);
diff = v - vmin;
s = diff / (float)(fabs(v) + FLT_EPSILON);
diff = (float)(60. / (diff + FLT_EPSILON));
s = diff / (float)(fabs(v) + numeric_limits_gpu<float>::epsilon());
diff = (float)(60. / (diff + numeric_limits_gpu<float>::epsilon()));
if (v == r)
h = (g - b) * diff;
...
...
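In the hunks above, the FLT_EPSILON macro that color.cu used to define for itself (removed near the top of this file) is replaced by numeric_limits_gpu<float>::epsilon() from the newly included opencv2/gpu/device/limits_gpu.hpp, a device-side stand-in for std::numeric_limits. A minimal sketch of the same guard as a reusable helper; the helper name is hypothetical and it assumes `using namespace cv::gpu::device;` as at the top of this file:

    __device__ float safe_reciprocal(float x)
    {
        // add a float epsilon before dividing so x == 0 does not produce inf/NaN
        return 1.f / (x + numeric_limits_gpu<float>::epsilon());
    }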
@@ -1199,6 +1207,8 @@ namespace cv { namespace gpu { namespace color
RGB2HSV<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols, bidx);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
}
...
...
@@ -1281,6 +1291,8 @@ namespace cv { namespace gpu { namespace color
HSV2RGB<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols, bidx);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
}
...
...
@@ -1342,7 +1354,7 @@ namespace cv { namespace gpu { namespace color
diff = vmax - vmin;
l = (vmax + vmin) * 0.5f;
if (diff > FLT_EPSILON)
if (diff > numeric_limits_gpu<float>::epsilon())
{
s = l < 0.5f ? diff / (vmax + vmin) : diff / (2.0f - vmax - vmin);
diff = 60.f / diff;
...
...
@@ -1550,6 +1562,8 @@ namespace cv { namespace gpu { namespace color
HLS2RGB<SRCCN, DSTCN, 255, T><<<grid, threads, 0, stream>>>(src.data, src.step,
dst.data, dst.step, src.rows, src.cols, bidx);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
}
...
...
modules/gpu/src/cuda/element_operations.cu
...
...
@@ -130,6 +130,7 @@ namespace cv { namespace gpu { namespace mathfunc
divUp(rows, threads.y));
bitwiseUnOpKernel<opid><<<grid, threads>>>(rows, width, src, dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaThreadSynchronize());
...
...
@@ -161,6 +162,7 @@ namespace cv { namespace gpu { namespace mathfunc
dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
bitwiseUnOpKernel<T, opid><<<grid, threads>>>(rows, cols, cn, src, mask, dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaThreadSynchronize());
...
...
@@ -251,6 +253,7 @@ namespace cv { namespace gpu { namespace mathfunc
dim3 grid(divUp(width, threads.x * sizeof(uint)), divUp(rows, threads.y));
bitwiseBinOpKernel<opid><<<grid, threads>>>(rows, width, src1, src2, dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaThreadSynchronize());
...
...
@@ -283,7 +286,8 @@ namespace cv { namespace gpu { namespace mathfunc
dim3 threads(16, 16);
dim3 grid(divUp(cols, threads.x), divUp(rows, threads.y));
bitwiseBinOpKernel<T, opid><<<grid, threads>>>(rows, cols, cn, src1, src2, mask, dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaThreadSynchronize());
...
...
@@ -384,29 +388,71 @@ namespace cv { namespace gpu { namespace mathfunc
}
};
struct ScalarMinOp
template <typename T> struct ScalarMinOp
{
T s;
explicit ScalarMinOp(T s_) : s(s_) {}
__device__ T operator()(T a)
{
return min(a, s);
}
};
template <> struct ScalarMinOp<float>
{
float s;
explicit ScalarMinOp(float s_) : s(s_) {}
__device__ float operator()(float a)
{
return fmin(a, s);
}
};
template <> struct ScalarMinOp<double>
{
double s;
explicit ScalarMinOp(double s_) : s(s_) {}
template <typename T>
__device__ T operator()(T a)
__device__ double operator()(double a)
{
return saturate_cast<T>(fmin((double)a, s));
return fmin(a, s);
}
};
struct ScalarMaxOp
template <typename T> struct ScalarMaxOp
{
T s;
explicit ScalarMaxOp(T s_) : s(s_) {}
__device__ T operator()(T a)
{
return max(a, s);
}
};
template <> struct ScalarMaxOp<float>
{
float s;
explicit ScalarMaxOp(float s_) : s(s_) {}
__device__ float operator()(float a)
{
return fmax(a, s);
}
};
template <> struct ScalarMaxOp<double>
{
double s;
explicit ScalarMaxOp(double s_) : s(s_) {}
template <typename T>
__device__ T operator()(T a)
__device__ double operator()(double a)
{
return saturate_cast<T>(fmax((double)a, s));
return fmax(a, s);
}
};
...
...
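The hunk above replaces the single scalar min/max functor, whose operator went through saturate_cast<T>(fmin((double)a, s)), with a functor templated on the element type plus float and double specializations: integer types now use the integer min/max overloads and only the floating-point specializations call fmin/fmax. On devices below compute capability 1.3, which lack native double support, the unconditional double round-trip is a likely source of the min/max failures named in the commit message. A small usage sketch of the new functor with the transform helper used throughout this file; the wrapper function name is hypothetical:

    void clamp_above(const DevMem2D_<float>& src, const DevMem2D_<float>& dst,
                     float bound, cudaStream_t stream)
    {
        ScalarMinOp<float> op(bound);      // float specialization: fmin, no double math
        transform(src, dst, op, stream);   // element-wise apply, the same call min_gpu makes below
    }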
@@ -418,7 +464,7 @@ namespace cv { namespace gpu { namespace mathfunc
}
template void min_gpu<uchar >(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);
template void min_gpu<char >(const DevMem2D_<char>& src1, const DevMem2D_<char>& src2, const DevMem2D_<char>& dst, cudaStream_t stream);
template void min_gpu<schar >(const DevMem2D_<schar>& src1, const DevMem2D_<schar>& src2, const DevMem2D_<schar>& dst, cudaStream_t stream);
template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, const DevMem2D_<ushort>& src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
template void min_gpu<short >(const DevMem2D_<short>& src1, const DevMem2D_<short>& src2, const DevMem2D_<short>& dst, cudaStream_t stream);
template void min_gpu<int >(const DevMem2D_<int>& src1, const DevMem2D_<int>& src2, const DevMem2D_<int>& dst, cudaStream_t stream);
...
...
@@ -433,7 +479,7 @@ namespace cv { namespace gpu { namespace mathfunc
}
template void max_gpu<uchar >(const DevMem2D& src1, const DevMem2D& src2, const DevMem2D& dst, cudaStream_t stream);
template void max_gpu<char >(const DevMem2D_<char>& src1, const DevMem2D_<char>& src2, const DevMem2D_<char>& dst, cudaStream_t stream);
template void max_gpu<schar >(const DevMem2D_<schar>& src1, const DevMem2D_<schar>& src2, const DevMem2D_<schar>& dst, cudaStream_t stream);
template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, const DevMem2D_<ushort>& src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
template void max_gpu<short >(const DevMem2D_<short>& src1, const DevMem2D_<short>& src2, const DevMem2D_<short>& dst, cudaStream_t stream);
template void max_gpu<int >(const DevMem2D_<int>& src1, const DevMem2D_<int>& src2, const DevMem2D_<int>& dst, cudaStream_t stream);
...
...
@@ -441,122 +487,145 @@ namespace cv { namespace gpu { namespace mathfunc
template void max_gpu<double>(const DevMem2D_<double>& src1, const DevMem2D_<double>& src2, const DevMem2D_<double>& dst, cudaStream_t stream);
template <typename T>
void min_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream)
void min_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream)
{
ScalarMinOp op(src2);
ScalarMinOp<T> op(src2);
transform(src1, dst, op, stream);
}
template void min_gpu<uchar >(const DevMem2D& src1, double src2, const DevMem2D& dst, cudaStream_t stream);
template void min_gpu<char >(const DevMem2D_<char>& src1, double src2, const DevMem2D_<char>& dst, cudaStream_t stream);
template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, double src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
template void min_gpu<short >(const DevMem2D_<short>& src1, double src2, const DevMem2D_<short>& dst, cudaStream_t stream);
template void min_gpu<int >(const DevMem2D_<int>& src1, double src2, const DevMem2D_<int>& dst, cudaStream_t stream);
template void min_gpu<float >(const DevMem2D_<float>& src1, double src2, const DevMem2D_<float>& dst, cudaStream_t stream);
template void min_gpu<uchar >(const DevMem2D& src1, uchar src2, const DevMem2D& dst, cudaStream_t stream);
template void min_gpu<schar >(const DevMem2D_<schar>& src1, schar src2, const DevMem2D_<schar>& dst, cudaStream_t stream);
template void min_gpu<ushort>(const DevMem2D_<ushort>& src1, ushort src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
template void min_gpu<short >(const DevMem2D_<short>& src1, short src2, const DevMem2D_<short>& dst, cudaStream_t stream);
template void min_gpu<int >(const DevMem2D_<int>& src1, int src2, const DevMem2D_<int>& dst, cudaStream_t stream);
template void min_gpu<float >(const DevMem2D_<float>& src1, float src2, const DevMem2D_<float>& dst, cudaStream_t stream);
template void min_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);
template <typename T>
void max_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream)
void max_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream)
{
ScalarMaxOp op(src2);
ScalarMaxOp<T> op(src2);
transform(src1, dst, op, stream);
}
template void max_gpu<uchar >(const DevMem2D& src1, double src2, const DevMem2D& dst, cudaStream_t stream);
template void max_gpu<char >(const DevMem2D_<char>& src1, double src2, const DevMem2D_<char>& dst, cudaStream_t stream);
template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, double src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
template void max_gpu<short >(const DevMem2D_<short>& src1, double src2, const DevMem2D_<short>& dst, cudaStream_t stream);
template void max_gpu<int >(const DevMem2D_<int>& src1, double src2, const DevMem2D_<int>& dst, cudaStream_t stream);
template void max_gpu<float >(const DevMem2D_<float>& src1, double src2, const DevMem2D_<float>& dst, cudaStream_t stream);
template void max_gpu<uchar >(const DevMem2D& src1, uchar src2, const DevMem2D& dst, cudaStream_t stream);
template void max_gpu<schar >(const DevMem2D_<schar>& src1, schar src2, const DevMem2D_<schar>& dst, cudaStream_t stream);
template void max_gpu<ushort>(const DevMem2D_<ushort>& src1, ushort src2, const DevMem2D_<ushort>& dst, cudaStream_t stream);
template void max_gpu<short >(const DevMem2D_<short>& src1, short src2, const DevMem2D_<short>& dst, cudaStream_t stream);
template void max_gpu<int >(const DevMem2D_<int>& src1, int src2, const DevMem2D_<int>& dst, cudaStream_t stream);
template void max_gpu<float >(const DevMem2D_<float>& src1, float src2, const DevMem2D_<float>& dst, cudaStream_t stream);
template void max_gpu<double>(const DevMem2D_<double>& src1, double src2, const DevMem2D_<double>& dst, cudaStream_t stream);
//////////////////////////////////////////////////////////////////////////
// threshold
class ThreshOp
template <typename T> struct ThreshBinary
{
public:
ThreshOp(float thresh_, float maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
ThreshBinary(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
protected:
float thresh;
float maxVal;
__device__ T operator()(const T& src) const
{
return src > thresh ? maxVal : 0;
}
private:
T thresh;
T maxVal;
};
class ThreshBinary : public ThreshOp
template <typename T> struct ThreshBinaryInv
{
public:
ThreshBinary(float thresh_, float maxVal_) : ThreshOp(thresh_, maxVal_) {}
ThreshBinaryInv(T thresh_, T maxVal_) : thresh(thresh_), maxVal(maxVal_) {}
template<typename T>
__device__ T operator()(const T& src) const
{
return (float)src > thresh ? saturate_cast<T>(maxVal) : 0;
return src > thresh ? 0 : maxVal;
}
private:
T thresh;
T maxVal;
};
class ThreshBinaryInv : public ThreshOp
template <typename T> struct ThreshTrunc
{
public:
ThreshBinaryInv(float thresh_, float maxVal_) : ThreshOp(thresh_, maxVal_) {}
ThreshTrunc(T thresh_, T) : thresh(thresh_) {}
template<typename T>
__device__ T operator()(const T& src) const
{
return (float)src > thresh ? 0 : saturate_cast<T>(maxVal);
return min(src, thresh);
}
private:
T thresh;
};
template <> struct ThreshTrunc<float>
{
ThreshTrunc(float thresh_, float) : thresh(thresh_) {}
__device__ float operator()(const float& src) const
{
return fmin(src, thresh);
}
class ThreshTrunc : public ThreshOp
private:
float thresh;
};
template <> struct ThreshTrunc<double>
{
public:
ThreshTrunc(float thresh_, float maxVal_) : ThreshOp(thresh_, maxVal_) {}
ThreshTrunc(double thresh_, double) : thresh(thresh_) {}
template<typename T>
__device__ T operator()(const T& src) const
__device__ double operator()(const double& src) const
{
return saturate_cast<T>(fmin((float)src, thresh));
return fmin(src, thresh);
}
private:
double thresh;
};
class ThreshToZero : public ThreshOp
template <typename T> struct ThreshToZero
{
public:
ThreshToZero(float thresh_, float maxVal_) : ThreshOp(thresh_, maxVal_) {}
ThreshToZero(T thresh_, T) : thresh(thresh_) {}
template<typename T>
__device__ T operator()(const T& src) const
{
return (float)src > thresh ? src : 0;
return src > thresh ? src : 0;
}
private:
T thresh;
};
class ThreshToZeroInv : public ThreshOp
template <typename T> struct ThreshToZeroInv
{
public:
ThreshToZeroInv(float thresh_, float maxVal_) : ThreshOp(thresh_, maxVal_) {}
ThreshToZeroInv(T thresh_, T) : thresh(thresh_) {}
template<typename T>
__device__ T operator()(const T& src) const
{
return (float)src > thresh ? 0 : src;
return src > thresh ? 0 : src;
}
private:
T thresh;
};
template <class Op, typename T>
void threshold_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, float thresh, float maxVal,
template <template <typename> class Op, typename T>
void threshold_caller(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, T thresh, T maxVal,
cudaStream_t stream)
{
Op op(thresh, maxVal);
Op<T> op(thresh, maxVal);
transform(src, dst, op, stream);
}
template <typename T>
void threshold_gpu(const DevMem2D& src, const DevMem2D& dst, float thresh, float maxVal, int type,
void threshold_gpu(const DevMem2D& src, const DevMem2D& dst, T thresh, T maxVal, int type,
cudaStream_t stream)
{
typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, float thresh, float maxVal,
typedef void (*caller_t)(const DevMem2D_<T>& src, const DevMem2D_<T>& dst, T thresh, T maxVal,
cudaStream_t stream);
static const caller_t callers[] =
...
...
@@ -571,10 +640,11 @@ namespace cv { namespace gpu { namespace mathfunc
callers[type]((DevMem2D_<T>)src, (DevMem2D_<T>)dst, thresh, maxVal, stream);
}
template void threshold_gpu<uchar>(const DevMem2D& src, const DevMem2D& dst, float thresh, float maxVal, int type, cudaStream_t stream);
template void threshold_gpu<schar>(const DevMem2D& src, const DevMem2D& dst, float thresh, float maxVal, int type, cudaStream_t stream);
template void threshold_gpu<ushort>(const DevMem2D& src, const DevMem2D& dst, float thresh, float maxVal, int type, cudaStream_t stream);
template void threshold_gpu<short>(const DevMem2D& src, const DevMem2D& dst, float thresh, float maxVal, int type, cudaStream_t stream);
template void threshold_gpu<int>(const DevMem2D& src, const DevMem2D& dst, float thresh, float maxVal, int type, cudaStream_t stream);
template void threshold_gpu<uchar>(const DevMem2D& src, const DevMem2D& dst, uchar thresh, uchar maxVal, int type, cudaStream_t stream);
template void threshold_gpu<schar>(const DevMem2D& src, const DevMem2D& dst, schar thresh, schar maxVal, int type, cudaStream_t stream);
template void threshold_gpu<ushort>(const DevMem2D& src, const DevMem2D& dst, ushort thresh, ushort maxVal, int type, cudaStream_t stream);
template void threshold_gpu<short>(const DevMem2D& src, const DevMem2D& dst, short thresh, short maxVal, int type, cudaStream_t stream);
template void threshold_gpu<int>(const DevMem2D& src, const DevMem2D& dst, int thresh, int maxVal, int type, cudaStream_t stream);
template void threshold_gpu<float>(const DevMem2D& src, const DevMem2D& dst, float thresh, float maxVal, int type, cudaStream_t stream);
template void threshold_gpu<double>(const DevMem2D& src, const DevMem2D& dst, double thresh, double maxVal, int type, cudaStream_t stream);
}}}
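For reference, the dispatch above pairs a per-type functor with a generic transform: threshold_caller instantiates Op<T> and hands it to transform(), so every depth gets its own compiled kernel instead of a runtime branch. A host-only sketch of that pattern follows; transform_cpu and ThreshTruncSketch are hypothetical stand-ins, not OpenCV code.

#include <algorithm>
#include <cstdio>
#include <vector>

// Simplified host analogue of the ThreshTrunc functor above.
template <typename T> struct ThreshTruncSketch
{
    explicit ThreshTruncSketch(T thresh_) : thresh(thresh_) {}
    T operator()(const T& src) const { return std::min(src, thresh); }
    T thresh;
};

// Hypothetical CPU stand-in for the device-side transform() helper.
template <typename T, typename Op>
void transform_cpu(const std::vector<T>& src, std::vector<T>& dst, Op op)
{
    dst.resize(src.size());
    for (size_t i = 0; i < src.size(); ++i)
        dst[i] = op(src[i]);
}

int main()
{
    std::vector<unsigned char> src, dst;
    src.push_back(10); src.push_back(120); src.push_back(250);
    transform_cpu(src, dst, ThreshTruncSketch<unsigned char>(100));
    std::printf("%d %d %d\n", dst[0], dst[1], dst[2]); // prints: 10 100 100
    return 0;
}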
modules/gpu/src/cuda/filters.cu
View file @
deac5d97
...
...
@@ -44,6 +44,7 @@
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vecmath.hpp"
#include "opencv2/gpu/device/limits_gpu.hpp"
#include "opencv2/gpu/device/border_interpolate.hpp"
#include "safe_call.hpp"
#include "internal_shared.hpp"
...
...
@@ -51,192 +52,6 @@
using namespace cv::gpu;
using namespace cv::gpu::device;
namespace cv
{
namespace gpu
{
namespace device
{
struct BrdReflect101
{
explicit BrdReflect101(int len): last(len - 1) {}
__device__ int idx_low(int i) const
{
return abs(i);
}
__device__ int idx_high(int i) const
{
return last - abs(last - i);
}
__device__ int idx(int i) const
{
return abs(idx_high(i));
}
bool is_range_safe(int mini, int maxi) const
{
return -last <= mini && maxi <= 2 * last;
}
int last;
};
template <typename D>
struct BrdRowReflect101: BrdReflect101
{
explicit BrdRowReflect101(int len): BrdReflect101(len) {}
template <typename T>
__device__ D at_low(int i, const T* data) const
{
return saturate_cast<D>(data[idx_low(i)]);
}
template <typename T>
__device__ D at_high(int i, const T* data) const
{
return saturate_cast<D>(data[idx_high(i)]);
}
};
template <typename D>
struct BrdColReflect101: BrdReflect101
{
BrdColReflect101(int len, int step): BrdReflect101(len), step(step) {}
template <typename T>
__device__ D at_low(int i, const T* data) const
{
return saturate_cast<D>(data[idx_low(i) * step]);
}
template <typename T>
__device__ D at_high(int i, const T* data) const
{
return saturate_cast<D>(data[idx_high(i) * step]);
}
int step;
};
struct BrdReplicate
{
explicit BrdReplicate(int len): last(len - 1) {}
__device__ int idx_low(int i) const
{
return max(i, 0);
}
__device__ int idx_high(int i) const
{
return min(i, last);
}
__device__ int idx(int i) const
{
return max(min(i, last), 0);
}
bool is_range_safe(int mini, int maxi) const
{
return true;
}
int last;
};
template <typename D>
struct BrdRowReplicate: BrdReplicate
{
explicit BrdRowReplicate(int len): BrdReplicate(len) {}
template <typename T>
__device__ D at_low(int i, const T* data) const
{
return saturate_cast<D>(data[idx_low(i)]);
}
template <typename T>
__device__ D at_high(int i, const T* data) const
{
return saturate_cast<D>(data[idx_high(i)]);
}
};
template <typename D>
struct BrdColReplicate: BrdReplicate
{
BrdColReplicate(int len, int step): BrdReplicate(len), step(step) {}
template <typename T>
__device__ D at_low(int i, const T* data) const
{
return saturate_cast<D>(data[idx_low(i) * step]);
}
template <typename T>
__device__ D at_high(int i, const T* data) const
{
return saturate_cast<D>(data[idx_high(i) * step]);
}
int step;
};
template <typename D>
struct BrdRowConstant
{
explicit BrdRowConstant(int len_, const D& val_ = VecTraits<D>::all(0)): len(len_), val(val_) {}
template <typename T>
__device__ D at_low(int i, const T* data) const
{
return i >= 0 ? saturate_cast<D>(data[i]) : val;
}
template <typename T>
__device__ D at_high(int i, const T* data) const
{
return i < len ? saturate_cast<D>(data[i]) : val;
}
bool is_range_safe(int mini, int maxi) const
{
return true;
}
int len;
D val;
};
template <typename D>
struct BrdColConstant
{
BrdColConstant(int len_, int step_, const D& val_ = VecTraits<D>::all(0)): len(len_), step(step_), val(val_) {}
template <typename T>
__device__ D at_low(int i, const T* data) const
{
return i >= 0 ? saturate_cast<D>(data[i * step]) : val;
}
template <typename T>
__device__ D at_high(int i, const T* data) const
{
return i < len ? saturate_cast<D>(data[i * step]) : val;
}
bool is_range_safe(int mini, int maxi) const
{
return true;
}
int len;
int step;
D val;
};
}
}
}
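The border handling structs removed above (now provided through the border_interpolate.hpp include added at the top of this file) reduce to simple index remapping rules: reflect-101 mirrors around the edge pixel without repeating it, replicate clamps to it. A host-only sketch, assuming last = len - 1 as in the structs:

#include <algorithm>
#include <cstdio>
#include <cstdlib>

inline int reflect101(int i, int last) { return std::abs(last - std::abs(last - i)); } // BrdReflect101::idx
inline int replicate(int i, int last)  { return std::max(std::min(i, last), 0); }      // BrdReplicate::idx

int main()
{
    const int last = 4; // a 5-element row, valid indices 0..4
    for (int i = -2; i <= 6; ++i)
        std::printf("i=%2d  reflect101=%d  replicate=%d\n", i, reflect101(i, last), replicate(i, last));
    return 0; // i=-2 maps to 2/0, i=5 maps to 3/4, i=6 maps to 2/4
}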
/////////////////////////////////////////////////////////////////////////////////////////////////
// Linear filters
...
...
@@ -329,6 +144,7 @@ namespace cv { namespace gpu { namespace filters
}
filter_krnls::linearRowFilter<ksize, T, D><<<grid, threads>>>(src, dst, anchor, b);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
}
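This is the pattern the commit adds after every launch: cudaGetLastError() catches launch and configuration failures right away, while the following synchronize makes errors that only occur while the kernel executes (the kind of failure seen on older cards) surface inside cudaSafeCall instead of at some later, unrelated call. A standalone sketch of the same idea; dummyKernel and CHECK are illustrative, not OpenCV code.

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define CHECK(expr) do { cudaError_t err_ = (expr); \
    if (err_ != cudaSuccess) { std::printf("CUDA error: %s\n", cudaGetErrorString(err_)); std::exit(1); } } while (0)

__global__ void dummyKernel(int* data) { data[threadIdx.x] = threadIdx.x; }

int main()
{
    int* d = 0;
    CHECK(cudaMalloc(&d, 32 * sizeof(int)));
    dummyKernel<<<1, 32>>>(d);
    CHECK(cudaGetLastError());       // launch/configuration errors are reported here
    CHECK(cudaThreadSynchronize());  // execution errors only appear once the kernel has actually run
    CHECK(cudaFree(d));
    return 0;
}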
...
...
@@ -467,6 +283,7 @@ namespace cv { namespace gpu { namespace filters
}
filter_krnls::linearColumnFilter<ksize, T, D><<<grid, threads>>>(src, dst, anchor, b);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
}
...
...
@@ -705,14 +522,18 @@ namespace cv { namespace gpu { namespace bf
for (int i = 0; i < iters; ++i)
{
bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
bf_krnls::bilateral_filter<1><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
}
break;
case 3:
for (int i = 0; i < iters; ++i)
{
bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(0, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
bf_krnls::bilateral_filter<3><<<grid, threads, 0, stream>>>(1, disp.data, disp.step/sizeof(T), img.data, img.step, disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
}
break;
default:
...
...
modules/gpu/src/cuda/hog.cu
View file @
deac5d97
...
...
@@ -222,6 +222,7 @@ void compute_hists(int nbins, int block_stride_x, int block_stride_y,
int smem = hists_size + final_hists_size;
compute_hists_kernel_many_blocks<nblocks><<<grid, threads, smem>>>(
img_block_width, grad, qangle, scale, block_hists);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
...
...
@@ -325,6 +326,8 @@ void normalize_hists(int nbins, int block_stride_x, int block_stride_y,
else
cv::gpu::error("normalize_hists: histogram's size is too big, try to decrease number of bins", __FILE__, __LINE__);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
...
...
@@ -421,6 +424,8 @@ void classify_hists(int win_height, int win_width, int block_stride_y, int block
classify_hists_kernel_many_blocks<nthreads, nblocks><<<grid, threads>>>(
img_win_width, img_block_width, win_block_stride_x, win_block_stride_y,
block_hists, coefs, free_coef, threshold, labels);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
...
...
@@ -467,6 +472,8 @@ void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, i
block_stride_x;
extract_descrs_by_rows_kernel<nthreads><<<grid, threads>>>(
img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
...
...
@@ -515,6 +522,8 @@ void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, i
block_stride_x;
extract_descrs_by_cols_kernel<nthreads><<<grid, threads>>>(
img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
...
...
@@ -640,6 +649,8 @@ void compute_gradients_8UC4(int nbins, int height, int width, const DevMem2D& im
compute_gradients_8UC4_kernel<nthreads, 0><<<gdim, bdim>>>(
height, width, img, angle_scale, grad, qangle);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
...
...
@@ -713,6 +724,8 @@ void compute_gradients_8UC1(int nbins, int height, int width, const DevMem2D& im
compute_gradients_8UC1_kernel<nthreads, 0><<<gdim, bdim>>>(
height, width, img, angle_scale, grad, qangle);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
...
...
@@ -749,6 +762,8 @@ void resize_8UC4(const DevMem2D& src, DevMem2D dst)
float sx = (float)src.cols / dst.cols;
float sy = (float)src.rows / dst.rows;
resize_8UC4_kernel<<<grid, threads>>>(sx, sy, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall(cudaUnbindTexture(resize8UC4_tex));
...
...
@@ -776,6 +791,8 @@ void resize_8UC1(const DevMem2D& src, DevMem2D dst)
float sx = (float)src.cols / dst.cols;
float sy = (float)src.rows / dst.rows;
resize_8UC1_kernel<<<grid, threads>>>(sx, sy, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall(cudaUnbindTexture(resize8UC1_tex));
...
...
modules/gpu/src/cuda/imgproc.cu
View file @
deac5d97
...
...
@@ -137,6 +137,7 @@ namespace cv { namespace gpu { namespace imgproc
cudaSafeCall( cudaBindTexture2D(0, tex_remap, src.data, desc, src.cols, src.rows, src.step) );
remap_1c<<<grid, threads>>>(xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaUnbindTexture(tex_remap) );
...
...
@@ -150,6 +151,7 @@ namespace cv { namespace gpu { namespace imgproc
grid.y = divUp(dst.rows, threads.y);
remap_3c<<<grid, threads>>>(src.data, src.step, xmap.data, ymap.data, xmap.step, dst.data, dst.step, dst.cols, dst.rows);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
}
...
...
@@ -259,6 +261,8 @@ namespace cv { namespace gpu { namespace imgproc
cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );
meanshift_kernel<<< grid, threads >>>( dst.data, dst.step, dst.cols, dst.rows, sp, sr, maxIter, eps );
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
}
...
...
@@ -273,6 +277,8 @@ namespace cv { namespace gpu { namespace imgproc
cudaSafeCall( cudaBindTexture2D( 0, tex_meanshift, src.data, desc, src.cols, src.rows, src.step ) );
meanshiftproc_kernel<<< grid, threads >>>( dstr.data, dstr.step, dstsp.data, dstsp.step, dstr.cols, dstr.rows, sp, sr, maxIter, eps );
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
cudaSafeCall( cudaUnbindTexture( tex_meanshift ) );
}
...
...
@@ -388,6 +394,7 @@ namespace cv { namespace gpu { namespace imgproc
grid.y = divUp(src.rows, threads.y);
drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step, dst.data, dst.step, src.cols, src.rows, ndisp);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
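Note that here the synchronize is conditional: the wrapper only blocks when no user stream was supplied, otherwise the launch stays asynchronous and the caller is expected to synchronize the stream itself. A minimal illustration of that convention; syncIfDefaultStream is a hypothetical helper, not OpenCV code.

#include <cuda_runtime.h>

// Mirrors the "if (stream == 0) cudaThreadSynchronize()" convention used above.
inline cudaError_t syncIfDefaultStream(cudaStream_t stream)
{
    return stream == 0 ? cudaThreadSynchronize() : cudaSuccess;
}

int main()
{
    cudaStream_t s;
    cudaStreamCreate(&s);
    // ... enqueue kernels on s here ...
    syncIfDefaultStream(s);     // non-null stream: no blocking, work stays queued
    cudaStreamSynchronize(s);   // the caller decides when to wait
    cudaStreamDestroy(s);
    syncIfDefaultStream(0);     // default stream: blocks until all preceding work completes
    return 0;
}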
@@ -401,6 +408,7 @@ namespace cv { namespace gpu { namespace imgproc
grid.y = divUp(src.rows, threads.y);
drawColorDisp<<<grid, threads, 0, stream>>>(src.data, src.step / sizeof(short), dst.data, dst.step, src.cols, src.rows, ndisp);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -451,6 +459,7 @@ namespace cv { namespace gpu { namespace imgproc
cudaSafeCall( cudaMemcpyToSymbol(cq, q, 16 * sizeof(float)) );
reprojectImageTo3D<<<grid, threads, 0, stream>>>(disp.data, disp.step / sizeof(T), xyzw.data, xyzw.step / sizeof(float), disp.rows, disp.cols);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -491,6 +500,8 @@ namespace cv { namespace gpu { namespace imgproc
dim3 grid(divUp(Dx.cols, threads.x), divUp(Dx.rows, threads.y));
extractCovData_kernel<<<grid, threads>>>(Dx.cols, Dx.rows, Dx, Dy, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
...
...
@@ -598,6 +609,8 @@ namespace cv { namespace gpu { namespace imgproc
break;
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall(cudaUnbindTexture(harrisDxTex));
cudaSafeCall(cudaUnbindTexture(harrisDyTex));
...
...
@@ -712,6 +725,8 @@ namespace cv { namespace gpu { namespace imgproc
break;
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
cudaSafeCall(cudaUnbindTexture(minEigenValDxTex));
cudaSafeCall(cudaUnbindTexture(minEigenValDyTex));
...
...
@@ -746,6 +761,8 @@ namespace cv { namespace gpu { namespace imgproc
dim3 grid(divUp(src.cols, threads.x));
column_sumKernel_32F<<<grid, threads>>>(src.cols, src.rows, src, dst);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
...
...
@@ -772,6 +789,8 @@ namespace cv { namespace gpu { namespace imgproc
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
mulSpectrumsKernel<<<grid, threads>>>(a, b, c);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
...
...
@@ -799,6 +818,8 @@ namespace cv { namespace gpu { namespace imgproc
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
mulSpectrumsKernel_CONJ<<<grid, threads>>>(a, b, c);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
...
...
@@ -827,6 +848,8 @@ namespace cv { namespace gpu { namespace imgproc
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
mulAndScaleSpectrumsKernel<<<grid, threads>>>(a, b, scale, c);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
...
...
@@ -855,6 +878,8 @@ namespace cv { namespace gpu { namespace imgproc
dim3 grid(divUp(c.cols, threads.x), divUp(c.rows, threads.y));
mulAndScaleSpectrumsKernel_CONJ<<<grid, threads>>>(a, b, scale, c);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
...
...
modules/gpu/src/cuda/match_template.cu
View file @
deac5d97
...
...
@@ -132,6 +132,8 @@ void matchTemplateNaive_CCORR_32F(const DevMem2D image, const DevMem2D templ,
templ.cols, templ.rows, image, templ, result);
break;
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
...
...
@@ -161,6 +163,8 @@ void matchTemplateNaive_CCORR_8U(const DevMem2D image, const DevMem2D templ,
templ.cols, templ.rows, image, templ, result);
break;
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
...
...
@@ -222,6 +226,8 @@ void matchTemplateNaive_SQDIFF_32F(const DevMem2D image, const DevMem2D templ,
templ.cols, templ.rows, image, templ, result);
break;
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
...
...
@@ -251,6 +257,8 @@ void matchTemplateNaive_SQDIFF_8U(const DevMem2D image, const DevMem2D templ,
templ.cols, templ.rows, image, templ, result);
break;
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
...
...
@@ -299,6 +307,8 @@ void matchTemplatePrepared_SQDIFF_8U(
w, h, image_sqsum, templ_sqsum, result);
break;
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
...
...
@@ -348,6 +358,8 @@ void matchTemplatePrepared_SQDIFF_NORMED_8U(
w, h, image_sqsum, templ_sqsum, result);
break;
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
...
...
@@ -378,6 +390,8 @@ void matchTemplatePrepared_CCOFF_8U(
dim3 grid(divUp(result.cols, threads.x), divUp(result.rows, threads.y));
matchTemplatePreparedKernel_CCOFF_8U<<<grid, threads>>>(
w, h, (float)templ_sum / (w * h), image_sum, result);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
...
...
@@ -418,6 +432,8 @@ void matchTemplatePrepared_CCOFF_8UC2(
matchTemplatePreparedKernel_CCOFF_8UC2<<<grid, threads>>>(
w, h, (float)templ_sum_r / (w * h), (float)templ_sum_g / (w * h),
image_sum_r, image_sum_g, result);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
...
...
@@ -472,6 +488,8 @@ void matchTemplatePrepared_CCOFF_8UC3(
(float)templ_sum_g / (w * h),
(float)templ_sum_b / (w * h),
image_sum_r, image_sum_g, image_sum_b, result);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
...
...
@@ -536,6 +554,8 @@ void matchTemplatePrepared_CCOFF_8UC4(
(float)templ_sum_a / (w * h),
image_sum_r, image_sum_g, image_sum_b, image_sum_a,
result);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
...
...
@@ -580,6 +600,8 @@ void matchTemplatePrepared_CCOFF_NORMED_8U(
matchTemplatePreparedKernel_CCOFF_NORMED_8U<<<grid, threads>>>(
w, h, weight, templ_sum_scale, templ_sqsum_scale,
image_sum, image_sqsum, result);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
...
...
@@ -641,6 +663,8 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC2(
image_sum_r, image_sqsum_r,
image_sum_g, image_sqsum_g,
result);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
...
...
@@ -716,6 +740,8 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC3(
image_sum_g, image_sqsum_g,
image_sum_b, image_sqsum_b,
result);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
...
...
@@ -805,6 +831,8 @@ void matchTemplatePrepared_CCOFF_NORMED_8UC4(
image_sum_b, image_sqsum_b,
image_sum_a, image_sqsum_a,
result);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
...
...
@@ -847,6 +875,8 @@ void normalize_8U(int w, int h, const DevMem2D_<unsigned long long> image_sqsum,
normalizeKernel_8U<4><<<grid, threads>>>(w, h, image_sqsum, templ_sqsum, result);
break;
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
...
...
@@ -887,6 +917,8 @@ void extractFirstChannel_32F(const DevMem2D image, DevMem2Df result, int cn)
extractFirstChannel_32F<4><<<grid, threads>>>(image, result);
break;
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
}
...
...
modules/gpu/src/cuda/mathfunc.cu
View file @
deac5d97
...
...
@@ -150,6 +150,7 @@ namespace cv { namespace gpu { namespace mathfunc
cartToPolar<Mag, Angle><<<grid, threads, 0, stream>>>(
x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(),
mag.data, mag.step/mag.elemSize(), angle.data, angle.step/angle.elemSize(), scale, x.cols, x.rows);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -198,6 +199,7 @@ namespace cv { namespace gpu { namespace mathfunc
polarToCart<Mag><<<grid, threads, 0, stream>>>(mag.data, mag.step/mag.elemSize(),
angle.data, angle.step/angle.elemSize(), scale, x.data, x.step/x.elemSize(), y.data, y.step/y.elemSize(), mag.cols, mag.rows);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
modules/gpu/src/cuda/matrix_operations.cu
View file @
deac5d97
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "internal_shared.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/transform.hpp"
using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace matrix_operations {
template <typename T> struct shift_and_sizeof;
template <> struct shift_and_sizeof<char> { enum { shift = 0 }; };
template <> struct shift_and_sizeof<unsigned char> { enum { shift = 0 }; };
template <> struct shift_and_sizeof<short> { enum { shift = 1 }; };
template <> struct shift_and_sizeof<unsigned short> { enum { shift = 1 }; };
template <> struct shift_and_sizeof<int> { enum { shift = 2 }; };
template <> struct shift_and_sizeof<float> { enum { shift = 2 }; };
template <> struct shift_and_sizeof<double> { enum { shift = 3 }; };
///////////////////////////////////////////////////////////////////////////
////////////////////////////////// CopyTo /////////////////////////////////
///////////////////////////////////////////////////////////////////////////
template<typename T>
__global__ void copy_to_with_mask(T * mat_src, T * mat_dst, const unsigned char * mask, int cols, int rows, int step_mat, int step_mask, int channels)
{
size_t x = blockIdx.x * blockDim.x + threadIdx.x;
size_t y = blockIdx.y * blockDim.y + threadIdx.y;
if ((x < cols * channels ) && (y < rows))
if (mask[y * step_mask + x / channels] != 0)
{
size_t idx = y * ( step_mat >> shift_and_sizeof<T>::shift ) + x;
mat_dst[idx] = mat_src[idx];
}
}
typedef void (*CopyToFunc)(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream);
template<typename T>
void copy_to_with_mask_run(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream)
{
dim3 threadsPerBlock(16,16, 1);
dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1);
copy_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>
((T*)mat_src.data, (T*)mat_dst.data, (unsigned char*)mask.data, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
if (stream == 0)
cudaSafeCall ( cudaThreadSynchronize() );
}
void copy_to_with_mask(const DevMem2D& mat_src, DevMem2D mat_dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream)
{
static CopyToFunc tab[8] =
{
copy_to_with_mask_run<unsigned char>,
copy_to_with_mask_run<char>,
copy_to_with_mask_run<unsigned short>,
copy_to_with_mask_run<short>,
copy_to_with_mask_run<int>,
copy_to_with_mask_run<float>,
copy_to_with_mask_run<double>,
0
};
CopyToFunc func = tab[depth];
if (func == 0) cv::gpu::error("Unsupported copyTo operation", __FILE__, __LINE__);
func(mat_src, mat_dst, mask, channels, stream);
}
///////////////////////////////////////////////////////////////////////////
////////////////////////////////// SetTo //////////////////////////////////
///////////////////////////////////////////////////////////////////////////
__constant__ double scalar_d[4];
template<typename T>
__global__ void set_to_without_mask(T * mat, int cols, int rows, int step, int channels)
{
size_t x = blockIdx.x * blockDim.x + threadIdx.x;
size_t y = blockIdx.y * blockDim.y + threadIdx.y;
if ((x < cols * channels ) && (y < rows))
{
size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;
mat[idx] = scalar_d[ x % channels ];
}
}
template<typename T>
__global__ void set_to_with_mask(T * mat, const unsigned char * mask, int cols, int rows, int step, int channels, int step_mask)
{
size_t x = blockIdx.x * blockDim.x + threadIdx.x;
size_t y = blockIdx.y * blockDim.y + threadIdx.y;
if ((x < cols * channels ) && (y < rows))
if (mask[y * step_mask + x / channels] != 0)
{
size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;
mat[idx] = scalar_d[ x % channels ];
}
}
typedef void (*SetToFunc_with_mask)(const DevMem2D& mat, const DevMem2D& mask, int channels, const cudaStream_t & stream);
typedef void (*SetToFunc_without_mask)(const DevMem2D& mat, int channels, const cudaStream_t & stream);
template <typename T>
void set_to_with_mask_run(const DevMem2D& mat, const DevMem2D& mask, int channels, const cudaStream_t & stream)
{
dim3 threadsPerBlock(32, 8, 1);
dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
set_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>((T*)mat.data, (unsigned char *)mask.data, mat.cols, mat.rows, mat.step, channels, mask.step);
if (stream == 0)
cudaSafeCall ( cudaThreadSynchronize() );
}
template <typename T>
void set_to_without_mask_run(const DevMem2D& mat, int channels, const cudaStream_t & stream)
{
dim3 threadsPerBlock(32, 8, 1);
dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
set_to_without_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>((T*)mat.data, mat.cols, mat.rows, mat.step, channels);
if (stream == 0)
cudaSafeCall ( cudaThreadSynchronize() );
}
void set_to_without_mask(DevMem2D mat, int depth, const double *scalar, int channels, const cudaStream_t & stream)
{
cudaSafeCall( cudaMemcpyToSymbol(scalar_d, scalar, sizeof(double) * 4));
static SetToFunc_without_mask tab[8] =
{
set_to_without_mask_run<unsigned char>,
set_to_without_mask_run<char>,
set_to_without_mask_run<unsigned short>,
set_to_without_mask_run<short>,
set_to_without_mask_run<int>,
set_to_without_mask_run<float>,
set_to_without_mask_run<double>,
0
};
SetToFunc_without_mask func = tab[depth];
if (func == 0)
cv::gpu::error("Unsupported setTo operation", __FILE__, __LINE__);
func(mat, channels, stream);
}
void set_to_with_mask(DevMem2D mat, int depth, const double * scalar, const DevMem2D& mask, int channels, const cudaStream_t & stream)
{
cudaSafeCall( cudaMemcpyToSymbol(scalar_d, scalar, sizeof(double) * 4));
static SetToFunc_with_mask tab[8] =
{
set_to_with_mask_run<unsigned char>,
set_to_with_mask_run<char>,
set_to_with_mask_run<unsigned short>,
set_to_with_mask_run<short>,
set_to_with_mask_run<int>,
set_to_with_mask_run<float>,
set_to_with_mask_run<double>,
0
};
SetToFunc_with_mask func = tab[depth];
if (func == 0)
cv::gpu::error("Unsupported setTo operation", __FILE__, __LINE__);
func(mat, mask, channels, stream);
}
///////////////////////////////////////////////////////////////////////////
//////////////////////////////// ConvertTo ////////////////////////////////
///////////////////////////////////////////////////////////////////////////
template <typename T, typename D>
class Convertor
{
public:
Convertor(double alpha_, double beta_): alpha(alpha_), beta(beta_) {}
__device__ D operator()(const T& src)
{
return saturate_cast<D>(alpha * src + beta);
}
private:
double alpha, beta;
};
template<typename T, typename D>
void cvt_(const DevMem2D& src, const DevMem2D& dst, double alpha, double beta, cudaStream_t stream)
{
Convertor<T, D> op(alpha, beta);
transform((DevMem2D_<T>)src, (DevMem2D_<D>)dst, op, stream);
}
#include "internal_shared.hpp"
#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/transform.hpp"
using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace matrix_operations {
template <typename T> struct shift_and_sizeof;
template <> struct shift_and_sizeof<signed char> { enum { shift = 0 }; };
template <> struct shift_and_sizeof<unsigned char> { enum { shift = 0 }; };
template <> struct shift_and_sizeof<short> { enum { shift = 1 }; };
template <> struct shift_and_sizeof<unsigned short> { enum { shift = 1 }; };
template <> struct shift_and_sizeof<int> { enum { shift = 2 }; };
template <> struct shift_and_sizeof<float> { enum { shift = 2 }; };
template <> struct shift_and_sizeof<double> { enum { shift = 3 }; };
///////////////////////////////////////////////////////////////////////////
////////////////////////////////// CopyTo /////////////////////////////////
///////////////////////////////////////////////////////////////////////////
template<typename T>
__global__ void copy_to_with_mask(T * mat_src, T * mat_dst, const unsigned char * mask, int cols, int rows, int step_mat, int step_mask, int channels)
{
size_t x = blockIdx.x * blockDim.x + threadIdx.x;
size_t y = blockIdx.y * blockDim.y + threadIdx.y;
if ((x < cols * channels ) && (y < rows))
if (mask[y * step_mask + x / channels] != 0)
{
size_t idx = y * ( step_mat >> shift_and_sizeof<T>::shift ) + x;
mat_dst[idx] = mat_src[idx];
}
}
typedef void (*CopyToFunc)(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream);
template<typename T>
void copy_to_with_mask_run(const DevMem2D& mat_src, const DevMem2D& mat_dst, const DevMem2D& mask, int channels, const cudaStream_t & stream)
{
dim3 threadsPerBlock(16,16, 1);
dim3 numBlocks ( divUp(mat_src.cols * channels , threadsPerBlock.x) , divUp(mat_src.rows , threadsPerBlock.y), 1);
copy_to_with_mask<T><<<numBlocks,threadsPerBlock, 0, stream>>>
((T*)mat_src.data, (T*)mat_dst.data, (unsigned char*)mask.data, mat_src.cols, mat_src.rows, mat_src.step, mask.step, channels);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall ( cudaThreadSynchronize() );
}
void copy_to_with_mask(const DevMem2D& mat_src, DevMem2D mat_dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t & stream)
{
static CopyToFunc tab[8] =
{
copy_to_with_mask_run<unsigned char>,
copy_to_with_mask_run<signed char>,
copy_to_with_mask_run<unsigned short>,
copy_to_with_mask_run<short>,
copy_to_with_mask_run<int>,
copy_to_with_mask_run<float>,
copy_to_with_mask_run<double>,
0
};
CopyToFunc func = tab[depth];
if (func == 0) cv::gpu::error("Unsupported copyTo operation", __FILE__, __LINE__);
func(mat_src, mat_dst, mask, channels, stream);
}
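The kernels above turn the byte pitch into an element pitch with step >> shift_and_sizeof<T>::shift, i.e. a shift instead of a division by sizeof(T). A host-only sketch of that arithmetic, assuming the pitch is a multiple of the element size (which cudaMallocPitch guarantees); the names are illustrative.

#include <cstdio>

template <typename T> struct shift_of;                              // log2(sizeof(T)), as in shift_and_sizeof
template <> struct shift_of<unsigned char> { enum { shift = 0 }; };
template <> struct shift_of<float>         { enum { shift = 2 }; };
template <> struct shift_of<double>        { enum { shift = 3 }; };

template <typename T>
size_t elemIndex(size_t row, size_t col, size_t step_bytes)
{
    // step_bytes >> shift == step_bytes / sizeof(T)
    return row * (step_bytes >> shift_of<T>::shift) + col;
}

int main()
{
    // A float matrix with a 512-byte pitch has 128 elements per row: 2 * 128 + 5 = 261.
    std::printf("%u\n", (unsigned)elemIndex<float>(2, 5, 512));
    return 0;
}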
///////////////////////////////////////////////////////////////////////////
////////////////////////////////// SetTo //////////////////////////////////
///////////////////////////////////////////////////////////////////////////
__constant__ uchar scalar_8u[4];
__constant__ schar scalar_8s[4];
__constant__ ushort scalar_16u[4];
__constant__ short scalar_16s[4];
__constant__ int scalar_32s[4];
__constant__ float scalar_32f[4];
__constant__ double scalar_64f[4];
template <typename T> __device__ T readScalar(int i);
template <> __device__ uchar readScalar<uchar>(int i) {return scalar_8u[i];}
template <> __device__ schar readScalar<schar>(int i) {return scalar_8s[i];}
template <> __device__ ushort readScalar<ushort>(int i) {return scalar_16u[i];}
template <> __device__ short readScalar<short>(int i) {return scalar_16s[i];}
template <> __device__ int readScalar<int>(int i) {return scalar_32s[i];}
template <> __device__ float readScalar<float>(int i) {return scalar_32f[i];}
template <> __device__ double readScalar<double>(int i) {return scalar_64f[i];}
void writeScalar(const uchar* vals)
{
cudaSafeCall( cudaMemcpyToSymbol(scalar_8u, vals, sizeof(uchar) * 4) );
}
void writeScalar(const schar* vals)
{
cudaSafeCall( cudaMemcpyToSymbol(scalar_8s, vals, sizeof(schar) * 4) );
}
void writeScalar(const ushort* vals)
{
cudaSafeCall( cudaMemcpyToSymbol(scalar_16u, vals, sizeof(ushort) * 4) );
}
void writeScalar(const short* vals)
{
cudaSafeCall( cudaMemcpyToSymbol(scalar_16s, vals, sizeof(short) * 4) );
}
void writeScalar(const int* vals)
{
cudaSafeCall( cudaMemcpyToSymbol(scalar_32s, vals, sizeof(int) * 4) );
}
void writeScalar(const float* vals)
{
cudaSafeCall( cudaMemcpyToSymbol(scalar_32f, vals, sizeof(float) * 4) );
}
void writeScalar(const double* vals)
{
cudaSafeCall( cudaMemcpyToSymbol(scalar_64f, vals, sizeof(double) * 4) );
}
template<typename T>
__global__ void set_to_without_mask(T * mat, int cols, int rows, int step, int channels)
{
size_t x = blockIdx.x * blockDim.x + threadIdx.x;
size_t y = blockIdx.y * blockDim.y + threadIdx.y;
if ((x < cols * channels ) && (y < rows))
{
size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;
mat[idx] = readScalar<T>(x % channels);
}
}
template<typename T>
__global__ void set_to_with_mask(T * mat, const unsigned char * mask, int cols, int rows, int step, int channels, int step_mask)
{
size_t x = blockIdx.x * blockDim.x + threadIdx.x;
size_t y = blockIdx.y * blockDim.y + threadIdx.y;
if ((x < cols * channels ) && (y < rows))
if (mask[y * step_mask + x / channels] != 0)
{
size_t idx = y * ( step >> shift_and_sizeof<T>::shift ) + x;
mat[idx] = readScalar<T>(x % channels);
}
}
template <typename T>
void set_to_gpu(const DevMem2D& mat, const T* scalar, const DevMem2D& mask, int channels, cudaStream_t stream)
{
writeScalar(scalar);
dim3 threadsPerBlock(32, 8, 1);
dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
set_to_with_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.data, (uchar*)mask.data, mat.cols, mat.rows, mat.step, channels, mask.step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall ( cudaThreadSynchronize() );
}
template void set_to_gpu<uchar >(const DevMem2D& mat, const uchar* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);
template void set_to_gpu<schar >(const DevMem2D& mat, const schar* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);
template void set_to_gpu<ushort>(const DevMem2D& mat, const ushort* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);
template void set_to_gpu<short >(const DevMem2D& mat, const short* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);
template void set_to_gpu<int >(const DevMem2D& mat, const int* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);
template void set_to_gpu<float >(const DevMem2D& mat, const float* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);
template void set_to_gpu<double>(const DevMem2D& mat, const double* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);
template <typename T>
void set_to_gpu(const DevMem2D& mat, const T* scalar, int channels, cudaStream_t stream)
{
writeScalar(scalar);
dim3 threadsPerBlock(32, 8, 1);
dim3 numBlocks (mat.cols * channels / threadsPerBlock.x + 1, mat.rows / threadsPerBlock.y + 1, 1);
set_to_without_mask<T><<<numBlocks, threadsPerBlock, 0, stream>>>((T*)mat.data, mat.cols, mat.rows, mat.step, channels);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall ( cudaThreadSynchronize() );
}
template void set_to_gpu<uchar >(const DevMem2D& mat, const uchar* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<schar >(const DevMem2D& mat, const schar* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<ushort>(const DevMem2D& mat, const ushort* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<short >(const DevMem2D& mat, const short* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<int >(const DevMem2D& mat, const int* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<float >(const DevMem2D& mat, const float* scalar, int channels, cudaStream_t stream);
template void set_to_gpu<double>(const DevMem2D& mat, const double* scalar, int channels, cudaStream_t stream);
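Unlike the earlier version, which kept the scalar as four doubles in a single __constant__ array, the code above stores one constant array per depth and reads it back through readScalar<T>, so the value is fetched in the matrix element type. Presumably this is what makes setTo behave on pre-1.3 devices, which have no native double support. A cut-down sketch of the idea, limited to two types; the names are illustrative, not OpenCV code.

#include <cstdio>
#include <cuda_runtime.h>

__constant__ float c_val_32f;
__constant__ int   c_val_32s;

template <typename T> __device__ T readVal();
template <> __device__ float readVal<float>() { return c_val_32f; }
template <> __device__ int   readVal<int>()   { return c_val_32s; }

template <typename T> __global__ void fill(T* dst, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) dst[i] = readVal<T>();   // the scalar is read in the element type, no double involved
}

int main()
{
    const int n = 8;
    float h[n], v = 3.5f;
    float* d;
    cudaMalloc(&d, n * sizeof(float));
    cudaMemcpyToSymbol(c_val_32f, &v, sizeof(float));
    fill<float><<<1, n>>>(d, n);
    cudaMemcpy(h, d, n * sizeof(float), cudaMemcpyDeviceToHost);
    std::printf("%f\n", h[0]);          // 3.500000
    cudaFree(d);
    return 0;
}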
///////////////////////////////////////////////////////////////////////////
//////////////////////////////// ConvertTo ////////////////////////////////
///////////////////////////////////////////////////////////////////////////
template <typename T, typename D>
class Convertor
{
public:
Convertor(double alpha_, double beta_) : alpha(alpha_), beta(beta_) {}
__device__ D operator()(const T& src)
{
return saturate_cast<D>(alpha * src + beta);
}
private:
double alpha, beta;
};
template<typename T, typename D>
void cvt_(const DevMem2D& src, const DevMem2D& dst, double alpha, double beta, cudaStream_t stream)
{
cudaSafeCall( cudaSetDoubleForDevice(&alpha) );
cudaSafeCall( cudaSetDoubleForDevice(&beta) );
Convertor<T, D> op(alpha, beta);
transform((DevMem2D_<T>)src, (DevMem2D_<D>)dst, op, stream);
}
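cudaSetDoubleForDevice() rounds a host double to the nearest value the current device can represent, so on hardware without native double support the conversion factors are demoted to float precision before being captured in the Convertor. A tiny standalone example:

#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    double alpha = 0.1;               // not exactly representable as a float
    cudaSetDoubleForDevice(&alpha);   // rounded to float precision on devices without native double, unchanged otherwise
    std::printf("%.17g\n", alpha);
    return 0;
}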
void convert_gpu(const DevMem2D& src, int sdepth, const DevMem2D& dst, int ddepth, double alpha, double beta,
cudaStream_t stream = 0)
{
typedef void (*caller_t)(const DevMem2D& src, const DevMem2D& dst, double alpha, double beta,
cudaStream_t stream);
static const caller_t tab[8][8] =
{
{cvt_<uchar, uchar>, cvt_<uchar, schar>, cvt_<uchar, ushort>, cvt_<uchar, short>,
...
...
@@ -272,12 +291,12 @@ namespace cv { namespace gpu { namespace matrix_operations {
cvt_<double, short>, cvt_<double, int>, cvt_<double, float>, cvt_<double, double>, 0},
{0,0,0,0,0,0,0,0}
        };

        caller_t func = tab[sdepth][ddepth];

        if (!func)
            cv::gpu::error("Unsupported convert operation", __FILE__, __LINE__);

        func(src, dst, alpha, beta, stream);
    }
}}}
modules/gpu/src/cuda/matrix_reductions.cu
View file @
deac5d97
...
...
@@ -273,6 +273,8 @@ namespace cv { namespace gpu { namespace mathfunc
T* maxval_buf = (T*)buf.ptr(1);
minMaxKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
...
...
@@ -302,6 +304,8 @@ namespace cv { namespace gpu { namespace mathfunc
T* maxval_buf = (T*)buf.ptr(1);
minMaxKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
...
...
@@ -355,7 +359,10 @@ namespace cv { namespace gpu { namespace mathfunc
T* maxval_buf = (T*)buf.ptr(1);
minMaxKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf);
cudaSafeCall( cudaGetLastError() );
minMaxPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
...
...
@@ -384,7 +391,10 @@ namespace cv { namespace gpu { namespace mathfunc
T* maxval_buf = (T*)buf.ptr(1);
minMaxKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf);
cudaSafeCall( cudaGetLastError() );
minMaxPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, grid.x * grid.y);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
...
...
@@ -597,6 +607,8 @@ namespace cv { namespace gpu { namespace mathfunc
minMaxLocKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf,
minloc_buf, maxloc_buf);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
...
...
@@ -636,6 +648,8 @@ namespace cv { namespace gpu { namespace mathfunc
minMaxLocKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf,
minloc_buf, maxloc_buf);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
...
...
@@ -706,7 +720,10 @@ namespace cv { namespace gpu { namespace mathfunc
minMaxLocKernel<256, T, Mask8U><<<grid, threads>>>(src, Mask8U(mask), minval_buf, maxval_buf,
minloc_buf, maxloc_buf);
cudaSafeCall( cudaGetLastError() );
minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
...
...
@@ -745,7 +762,10 @@ namespace cv { namespace gpu { namespace mathfunc
minMaxLocKernel<256, T, MaskTrue><<<grid, threads>>>(src, MaskTrue(), minval_buf, maxval_buf,
minloc_buf, maxloc_buf);
cudaSafeCall( cudaGetLastError() );
minMaxLocPass2Kernel<256, T><<<1, 256>>>(minval_buf, maxval_buf, minloc_buf, maxloc_buf, grid.x * grid.y);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
T minval_, maxval_;
...
...
@@ -873,6 +893,8 @@ namespace cv { namespace gpu { namespace mathfunc
uint* count_buf = (uint*)buf.ptr(0);
countNonZeroKernel<256, T><<<grid, threads>>>(src, count_buf);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
uint count;
...
...
@@ -916,7 +938,10 @@ namespace cv { namespace gpu { namespace mathfunc
uint* count_buf = (uint*)buf.ptr(0);
countNonZeroKernel<256, T><<<grid, threads>>>(src, count_buf);
cudaSafeCall( cudaGetLastError() );
countNonZeroPass2Kernel<256, T><<<1, 256>>>(count_buf, grid.x * grid.y);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
uint count;
...
...
@@ -1430,26 +1455,42 @@ namespace cv { namespace gpu { namespace mathfunc
case 1:
sumKernel<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
cudaSafeCall( cudaGetLastError() );
sumPass2Kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);
cudaSafeCall( cudaGetLastError() );
break;
case 2:
sumKernel_C2<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
cudaSafeCall( cudaGetLastError() );
sumPass2Kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);
cudaSafeCall( cudaGetLastError() );
break;
case 3:
sumKernel_C3<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
cudaSafeCall( cudaGetLastError() );
sumPass2Kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);
cudaSafeCall( cudaGetLastError() );
break;
case 4:
sumKernel_C4<T, R, IdentityOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
cudaSafeCall( cudaGetLastError() );
sumPass2Kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);
cudaSafeCall( cudaGetLastError() );
break;
}
cudaSafeCall(cudaThreadSynchronize());
...
...
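The sumPass2Kernel launches added above are the second half of a two-pass reduction: the first launch leaves one partial result per block in buf, and a single-block follow-up launch folds those partials into the final value, presumably for devices where the single-kernel path used elsewhere in this file is not available. A self-contained sketch of the same structure, simplified to a plain int sum; the kernel names are illustrative, not OpenCV code.

#include <cstdio>
#include <cuda_runtime.h>

__global__ void partialSums(const int* src, int n, int* partial)
{
    __shared__ int smem[256];
    int tid = threadIdx.x;
    int i = blockIdx.x * blockDim.x + tid;
    smem[tid] = (i < n) ? src[i] : 0;
    __syncthreads();
    for (int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s) smem[tid] += smem[tid + s];
        __syncthreads();
    }
    if (tid == 0) partial[blockIdx.x] = smem[0];   // pass 1: one partial sum per block
}

__global__ void finalSum(int* partial, int numPartials)
{
    __shared__ int smem[256];
    int tid = threadIdx.x;
    int sum = 0;
    for (int i = tid; i < numPartials; i += blockDim.x) sum += partial[i];
    smem[tid] = sum;
    __syncthreads();
    for (int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s) smem[tid] += smem[tid + s];
        __syncthreads();
    }
    if (tid == 0) partial[0] = smem[0];            // pass 2: fold the partials in a single block
}

int main()
{
    const int n = 1 << 20;
    int* h = new int[n];
    for (int i = 0; i < n; ++i) h[i] = 1;
    const int threads = 256, blocks = (n + threads - 1) / threads;
    int *d_src, *d_partial;
    cudaMalloc(&d_src, n * sizeof(int));
    cudaMalloc(&d_partial, blocks * sizeof(int));
    cudaMemcpy(d_src, h, n * sizeof(int), cudaMemcpyHostToDevice);
    partialSums<<<blocks, threads>>>(d_src, n, d_partial);
    finalSum<<<1, threads>>>(d_partial, blocks);
    int result = 0;
    cudaMemcpy(&result, d_partial, sizeof(int), cudaMemcpyDeviceToHost);
    std::printf("sum = %d\n", result);             // 1048576
    cudaFree(d_src); cudaFree(d_partial); delete[] h;
    return 0;
}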
@@ -1500,6 +1541,8 @@ namespace cv { namespace gpu { namespace mathfunc
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
break;
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
R result[4] = {0, 0, 0, 0};
...
...
@@ -1534,26 +1577,42 @@ namespace cv { namespace gpu { namespace mathfunc
case 1:
sumKernel<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
cudaSafeCall( cudaGetLastError() );
sumPass2Kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);
cudaSafeCall( cudaGetLastError() );
break;
case 2:
sumKernel_C2<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
cudaSafeCall( cudaGetLastError() );
sumPass2Kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);
cudaSafeCall( cudaGetLastError() );
break;
case 3:
sumKernel_C3<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
cudaSafeCall( cudaGetLastError() );
sumPass2Kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);
cudaSafeCall( cudaGetLastError() );
break;
case 4:
sumKernel_C4<T, R, AbsOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
cudaSafeCall( cudaGetLastError() );
sumPass2Kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);
cudaSafeCall( cudaGetLastError() );
break;
}
cudaSafeCall(cudaThreadSynchronize());
...
...
@@ -1604,6 +1663,8 @@ namespace cv { namespace gpu { namespace mathfunc
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
break;
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
R result[4] = {0, 0, 0, 0};
...
...
@@ -1638,26 +1699,42 @@ namespace cv { namespace gpu { namespace mathfunc
case 1:
sumKernel<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 1>::vec_t*)buf.ptr(0));
cudaSafeCall( cudaGetLastError() );
sumPass2Kernel<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 1>::vec_t*)buf.ptr(0), grid.x * grid.y);
cudaSafeCall( cudaGetLastError() );
break;
case 2:
sumKernel_C2<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 2>::vec_t*)buf.ptr(0));
cudaSafeCall( cudaGetLastError() );
sumPass2Kernel_C2<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 2>::vec_t*)buf.ptr(0), grid.x * grid.y);
cudaSafeCall( cudaGetLastError() );
break;
case 3:
sumKernel_C3<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 3>::vec_t*)buf.ptr(0));
cudaSafeCall( cudaGetLastError() );
sumPass2Kernel_C3<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 3>::vec_t*)buf.ptr(0), grid.x * grid.y);
cudaSafeCall( cudaGetLastError() );
break;
case 4:
sumKernel_C4<T, R, SqrOp<R>, threads_x * threads_y><<<grid, threads>>>(
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
cudaSafeCall( cudaGetLastError() );
sumPass2Kernel_C4<T, R, threads_x * threads_y><<<1, threads_x * threads_y>>>(
(typename TypeVec<R, 4>::vec_t*)buf.ptr(0), grid.x * grid.y);
cudaSafeCall( cudaGetLastError() );
break;
}
cudaSafeCall(cudaThreadSynchronize());
...
...
@@ -1708,6 +1785,8 @@ namespace cv { namespace gpu { namespace mathfunc
src, (typename TypeVec<R, 4>::vec_t*)buf.ptr(0));
break;
}
cudaSafeCall( cudaGetLastError() );
cudaSafeCall(cudaThreadSynchronize());
R result[4] = {0, 0, 0, 0};
...
...
modules/gpu/src/cuda/split_merge.cu
View file @
deac5d97
...
...
@@ -233,6 +233,8 @@ namespace cv { namespace gpu { namespace split_merge {
src[0].data, src[0].step,
src[1].data, src[1].step,
dst.rows, dst.cols, dst.data, dst.step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaThreadSynchronize());
}
...
...
@@ -248,6 +250,8 @@ namespace cv { namespace gpu { namespace split_merge {
src[1].data, src[1].step,
src[2].data, src[2].step,
dst.rows, dst.cols, dst.data, dst.step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaThreadSynchronize());
}
...
...
@@ -264,6 +268,8 @@ namespace cv { namespace gpu { namespace split_merge {
src[2].data, src[2].step,
src[3].data, src[3].step,
dst.rows, dst.cols, dst.data, dst.step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaThreadSynchronize());
}
...
...
@@ -436,6 +442,8 @@ namespace cv { namespace gpu { namespace split_merge {
src.data, src.step, src.rows, src.cols,
dst[0].data, dst[0].step,
dst[1].data, dst[1].step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaThreadSynchronize());
}
...
...
@@ -451,6 +459,8 @@ namespace cv { namespace gpu { namespace split_merge {
dst[0].data, dst[0].step,
dst[1].data, dst[1].step,
dst[2].data, dst[2].step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaThreadSynchronize());
}
...
...
@@ -467,6 +477,8 @@ namespace cv { namespace gpu { namespace split_merge {
dst[1].data, dst[1].step,
dst[2].data, dst[2].step,
dst[3].data, dst[3].step);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall(cudaThreadSynchronize());
}
...
...
modules/gpu/src/cuda/stereobm.cu
View file @
deac5d97
...
...
@@ -325,6 +325,8 @@ template<int RADIUS> void kernel_caller(const DevMem2D& left, const DevMem2D& ri
size_t smem_size = (BLOCK_W + N_DISPARITIES * (BLOCK_W + 2 * RADIUS)) * sizeof(unsigned int);
stereoKernel<RADIUS><<<grid, threads, smem_size, stream>>>(left.data, right.data, left.step, disp, maxdisp);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
};
...
...
@@ -402,6 +404,7 @@ extern "C" void prefilter_xsobel(const DevMem2D& input, const DevMem2D& output,
grid.y = divUp(input.rows, threads.y);
prefilter_kernel<<<grid, threads, 0, stream>>>(output, prefilterCap);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -526,6 +529,7 @@ extern "C" void postfilter_textureness(const DevMem2D& input, int winsz, float a
size_t smem_size = (threads.x + threads.x + (winsz/2) * 2 ) * sizeof(float);
textureness_kernel<<<grid, threads, smem_size, stream>>>(disp, winsz, avgTexturenessThreshold);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
modules/gpu/src/cuda/stereobp.cu
View file @
deac5d97
...
...
@@ -172,6 +172,7 @@ namespace cv { namespace gpu { namespace bp
grid.y = divUp(left.rows, threads.y);
comp_data<1, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -185,6 +186,7 @@ namespace cv { namespace gpu { namespace bp
grid.y = divUp(left.rows, threads.y);
comp_data<1, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -199,6 +201,7 @@ namespace cv { namespace gpu { namespace bp
grid.y = divUp(left.rows, threads.y);
comp_data<3, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -212,6 +215,7 @@ namespace cv { namespace gpu { namespace bp
grid.y = divUp(left.rows, threads.y);
comp_data<3, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -226,6 +230,7 @@ namespace cv { namespace gpu { namespace bp
grid.y = divUp(left.rows, threads.y);
comp_data<4, short><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<short>)data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -239,6 +244,7 @@ namespace cv { namespace gpu { namespace bp
grid.y = divUp(left.rows, threads.y);
comp_data<4, float><<<grid, threads, 0, stream>>>(left, right, (DevMem2D_<float>)data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -278,6 +284,7 @@ namespace cv { namespace gpu { namespace bp
grid.y = divUp(dst_rows, threads.y);
data_step_down<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)src, (DevMem2D_<T>)dst);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -321,9 +328,13 @@ namespace cv { namespace gpu { namespace bp
int src_idx = (dst_idx + 1) & 1;
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mus[src_idx], (DevMem2D_<T>)mus[dst_idx]);
cudaSafeCall( cudaGetLastError() );
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mds[src_idx], (DevMem2D_<T>)mds[dst_idx]);
cudaSafeCall( cudaGetLastError() );
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mls[src_idx], (DevMem2D_<T>)mls[dst_idx]);
cudaSafeCall( cudaGetLastError() );
level_up_message<T><<<grid, threads, 0, stream>>>(dst_cols, dst_rows, src_rows, (DevMem2D_<T>)mrs[src_idx], (DevMem2D_<T>)mrs[dst_idx]);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -443,6 +454,7 @@ namespace cv { namespace gpu { namespace bp
for(int t = 0; t < iters; ++t)
{
one_iteration<T><<<grid, threads, 0, stream>>>(t, (DevMem2D_<T>)u, (T*)d.data, (T*)l.data, (T*)r.data, (DevMem2D_<T>)data, cols, rows);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -505,6 +517,7 @@ namespace cv { namespace gpu { namespace bp
grid.y = divUp(disp.rows, threads.y);
output<T><<<grid, threads, 0, stream>>>((DevMem2D_<T>)u, (const T*)d.data, (const T*)l.data, (const T*)r.data, (const T*)data.data, disp);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
modules/gpu/src/cuda/stereocsbp.cu
View file @
deac5d97
...
...
@@ -382,6 +382,8 @@ namespace cv { namespace gpu { namespace csbp
cudaSafeCall( cudaMemcpyToSymbol(cmsg_step1, &msg_step, sizeof(size_t)) );
init_data_cost_callers[level](rows, cols, h, w, level, ndisp, channels, stream);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -395,6 +397,9 @@ namespace cv { namespace gpu { namespace csbp
get_first_k_initial_local<<<grid, threads, 0, stream>>> (data_cost_selected, disp_selected_pyr, h, w, nr_plane);
else
get_first_k_initial_global<<<grid, threads, 0, stream>>>(data_cost_selected, disp_selected_pyr, h, w, nr_plane);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
}
...
...
@@ -578,6 +583,7 @@ namespace cv { namespace gpu { namespace csbp
cudaSafeCall( cudaMemcpyToSymbol(cmsg_step2, &msg_step2, sizeof(size_t)) );
callers[level](disp_selected_pyr, data_cost, rows, cols, h, w, level, nr_plane, channels, stream);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -700,10 +706,11 @@ namespace cv { namespace gpu { namespace csbp
grid.y = divUp(h, threads.y);
init_message<<<grid, threads, 0, stream>>>(u_new, d_new, l_new, r_new,
                                           u_cur, d_cur, l_cur, r_cur,
                                           selected_disp_pyr_new, selected_disp_pyr_cur,
                                           data_cost_selected, data_cost,
                                           h, w, nr_plane, h2, w2, nr_plane2);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -805,6 +812,7 @@ namespace cv { namespace gpu { namespace csbp
for(int t = 0; t < iters; ++t)
{
compute_message<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, selected_disp_pyr_cur, h, w, nr_plane, t & 1);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -873,7 +881,9 @@ namespace cv { namespace gpu { namespace csbp
grid.y = divUp(disp.rows, threads.y);
compute_disp<<<grid, threads, 0, stream>>>(u, d, l, r, data_cost_selected, disp_selected,
disp.data, disp.step / disp.elemSize(), disp.cols, disp.rows, nr_plane);
disp.data, disp.step / disp.elemSize(), disp.cols, disp.rows, nr_plane);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaThreadSynchronize() );
}
...
...
modules/gpu/src/cuda/surf.cu
View file @
deac5d97
...
...
@@ -238,6 +238,46 @@ namespace cv { namespace gpu { namespace surf
hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x] = result;
}
}
__global__ void fasthessian_old(PtrStepf hessianBuffer)
{
// Determine the indices in the Hessian buffer
int gridDim_y = gridDim.y / c_nIntervals;
int blockIdx_y = blockIdx.y % gridDim_y;
int blockIdx_z = blockIdx.y / gridDim_y;
int hidx_x = threadIdx.x + blockIdx.x * blockDim.x;
int hidx_y = threadIdx.y + blockIdx_y * blockDim.y;
int hidx_z = blockIdx_z;
float fscale = calcScale(hidx_z);
// Compute the lookup location of the mask center
float x = hidx_x * c_step + c_border;
float y = hidx_y * c_step + c_border;
// Scale the mask dimensions according to the scale
if (hidx_x < c_x_size && hidx_y < c_y_size && hidx_z < c_nIntervals)
{
float mask_width = c_mask_width * fscale;
float mask_height = c_mask_height * fscale;
// Compute the filter responses
float Dyy = evalDyy(x, y, c_mask_height, mask_width, mask_height, fscale);
float Dxx = evalDxx(x, y, c_mask_height, mask_width, mask_height, fscale);
float Dxy = evalDxy(x, y, fscale);
// Combine the responses and store the Laplacian sign
float result = (Dxx * Dyy) - c_dxy_scale * (Dxy * Dxy);
if (Dxx + Dyy > 0.f)
setLastBit(result);
else
clearLastBit(result);
hessianBuffer.ptr(c_y_size * hidx_z + hidx_y)[hidx_x] = result;
}
}
dim3 calcBlockSize(int nIntervals)
{
...
...
@@ -263,6 +303,21 @@ namespace cv { namespace gpu { namespace surf
grid.y = divUp(y_size, threads.y);
fasthessian<<<grid, threads>>>(hessianBuffer);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
}
void fasthessian_gpu_old(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threadsOld)
{
dim3 threads(16, 16);
dim3 grid;
grid.x = divUp(x_size, threads.x);
grid.y = divUp(y_size, threads.y) * threadsOld.z;
fasthessian_old<<<grid, threads>>>(hessianBuffer);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
}
...
...
@@ -395,6 +450,8 @@ namespace cv { namespace gpu { namespace surf
nonmaxonly<WithMask><<<grid, threads, smem_size>>>(hessianBuffer, maxPosBuffer, maxCounterWrapper);
else
nonmaxonly<WithOutMask><<<grid, threads, smem_size>>>(hessianBuffer, maxPosBuffer, maxCounterWrapper);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
}
...
...
@@ -574,6 +631,7 @@ namespace cv { namespace gpu { namespace surf
DeviceReference<unsigned int> featureCounterWrapper(featureCounter);
fh_interp_extremum<<<grid, threads>>>(hessianBuffer, maxPosBuffer, featuresBuffer, featureCounterWrapper);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
}
...
...
@@ -715,6 +773,8 @@ namespace cv { namespace gpu { namespace surf
grid.x = nFeatures;
find_orientation<<<grid, threads>>>(features);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
}
...
...
@@ -987,17 +1047,255 @@ namespace cv { namespace gpu { namespace surf
if (descriptors.cols == 64)
{
compute_descriptors64<<<dim3(nFeatures, 1, 1), dim3(25, 4, 4)>>>(descriptors, features);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
normalize_descriptors<64><<<dim3(nFeatures, 1, 1), dim3(64, 1, 1)>>>(descriptors);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
}
else
{
compute_descriptors128<<<dim3(nFeatures, 1, 1), dim3(25, 4, 4)>>>(descriptors, features);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
normalize_descriptors<128><<<dim3(nFeatures, 1, 1), dim3(128, 1, 1)>>>(descriptors);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
}
}
__device__ void calc_dx_dy_old(float sdx[25], float sdy[25], const KeyPoint_GPU* features, int tid)
{
// get the interest point parameters (x, y, scale, strength, theta)
__shared__ float ipt[5];
if (tid < 5)
{
ipt[tid] = ((float*)&features[blockIdx.x])[tid];
}
__syncthreads();
float sin_theta, cos_theta;
sincosf(ipt[SF_ANGLE], &sin_theta, &cos_theta);
// Compute sampling points
// since grids are 2D, need to compute xBlock and yBlock indices
const int xBlock = (blockIdx.y & 3); // blockIdx.y % 4
const int yBlock = (blockIdx.y >> 2); // floor(blockIdx.y/4)
const int xIndex = xBlock * blockDim.x + threadIdx.x;
const int yIndex = yBlock * blockDim.y + threadIdx.y;
// Compute rotated sampling points
// (clockwise rotation since we are rotating the lattice)
// (subtract 9.5f to start sampling at the top left of the lattice, 0.5f is to space points out properly - there is no center pixel)
const float sample_x = ipt[SF_X] + (cos_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE]
+ sin_theta * ((float) (yIndex-9.5f)) * ipt[SF_SIZE]);
const float sample_y = ipt[SF_Y] + (-sin_theta * ((float) (xIndex-9.5f)) * ipt[SF_SIZE]
+ cos_theta * ((float) (yIndex-9.5f)) * ipt[SF_SIZE]);
// gather integral image lookups for Haar wavelets at each point (some lookups are shared between dx and dy)
// a b c
// d f
// g h i
const float a = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y - ipt[SF_SIZE]);
const float b = tex2D(sumTex, sample_x, sample_y - ipt[SF_SIZE]);
const float c = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y - ipt[SF_SIZE]);
const float d = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y);
const float f = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y);
const float g = tex2D(sumTex, sample_x - ipt[SF_SIZE], sample_y + ipt[SF_SIZE]);
const float h = tex2D(sumTex, sample_x, sample_y + ipt[SF_SIZE]);
const float i = tex2D(sumTex, sample_x + ipt[SF_SIZE], sample_y + ipt[SF_SIZE]);
// compute axis-aligned HaarX, HaarY
// (could group the additions together into multiplications)
const float gauss = c_3p3gauss1D[xIndex] * c_3p3gauss1D[yIndex]; // separable because independent (circular)
const float aa_dx = gauss * (-(a-b-g+h) + (b-c-h+i)); // unrotated dx
const float aa_dy = gauss * (-(a-c-d+f) + (d-f-g+i)); // unrotated dy
// rotate responses (store all dxs then all dys)
// - counterclockwise rotation to rotate back to zero orientation
sdx[tid] = aa_dx * cos_theta - aa_dy * sin_theta; // rotated dx
sdy[tid] = aa_dx * sin_theta + aa_dy * cos_theta; // rotated dy
}
__device__ void reduce_sum_old(float sdata[25], int tid)
{
// first step is to reduce from 25 to 16
if (tid < 9) // use 9 threads
sdata[tid] += sdata[tid + 16];
__syncthreads();
// sum (reduce) from 16 to 1 (unrolled - aligned to a half-warp)
if (tid < 16)
{
volatile float* smem = sdata;
smem[tid] += smem[tid + 8];
smem[tid] += smem[tid + 4];
smem[tid] += smem[tid + 2];
smem[tid] += smem[tid + 1];
}
}
// Spawn 16 blocks per interest point
// - computes unnormalized 64 dimensional descriptor, puts it into d_descriptors in the correct location
__global__ void compute_descriptors64_old(PtrStepf descriptors, const KeyPoint_GPU* features)
{
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 2);
// 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
__shared__ float sdx[25];
__shared__ float sdy[25];
calc_dx_dy_old(sdx, sdy, features, tid);
__syncthreads();
__shared__ float sabs[25];
sabs[tid] = fabs(sdx[tid]); // |dx| array
__syncthreads();
reduce_sum_old(sdx, tid);
reduce_sum_old(sdy, tid);
reduce_sum_old(sabs, tid);
// write dx, dy, |dx|
if (tid == 0)
{
descriptors_block[0] = sdx[0];
descriptors_block[1] = sdy[0];
descriptors_block[2] = sabs[0];
}
__syncthreads();
sabs[tid] = fabs(sdy[tid]); // |dy| array
__syncthreads();
reduce_sum_old(sabs, tid);
// write |dy|
if (tid == 0)
{
descriptors_block[3] = sabs[0];
}
}
// Spawn 16 blocks per interest point
// - computes unnormalized 128 dimensional descriptor, puts it into d_descriptors in the correct location
__global__ void compute_descriptors128_old(PtrStepf descriptors, const KeyPoint_GPU* features)
{
float* descriptors_block = descriptors.ptr(blockIdx.x) + (blockIdx.y << 3);
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
// 2 floats (dx,dy) for each thread (5x5 sample points in each sub-region)
__shared__ float sdx[25];
__shared__ float sdy[25];
calc_dx_dy_old(sdx, sdy, features, tid);
__syncthreads();
// sum (reduce) 5x5 area response
__shared__ float sd1[25];
__shared__ float sd2[25];
__shared__ float sdabs1[25];
__shared__ float sdabs2[25];
if (sdy[tid] >= 0)
{
sd1[tid] = sdx[tid];
sdabs1[tid] = fabs(sdx[tid]);
sd2[tid] = 0;
sdabs2[tid] = 0;
}
else
{
sd1[tid] = 0;
sdabs1[tid] = 0;
sd2[tid] = sdx[tid];
sdabs2[tid] = fabs(sdx[tid]);
}
__syncthreads();
reduce_sum_old(sd1, tid);
reduce_sum_old(sd2, tid);
reduce_sum_old(sdabs1, tid);
reduce_sum_old(sdabs2, tid);
// write dx (dy >= 0), |dx| (dy >= 0), dx (dy < 0), |dx| (dy < 0)
if (tid == 0)
{
descriptors_block[0] = sd1[0];
descriptors_block[1] = sdabs1[0];
descriptors_block[2] = sd2[0];
descriptors_block[3] = sdabs2[0];
}
__syncthreads();
if (sdx[tid] >= 0)
{
sd1[tid] = sdy[tid];
sdabs1[tid] = fabs(sdy[tid]);
sd2[tid] = 0;
sdabs2[tid] = 0;
}
else
{
sd1[tid] = 0;
sdabs1[tid] = 0;
sd2[tid] = sdy[tid];
sdabs2[tid] = fabs(sdy[tid]);
}
__syncthreads();
reduce_sum_old(sd1, tid);
reduce_sum_old(sd2, tid);
reduce_sum_old(sdabs1, tid);
reduce_sum_old(sdabs2, tid);
// write dy (dx >= 0), |dy| (dx >= 0), dy (dx < 0), |dy| (dx < 0)
if (tid == 0)
{
descriptors_block[4] = sd1[0];
descriptors_block[5] = sdabs1[0];
descriptors_block[6] = sd2[0];
descriptors_block[7] = sdabs2[0];
}
}
void compute_descriptors_gpu_old(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures)
{
// compute unnormalized descriptors, then normalize them - odd indexing since grid must be 2D
if (descriptors.cols == 64)
{
compute_descriptors64_old<<<dim3(nFeatures, 16, 1), dim3(5, 5, 1)>>>(descriptors, features);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
normalize_descriptors<64><<<dim3(nFeatures, 1, 1), dim3(64, 1, 1)>>>(descriptors);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
}
else
{
compute_descriptors128_old<<<dim3(nFeatures, 16, 1), dim3(5, 5, 1)>>>(descriptors, features);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
normalize_descriptors<128><<<dim3(nFeatures, 1, 1), dim3(128, 1, 1)>>>(descriptors);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );
}
}
...
...
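Note: the *_old SURF kernels above are the fallback path for devices below compute capability 1.3: the descriptor work for one keypoint is split across 16 blocks of 5x5 threads (one block per sub-region), each block reducing its 25 samples with reduce_sum_old. The launch shape, taken from compute_descriptors_gpu_old in the hunk above:

// One block per (keypoint, sub-region) pair; 5x5 threads cover the 25 sample points.
dim3 grid(nFeatures, 16, 1);
dim3 block(5, 5, 1);
compute_descriptors64_old<<<grid, block>>>(descriptors, features);
cudaSafeCall( cudaGetLastError() );
cudaSafeCall( cudaThreadSynchronize() );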
modules/gpu/src/cudastream.cpp
View file @
deac5d97
...
...
@@ -61,8 +61,8 @@ void cv::gpu::Stream::enqueueDownload(const GpuMat& /*src*/, CudaMem& /*dst*/) {
void cv::gpu::Stream::enqueueUpload(const CudaMem& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); }
void cv::gpu::Stream::enqueueUpload(const Mat& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); }
void cv::gpu::Stream::enqueueCopy(const GpuMat& /*src*/, GpuMat& /*dst*/) { throw_nogpu(); }
void cv::gpu::Stream::enqueueMemSet(const GpuMat& /*src*/, Scalar /*val*/) { throw_nogpu(); }
void cv::gpu::Stream::enqueueMemSet(const GpuMat& /*src*/, Scalar /*val*/, const GpuMat& /*mask*/) { throw_nogpu(); }
void cv::gpu::Stream::enqueueMemSet(GpuMat& /*src*/, Scalar /*val*/) { throw_nogpu(); }
void cv::gpu::Stream::enqueueMemSet(GpuMat& /*src*/, Scalar /*val*/, const GpuMat& /*mask*/) { throw_nogpu(); }
void cv::gpu::Stream::enqueueConvert(const GpuMat& /*src*/, GpuMat& /*dst*/, int /*type*/, double /*a*/, double /*b*/) { throw_nogpu(); }
#else
/* !defined (HAVE_CUDA) */
...
...
@@ -77,8 +77,10 @@ namespace cv
{
    void copy_to_with_mask(const DevMem2D& src, DevMem2D dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t& stream = 0);

    void set_to_without_mask(DevMem2D dst, int depth, const double* scalar, int channels, const cudaStream_t& stream = 0);
    void set_to_with_mask(DevMem2D dst, int depth, const double* scalar, const DevMem2D& mask, int channels, const cudaStream_t& stream = 0);

    template <typename T> void set_to_gpu(const DevMem2D& mat, const T* scalar, int channels, cudaStream_t stream);
    template <typename T> void set_to_gpu(const DevMem2D& mat, const T* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);

    void convert_gpu(const DevMem2D& src, int sdepth, const DevMem2D& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);
}
...
...
@@ -99,6 +101,20 @@ namespace
        size_t bwidth = src.cols * src.elemSize();
        cudaSafeCall( cudaMemcpy2DAsync(dst.data, dst.step, src.data, src.step, bwidth, src.rows, k, s) );
    };

    template <typename T> void kernelSet(GpuMat& src, const Scalar& s, cudaStream_t stream)
    {
        Scalar_<T> sf = s;
        matrix_operations::set_to_gpu(src, sf.val, src.channels(), stream);
    }

    template <typename T> void kernelSetMask(GpuMat& src, const Scalar& s, const GpuMat& mask, cudaStream_t stream)
    {
        Scalar_<T> sf = s;
        matrix_operations::set_to_gpu(src, sf.val, mask, src.channels(), stream);
    }
}

CV_EXPORTS cudaStream_t cv::gpu::StreamAccessor::getStream(const Stream& stream) { return stream.impl->stream; };
...
...
@@ -172,14 +188,26 @@ void cv::gpu::Stream::enqueueUpload(const CudaMem& src, GpuMat& dst){ devcopy(sr
void cv::gpu::Stream::enqueueUpload(const Mat& src, GpuMat& dst) { devcopy(src, dst, impl->stream, cudaMemcpyHostToDevice); }

void cv::gpu::Stream::enqueueCopy(const GpuMat& src, GpuMat& dst) { devcopy(src, dst, impl->stream, cudaMemcpyDeviceToDevice); }

void cv::gpu::Stream::enqueueMemSet(const GpuMat& src, Scalar val)
void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val)
{
    matrix_operations::set_to_without_mask(src, src.depth(), val.val, src.channels(), impl->stream);
    typedef void (*set_caller_t)(GpuMat& src, const Scalar& s, cudaStream_t stream);
    static const set_caller_t set_callers[] =
    {
        kernelSet<uchar>, kernelSet<schar>, kernelSet<ushort>, kernelSet<short>,
        kernelSet<int>, kernelSet<float>, kernelSet<double>
    };
    set_callers[src.depth()](src, val, impl->stream);
}

void cv::gpu::Stream::enqueueMemSet(const GpuMat& src, Scalar val, const GpuMat& mask)
void cv::gpu::Stream::enqueueMemSet(GpuMat& src, Scalar val, const GpuMat& mask)
{
    matrix_operations::set_to_with_mask(src, src.depth(), val.val, mask, src.channels(), impl->stream);
    typedef void (*set_caller_t)(GpuMat& src, const Scalar& s, const GpuMat& mask, cudaStream_t stream);
    static const set_caller_t set_callers[] =
    {
        kernelSetMask<uchar>, kernelSetMask<schar>, kernelSetMask<ushort>, kernelSetMask<short>,
        kernelSetMask<int>, kernelSetMask<float>, kernelSetMask<double>
    };
    set_callers[src.depth()](src, val, mask, impl->stream);
}

void cv::gpu::Stream::enqueueConvert(const GpuMat& src, GpuMat& dst, int rtype, double alpha, double beta)
...
...
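Note: with the enqueueMemSet rework above, the asynchronous setTo path is dispatched through kernelSet<T>/kernelSetMask<T> instead of the depth-limited set_to_without_mask helper. A possible usage sketch (sizes and values are illustrative only):

// Asynchronous fill of a depth NPP has no direct Set variant for:
cv::gpu::Stream stream;
cv::gpu::GpuMat m(480, 640, CV_64FC1);

stream.enqueueMemSet(m, cv::Scalar::all(1.0));   // dispatched to kernelSet<double> above
stream.waitForCompletion();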
modules/gpu/src/element_operations.cpp
View file @
deac5d97
...
...
@@ -585,10 +585,10 @@ namespace cv { namespace gpu { namespace mathfunc
    void max_gpu(const DevMem2D_<T>& src1, const DevMem2D_<T>& src2, const DevMem2D_<T>& dst, cudaStream_t stream);

    template <typename T>
    void min_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream);
    void min_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream);

    template <typename T>
    void max_gpu(const DevMem2D_<T>& src1, double src2, const DevMem2D_<T>& dst, cudaStream_t stream);
    void max_gpu(const DevMem2D_<T>& src1, T src2, const DevMem2D_<T>& dst, cudaStream_t stream);
}}}

namespace
...
...
@@ -605,7 +605,7 @@ namespace
    void min_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)
    {
        dst.create(src1.size(), src1.type());
        mathfunc::min_gpu<T>(src1.reshape(1), src2, dst.reshape(1), stream);
        mathfunc::min_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);
    }

    template <typename T>
...
...
@@ -620,7 +620,7 @@ namespace
    void max_caller(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream)
    {
        dst.create(src1.size(), src1.type());
        mathfunc::max_gpu<T>(src1.reshape(1), src2, dst.reshape(1), stream);
        mathfunc::max_gpu<T>(src1.reshape(1), saturate_cast<T>(src2), dst.reshape(1), stream);
    }
}
...
...
@@ -629,7 +629,7 @@ void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
    typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] =
    {
        min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>, min_caller<int>,
        min_caller<uchar>, min_caller<schar>, min_caller<ushort>, min_caller<short>, min_caller<int>,
        min_caller<float>, min_caller<double>
    };
    funcs[src1.depth()](src1, src2, dst, 0);
...
...
@@ -640,7 +640,7 @@ void cv::gpu::min(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Str
    typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] =
    {
        min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>, min_caller<int>,
        min_caller<uchar>, min_caller<schar>, min_caller<ushort>, min_caller<short>, min_caller<int>,
        min_caller<float>, min_caller<double>
    };
    funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
...
...
@@ -651,7 +651,7 @@ void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst)
    typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] =
    {
        min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>, min_caller<int>,
        min_caller<uchar>, min_caller<schar>, min_caller<ushort>, min_caller<short>, min_caller<int>,
        min_caller<float>, min_caller<double>
    };
    funcs[src1.depth()](src1, src2, dst, 0);
...
...
@@ -662,7 +662,7 @@ void cv::gpu::min(const GpuMat& src1, double src2, GpuMat& dst, const Stream& st
    typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] =
    {
        min_caller<uchar>, min_caller<char>, min_caller<ushort>, min_caller<short>, min_caller<int>,
        min_caller<uchar>, min_caller<schar>, min_caller<ushort>, min_caller<short>, min_caller<int>,
        min_caller<float>, min_caller<double>
    };
    funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
...
...
@@ -673,7 +673,7 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst)
    typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] =
    {
        max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>, max_caller<int>,
        max_caller<uchar>, max_caller<schar>, max_caller<ushort>, max_caller<short>, max_caller<int>,
        max_caller<float>, max_caller<double>
    };
    funcs[src1.depth()](src1, src2, dst, 0);
...
...
@@ -684,7 +684,7 @@ void cv::gpu::max(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, const Str
    typedef void (*func_t)(const GpuMat& src1, const GpuMat& src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] =
    {
        max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>, max_caller<int>,
        max_caller<uchar>, max_caller<schar>, max_caller<ushort>, max_caller<short>, max_caller<int>,
        max_caller<float>, max_caller<double>
    };
    funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
...
...
@@ -695,7 +695,7 @@ void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst)
    typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] =
    {
        max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>, max_caller<int>,
        max_caller<uchar>, max_caller<schar>, max_caller<ushort>, max_caller<short>, max_caller<int>,
        max_caller<float>, max_caller<double>
    };
    funcs[src1.depth()](src1, src2, dst, 0);
...
...
@@ -706,7 +706,7 @@ void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst, const Stream& st
    typedef void (*func_t)(const GpuMat& src1, double src2, GpuMat& dst, cudaStream_t stream);
    static const func_t funcs[] =
    {
        max_caller<uchar>, max_caller<char>, max_caller<ushort>, max_caller<short>, max_caller<int>,
        max_caller<uchar>, max_caller<schar>, max_caller<ushort>, max_caller<short>, max_caller<int>,
        max_caller<float>, max_caller<double>
    };
    funcs[src1.depth()](src1, src2, dst, StreamAccessor::getStream(stream));
...
...
@@ -718,27 +718,48 @@ void cv::gpu::max(const GpuMat& src1, double src2, GpuMat& dst, const Stream& st
namespace cv { namespace gpu { namespace mathfunc
{
    template <typename T>
    void threshold_gpu(const DevMem2D& src, const DevMem2D& dst, float thresh, float maxVal, int type,
    void threshold_gpu(const DevMem2D& src, const DevMem2D& dst, T thresh, T maxVal, int type,
        cudaStream_t stream);
}}}

namespace
{
    template <typename T> void threshold_caller(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type,
        cudaStream_t stream = 0)
        cudaStream_t stream)
    {
        using namespace cv::gpu::mathfunc;
        mathfunc::threshold_gpu<T>(src, dst, saturate_cast<T>(thresh), saturate_cast<T>(maxVal), type, stream);
    }
}

typedef void (*caller_t)(const DevMem2D& src, const DevMem2D& dst, float thresh, float maxVal, int type,
double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type)
{
    if (src.type() == CV_32FC1 && type == THRESH_TRUNC)
    {
        dst.create(src.size(), src.type());

        NppiSize sz;
        sz.width  = src.cols;
        sz.height = src.rows;

        nppSafeCall( nppiThreshold_32f_C1R(src.ptr<Npp32f>(), src.step, dst.ptr<Npp32f>(), dst.step, sz, static_cast<Npp32f>(thresh), NPP_CMP_GREATER) );

        cudaSafeCall( cudaThreadSynchronize() );
    }
    else
    {
        typedef void (*caller_t)(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, cudaStream_t stream);

        static const caller_t callers[] =
        {
            threshold_gpu<unsigned char>, threshold_gpu<signed char>, threshold_gpu<unsigned short>, threshold_gpu<short>, threshold_gpu<int>, threshold_gpu<float>, 0
            threshold_caller<unsigned char>, threshold_caller<signed char>, threshold_caller<unsigned short>, threshold_caller<short>, threshold_caller<int>, threshold_caller<float>, threshold_caller<double>
        };

        CV_Assert(src.channels() == 1 && src.depth() < CV_64F);
        CV_Assert(src.channels() == 1 && src.depth() <= CV_64F);
        CV_Assert(type <= THRESH_TOZERO_INV);

        dst.create(src.size(), src.type());
...
...
@@ -749,36 +770,36 @@ namespace
            maxVal = cvRound(maxVal);
        }

        callers[src.depth()](src, dst, static_cast<float>(thresh), static_cast<float>(maxVal), type, stream);
        callers[src.depth()](src, dst, thresh, maxVal, type, 0);
    }

    return thresh;
}

double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type)
double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, const Stream& stream)
{
    if (src.type() == CV_32FC1 && type == THRESH_TRUNC)
    typedef void (*caller_t)(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, cudaStream_t stream);

    static const caller_t callers[] =
    {
        dst.create(src.size(), src.type());
        threshold_caller<unsigned char>, threshold_caller<signed char>, threshold_caller<unsigned short>, threshold_caller<short>, threshold_caller<int>, threshold_caller<float>, threshold_caller<double>
    };

    NppiSize sz;
    sz.width  = src.cols;
    sz.height = src.rows;

    CV_Assert(src.channels() == 1 && src.depth() <= CV_64F);
    CV_Assert(type <= THRESH_TOZERO_INV);

    nppSafeCall( nppiThreshold_32f_C1R(src.ptr<Npp32f>(), src.step, dst.ptr<Npp32f>(), dst.step, sz, static_cast<Npp32f>(thresh), NPP_CMP_GREATER) );

    dst.create(src.size(), src.type());

    cudaSafeCall( cudaThreadSynchronize() );
    }
    else
    if (src.depth() != CV_32F)
    {
        threshold_caller(src, dst, thresh, maxVal, type);
        thresh = cvFloor(thresh);
        maxVal = cvRound(maxVal);
    }

    return thresh;
}

double cv::gpu::threshold(const GpuMat& src, GpuMat& dst, double thresh, double maxVal, int type, const Stream& stream)
{
    threshold_caller(src, dst, thresh, maxVal, type, StreamAccessor::getStream(stream));
    callers[src.depth()](src, dst, thresh, maxVal, type, StreamAccessor::getStream(stream));

    return thresh;
}
...
...
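Note: the net effect of the two threshold hunks above is that cv::gpu::threshold now dispatches on the source depth through threshold_caller<T>, saturate_casting thresh/maxVal to the element type instead of forcing every depth through a lossy float path. A possible usage sketch (values are illustrative only):

cv::gpu::GpuMat src(256, 256, CV_16UC1), dst;

// For non-CV_32F inputs the threshold is floored and maxVal rounded before dispatch,
// and the (possibly floored) threshold actually applied is returned.
double used = cv::gpu::threshold(src, dst, 1000.7, 4000.0, cv::THRESH_BINARY);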
modules/gpu/src/imgproc_gpu.cpp
View file @
deac5d97
...
...
@@ -128,6 +128,8 @@ void cv::gpu::remap(const GpuMat& src, GpuMat& dst, const GpuMat& xmap, const Gp
void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr, TermCriteria criteria)
{
    CV_Assert(TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12));

    if (src.empty())
        CV_Error(CV_StsBadArg, "The input image is empty");
...
...
@@ -154,6 +156,8 @@ void cv::gpu::meanShiftFiltering(const GpuMat& src, GpuMat& dst, int sp, int sr,
void cv::gpu::meanShiftProc(const GpuMat& src, GpuMat& dstr, GpuMat& dstsp, int sp, int sr, TermCriteria criteria)
{
    CV_Assert(TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12));

    if (src.empty())
        CV_Error(CV_StsBadArg, "The input image is empty");
...
...
modules/gpu/src/matrix_operations.cpp
View file @
deac5d97
...
...
@@ -87,8 +87,10 @@ namespace cv
{
    void copy_to_with_mask(const DevMem2D& src, DevMem2D dst, int depth, const DevMem2D& mask, int channels, const cudaStream_t& stream = 0);

    void set_to_without_mask(DevMem2D dst, int depth, const double* scalar, int channels, const cudaStream_t& stream = 0);
    void set_to_with_mask(DevMem2D dst, int depth, const double* scalar, const DevMem2D& mask, int channels, const cudaStream_t& stream = 0);

    template <typename T> void set_to_gpu(const DevMem2D& mat, const T* scalar, int channels, cudaStream_t stream);
    template <typename T> void set_to_gpu(const DevMem2D& mat, const T* scalar, const DevMem2D& mask, int channels, cudaStream_t stream);

    void convert_gpu(const DevMem2D& src, int sdepth, const DevMem2D& dst, int ddepth, double alpha, double beta, cudaStream_t stream = 0);
}
...
...
@@ -363,9 +365,11 @@ namespace
        }
    };

    template <typename T> void kernelSet(GpuMat& src, const Scalar& s)
    {
        matrix_operations::set_to_without_mask(src, src.depth(), s.val, src.channels());
        Scalar_<T> sf = s;
        matrix_operations::set_to_gpu(src, sf.val, src.channels(), 0);
    }

    template <int SDEPTH, int SCN> struct NppSetMaskFunc
...
...
@@ -412,9 +416,11 @@ namespace
        }
    };

    template <typename T> void kernelSetMask(GpuMat& src, const Scalar& s, const GpuMat& mask)
    {
        matrix_operations::set_to_with_mask(src, src.depth(), s.val, mask, src.channels());
        Scalar_<T> sf = s;
        matrix_operations::set_to_gpu(src, sf.val, mask, src.channels(), 0);
    }
}
...
...
@@ -433,13 +439,13 @@ GpuMat& GpuMat::setTo(const Scalar& s, const GpuMat& mask)
    typedef void (*set_caller_t)(GpuMat& src, const Scalar& s);
    static const set_caller_t set_callers[8][4] =
    {
        { NppSet<CV_8U, 1, nppiSet_8u_C1R>::set, kernelSet, kernelSet, NppSet<CV_8U, 4, nppiSet_8u_C4R>::set },
        { kernelSet, kernelSet, kernelSet, kernelSet },
        { NppSet<CV_16U, 1, nppiSet_16u_C1R>::set, kernelSet, kernelSet, NppSet<CV_16U, 4, nppiSet_16u_C4R>::set },
        { NppSet<CV_16S, 1, nppiSet_16s_C1R>::set, kernelSet, kernelSet, NppSet<CV_16S, 4, nppiSet_16s_C4R>::set },
        { NppSet<CV_32S, 1, nppiSet_32s_C1R>::set, kernelSet, kernelSet, NppSet<CV_32S, 4, nppiSet_32s_C4R>::set },
        { NppSet<CV_32F, 1, nppiSet_32f_C1R>::set, kernelSet, kernelSet, NppSet<CV_32F, 4, nppiSet_32f_C4R>::set },
        { kernelSet, kernelSet, kernelSet, kernelSet },
        { NppSet<CV_8U, 1, nppiSet_8u_C1R>::set, kernelSet<uchar>, kernelSet<uchar>, NppSet<CV_8U, 4, nppiSet_8u_C4R>::set },
        { kernelSet<schar>, kernelSet<schar>, kernelSet<schar>, kernelSet<schar> },
        { NppSet<CV_16U, 1, nppiSet_16u_C1R>::set, kernelSet<ushort>, kernelSet<ushort>, NppSet<CV_16U, 4, nppiSet_16u_C4R>::set },
        { NppSet<CV_16S, 1, nppiSet_16s_C1R>::set, kernelSet<short>, kernelSet<short>, NppSet<CV_16S, 4, nppiSet_16s_C4R>::set },
        { NppSet<CV_32S, 1, nppiSet_32s_C1R>::set, kernelSet<int>, kernelSet<int>, NppSet<CV_32S, 4, nppiSet_32s_C4R>::set },
        { NppSet<CV_32F, 1, nppiSet_32f_C1R>::set, kernelSet<float>, kernelSet<float>, NppSet<CV_32F, 4, nppiSet_32f_C4R>::set },
        { kernelSet<double>, kernelSet<double>, kernelSet<double>, kernelSet<double> },
        { 0, 0, 0, 0 }
    };
    set_callers[depth()][channels() - 1](*this, s);
...
...
@@ -449,13 +455,13 @@ GpuMat& GpuMat::setTo(const Scalar& s, const GpuMat& mask)
    typedef void (*set_caller_t)(GpuMat& src, const Scalar& s, const GpuMat& mask);
    static const set_caller_t set_callers[8][4] =
    {
        { NppSetMask<CV_8U, 1, nppiSet_8u_C1MR>::set, kernelSetMask, kernelSetMask, NppSetMask<CV_8U, 4, nppiSet_8u_C4MR>::set },
        { kernelSetMask, kernelSetMask, kernelSetMask, kernelSetMask },
        { NppSetMask<CV_16U, 1, nppiSet_16u_C1MR>::set, kernelSetMask, kernelSetMask, NppSetMask<CV_16U, 4, nppiSet_16u_C4MR>::set },
        { NppSetMask<CV_16S, 1, nppiSet_16s_C1MR>::set, kernelSetMask, kernelSetMask, NppSetMask<CV_16S, 4, nppiSet_16s_C4MR>::set },
        { NppSetMask<CV_32S, 1, nppiSet_32s_C1MR>::set, kernelSetMask, kernelSetMask, NppSetMask<CV_32S, 4, nppiSet_32s_C4MR>::set },
        { NppSetMask<CV_32F, 1, nppiSet_32f_C1MR>::set, kernelSetMask, kernelSetMask, NppSetMask<CV_32F, 4, nppiSet_32f_C4MR>::set },
        { kernelSetMask, kernelSetMask, kernelSetMask, kernelSetMask },
        { NppSetMask<CV_8U, 1, nppiSet_8u_C1MR>::set, kernelSetMask<uchar>, kernelSetMask<uchar>, NppSetMask<CV_8U, 4, nppiSet_8u_C4MR>::set },
        { kernelSetMask<schar>, kernelSetMask<schar>, kernelSetMask<schar>, kernelSetMask<schar> },
        { NppSetMask<CV_16U, 1, nppiSet_16u_C1MR>::set, kernelSetMask<ushort>, kernelSetMask<ushort>, NppSetMask<CV_16U, 4, nppiSet_16u_C4MR>::set },
        { NppSetMask<CV_16S, 1, nppiSet_16s_C1MR>::set, kernelSetMask<short>, kernelSetMask<short>, NppSetMask<CV_16S, 4, nppiSet_16s_C4MR>::set },
        { NppSetMask<CV_32S, 1, nppiSet_32s_C1MR>::set, kernelSetMask<int>, kernelSetMask<int>, NppSetMask<CV_32S, 4, nppiSet_32s_C4MR>::set },
        { NppSetMask<CV_32F, 1, nppiSet_32f_C1MR>::set, kernelSetMask<float>, kernelSetMask<float>, NppSetMask<CV_32F, 4, nppiSet_32f_C4MR>::set },
        { kernelSetMask<double>, kernelSetMask<double>, kernelSetMask<double>, kernelSetMask<double> },
        { 0, 0, 0, 0 }
    };
    set_callers[depth()][channels() - 1](*this, s, mask);
...
...
modules/gpu/src/mssegmentation.cpp
View file @
deac5d97
...
...
@@ -227,6 +227,8 @@ inline int dist2(const cv::Vec2s& lhs, const cv::Vec2s& rhs)
void cv::gpu::meanShiftSegmentation(const GpuMat& src, Mat& dst, int sp, int sr, int minsize, TermCriteria criteria)
{
    CV_Assert(TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12));

    CV_Assert(src.type() == CV_8UC4);
    const int nrows = src.rows;
    const int ncols = src.cols;
...
...
modules/gpu/src/opencv2/gpu/device/border_interpolate.hpp
View file @
deac5d97
...
...
@@ -40,6 +40,9 @@
//
//M*/

#include "opencv2/gpu/device/saturate_cast.hpp"
#include "opencv2/gpu/device/vecmath.hpp"

namespace cv
{
    namespace gpu
...
...
@@ -48,7 +51,7 @@ namespace cv
    {
        struct BrdReflect101
        {
            BrdReflect101(int len): last(len - 1) {}
            explicit BrdReflect101(int len): last(len - 1) {}

            __device__ int idx_low(int i) const
            {
...
...
@@ -62,7 +65,7 @@ namespace cv
            __device__ int idx(int i) const
            {
                return abs(idx_high(i));
                return idx_low(idx_high(i));
            }

            bool is_range_safe(int mini, int maxi) const
...
...
@@ -70,49 +73,55 @@ namespace cv
                return -last <= mini && maxi <= 2 * last;
            }

        private:
            int last;
        };

        template <typename T>
        template <typename D>
        struct BrdRowReflect101: BrdReflect101
        {
            BrdRowReflect101(int len): BrdReflect101(len) {}
            explicit BrdRowReflect101(int len): BrdReflect101(len) {}

            __device__ float at_low(int i, const T* data) const
            template <typename T> __device__ D at_low(int i, const T* data) const
            {
                return data[idx_low(i)];
                return saturate_cast<D>(data[idx_low(i)]);
            }

            __device__ float at_high(int i, const T* data) const
            template <typename T> __device__ D at_high(int i, const T* data) const
            {
                return data[idx_high(i)];
                return saturate_cast<D>(data[idx_high(i)]);
            }
        };

        template <typename T>
        template <typename D>
        struct BrdColReflect101: BrdReflect101
        {
            BrdColReflect101(int len, int step): BrdReflect101(len), step(step) {}

            __device__ float at_low(int i, const T* data) const
            template <typename T> __device__ D at_low(int i, const T* data) const
            {
                return data[idx_low(i) * step];
                return saturate_cast<D>(data[idx_low(i) * step]);
            }

            __device__ float at_high(int i, const T* data) const
            template <typename T> __device__ D at_high(int i, const T* data) const
            {
                return data[idx_high(i) * step];
                return saturate_cast<D>(data[idx_high(i) * step]);
            }

        private:
            int step;
        };

        struct BrdReplicate
        {
            BrdReplicate(int len): last(len - 1) {}
            explicit BrdReplicate(int len): last(len - 1) {}

            __device__ int idx_low(int i) const
            {
...
...
@@ -126,7 +135,7 @@ namespace cv
            __device__ int idx(int i) const
            {
                return max(min(i, last), 0);
                return idx_low(idx_high(i));
            }

            bool is_range_safe(int mini, int maxi) const
...
...
@@ -134,42 +143,104 @@ namespace cv
                return true;
            }

        private:
            int last;
        };

        template <typename T>
        template <typename D>
        struct BrdRowReplicate: BrdReplicate
        {
            BrdRowReplicate(int len): BrdReplicate(len) {}
            explicit BrdRowReplicate(int len): BrdReplicate(len) {}

            __device__ float at_low(int i, const T* data) const
            template <typename T> __device__ D at_low(int i, const T* data) const
            {
                return data[idx_low(i)];
                return saturate_cast<D>(data[idx_low(i)]);
            }

            __device__ float at_high(int i, const T* data) const
            template <typename T> __device__ D at_high(int i, const T* data) const
            {
                return data[idx_high(i)];
                return saturate_cast<D>(data[idx_high(i)]);
            }
        };

        template <typename T>
        template <typename D>
        struct BrdColReplicate: BrdReplicate
        {
            BrdColReplicate(int len, int step): BrdReplicate(len), step(step) {}

            __device__ float at_low(int i, const T* data) const
            template <typename T> __device__ D at_low(int i, const T* data) const
            {
                return data[idx_low(i) * step];
                return saturate_cast<D>(data[idx_low(i) * step]);
            }

            __device__ float at_high(int i, const T* data) const
            template <typename T> __device__ D at_high(int i, const T* data) const
            {
                return data[idx_high(i) * step];
                return saturate_cast<D>(data[idx_high(i) * step]);
            }

        private:
            int step;
        };

        template <typename D>
        struct BrdRowConstant
        {
            explicit BrdRowConstant(int len_, const D& val_ = VecTraits<D>::all(0)): len(len_), val(val_) {}

            template <typename T> __device__ D at_low(int i, const T* data) const
            {
                return i >= 0 ? saturate_cast<D>(data[i]) : val;
            }

            template <typename T> __device__ D at_high(int i, const T* data) const
            {
                return i < len ? saturate_cast<D>(data[i]) : val;
            }

            bool is_range_safe(int mini, int maxi) const
            {
                return true;
            }

        private:
            int len;
            D val;
        };

        template <typename D>
        struct BrdColConstant
        {
            BrdColConstant(int len_, int step_, const D& val_ = VecTraits<D>::all(0)): len(len_), step(step_), val(val_) {}

            template <typename T> __device__ D at_low(int i, const T* data) const
            {
                return i >= 0 ? saturate_cast<D>(data[i * step]) : val;
            }

            template <typename T> __device__ D at_high(int i, const T* data) const
            {
                return i < len ? saturate_cast<D>(data[i * step]) : val;
            }

            bool is_range_safe(int mini, int maxi) const
            {
                return true;
            }

        private:
            int len;
            int step;
            D val;
        };
    }
}
...
...
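Note: the border helpers above now template on a destination type D and convert every fetch through saturate_cast, with BrdRowConstant/BrdColConstant added for constant borders. A minimal sketch of how such a row border object can be used; the kernel below is illustrative only and not part of the diff, and it assumes the device saturate_cast from saturate_cast.hpp is in scope.

// Sums each pixel with its two row neighbours, letting the border object
// handle the out-of-range reads at both ends of the row.
template <typename T, typename D, typename B>
__global__ void sum3_row(const T* row, D* dst, int width, B brd)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    if (x < width)
    {
        D sum = brd.at_low(x - 1, row);        // may fall off the left edge
        sum = sum + (D)row[x];                 // always in range
        sum = sum + brd.at_high(x + 1, row);   // may fall off the right edge
        dst[x] = sum;
    }
}

// host side (assumed instantiation): BrdRowReflect101<float> brd(width);
// sum3_row<uchar, float><<<grid, block>>>(d_src, d_dst, width, brd);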
modules/gpu/src/opencv2/gpu/device/transform.hpp
View file @
deac5d97
...
...
@@ -329,6 +329,7 @@ namespace cv
            grid.y = divUp(src.rows, threads.y);

            device::transformSimple<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -345,6 +346,7 @@ namespace cv
            grid.y = divUp(src1.rows, threads.y);

            device::transformSimple<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -365,6 +367,7 @@ namespace cv
            grid.y = divUp(src.rows, threads.y);

            device::transformSmart<T, D><<<grid, threads, 0, stream>>>(src, dst, mask, op);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaThreadSynchronize() );
...
...
@@ -383,6 +386,7 @@ namespace cv
            grid.y = divUp(src1.rows, threads.y);

            device::transformSmart<T1, T2, D><<<grid, threads, 0, stream>>>(src1, src2, dst, mask, op);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaThreadSynchronize() );
...
...
modules/gpu/src/surf.cpp
View file @
deac5d97
...
...
@@ -65,6 +65,7 @@ namespace cv { namespace gpu { namespace surf
    dim3 calcBlockSize(int nIntervals);

    void fasthessian_gpu(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads);
    void fasthessian_gpu_old(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threadsOld);

    void nonmaxonly_gpu(PtrStepf hessianBuffer, int4* maxPosBuffer, unsigned int& maxCounter, int x_size, int y_size, bool use_mask, const dim3& threads);
...
...
@@ -75,6 +76,7 @@ namespace cv { namespace gpu { namespace surf
    void find_orientation_gpu(KeyPoint_GPU* features, int nFeatures);

    void compute_descriptors_gpu(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures);
    void compute_descriptors_gpu_old(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures);
}}}

using namespace cv::gpu::surf;
...
...
@@ -170,6 +172,10 @@ namespace
    void detectKeypoints(GpuMat& keypoints)
    {
        typedef void (*fasthessian_t)(PtrStepf hessianBuffer, int x_size, int y_size, const dim3& threads);
        const fasthessian_t fasthessian = DeviceInfo().supports(COMPUTE_13) ? fasthessian_gpu : fasthessian_gpu_old;

        dim3 threads = calcBlockSize(nIntervals);
        for(int octave = 0; octave < nOctaves; ++octave)
        {
...
...
@@ -192,7 +198,7 @@ namespace
            uploadConstant("cv::gpu::surf::c_border", border);
            uploadConstant("cv::gpu::surf::c_step",   step);

            fasthessian_gpu(hessianBuffer, x_size, y_size, threads);
            fasthessian(hessianBuffer, x_size, y_size, threads);

            // Reset the candidate count.
            maxCounter = 0;
...
...
@@ -201,10 +207,13 @@ namespace
            maxCounter = std::min(maxCounter, static_cast<unsigned int>(max_candidates));

            fh_interp_extremum_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter, featuresBuffer.ptr<KeyPoint_GPU>(), featureCounter);
            if (maxCounter > 0)
            {
                fh_interp_extremum_gpu(hessianBuffer, maxPosBuffer.ptr<int4>(), maxCounter, featuresBuffer.ptr<KeyPoint_GPU>(), featureCounter);

                featureCounter = std::min(featureCounter, static_cast<unsigned int>(max_features));
                featureCounter = std::min(featureCounter, static_cast<unsigned int>(max_features));
            }
        }

        if (featureCounter > 0)
...
...
@@ -221,10 +230,16 @@ namespace
    void computeDescriptors(const GpuMat& keypoints, GpuMat& descriptors, int descriptorSize)
    {
        typedef void (*compute_descriptors_t)(const DevMem2Df& descriptors, const KeyPoint_GPU* features, int nFeatures);
        const compute_descriptors_t compute_descriptors = DeviceInfo().supports(COMPUTE_13) ? compute_descriptors_gpu : compute_descriptors_gpu_old;

        if (keypoints.cols > 0)
        {
            descriptors.create(keypoints.cols, descriptorSize, CV_32F);
            compute_descriptors_gpu(descriptors, keypoints.ptr<KeyPoint_GPU>(), keypoints.cols);
            compute_descriptors(descriptors, keypoints.ptr<KeyPoint_GPU>(), keypoints.cols);
        }
    }
...
...
tests/gpu/src/brute_force_matcher.cpp
View file @
deac5d97
...
...
@@ -384,6 +384,14 @@ void CV_GpuBruteForceMatcherTest::knnMatchTest( const GpuMat& query, const GpuMa
void CV_GpuBruteForceMatcherTest::radiusMatchTest( const GpuMat& query, const GpuMat& train )
{
    bool atomics_ok = TargetArchs::builtWith(ATOMICS) && DeviceInfo().supports(ATOMICS);
    if (!atomics_ok)
    {
        ts->printf(CvTS::CONSOLE, "\nCode and device atomics support is required for radiusMatch (CC >= 1.1)");
        ts->set_failed_test_info(CvTS::FAIL_GENERIC);
        return;
    }

    dmatcher.clear();
    // test const version of match()
    {
...
...
@@ -501,15 +509,24 @@ void CV_GpuBruteForceMatcherTest::dataTest(int dim)
void CV_GpuBruteForceMatcherTest::run( int )
{
    emptyDataTest();

    dataTest(50);
    dataTest(64);
    dataTest(100);
    dataTest(128);
    dataTest(200);
    dataTest(256);
    dataTest(300);
    try
    {
        emptyDataTest();

        dataTest(50);
        dataTest(64);
        dataTest(100);
        dataTest(128);
        dataTest(200);
        dataTest(256);
        dataTest(300);
    }
    catch (cv::Exception& e)
    {
        if (!check_and_treat_gpu_exception(e, ts))
            throw;
        return;
    }
}

CV_GpuBruteForceMatcherTest CV_GpuBruteForceMatcher_test;
tests/gpu/src/features2d.cpp
View file @
deac5d97
...
...
@@ -154,7 +154,7 @@ void CV_GPU_SURFTest::compareKeypointSets(const vector<KeyPoint>& validKeypoints
        return;
    }
    if (norm(validDescriptors.row(v), calcDescriptors.row(nearestIdx), NORM_L2) > 1.0f)
    if (norm(validDescriptors.row(v), calcDescriptors.row(nearestIdx), NORM_L2) > 1.5f)
    {
        ts->printf(CvTS::LOG, "Bad descriptors accuracy.\n");
        ts->set_failed_test_info(CvTS::FAIL_BAD_ACCURACY);
...
...
@@ -221,10 +221,19 @@ void CV_GPU_SURFTest::regressionTest(SURF_GPU& fdetector)
void CV_GPU_SURFTest::run( int /*start_from*/ )
{
    SURF_GPU fdetector;
    try
    {
        SURF_GPU fdetector;

        emptyDataTest(fdetector);
        regressionTest(fdetector);
        emptyDataTest(fdetector);
        regressionTest(fdetector);
    }
    catch (cv::Exception& e)
    {
        if (!check_and_treat_gpu_exception(e, ts))
            throw;
        return;
    }
}

CV_GPU_SURFTest CV_GPU_SURF_test;
tests/gpu/src/gputest_main.cpp
View file @
deac5d97
...
...
@@ -43,15 +43,15 @@
CvTS test_system("gpu");

const char* blacklist[] =
{
    "GPU-NppImageCanny",    // NPP_TEXTURE_BIND_ERROR
    0
};

//const char* blacklist[] =
//{
//    "GPU-NVidia",
//    0
//};

int main( int argc, char** argv )
{
    return test_system.run( argc, argv, blacklist );
    return test_system.run( argc, argv );
}

/* End of file. */
tests/gpu/src/meanshift.cpp
View file @
deac5d97
...
...
@@ -43,6 +43,9 @@
#include <iostream>
#include <string>

using namespace cv;
using namespace cv::gpu;

struct CV_GpuMeanShiftTest : public CvTest
{
...
...
@@ -50,6 +53,14 @@ struct CV_GpuMeanShiftTest : public CvTest
    void run(int)
    {
        bool cc12_ok = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12);
        if (!cc12_ok)
        {
            ts->printf(CvTS::CONSOLE, "\nCompute capability 1.2 is required");
            ts->set_failed_test_info(CvTS::FAIL_GENERIC);
            return;
        }

        int spatialRad = 30;
        int colorRad = 30;
...
...
@@ -134,6 +145,14 @@ struct CV_GpuMeanShiftProcTest : public CvTest
    void run(int)
    {
        bool cc12_ok = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12);
        if (!cc12_ok)
        {
            ts->printf(CvTS::CONSOLE, "\nCompute capability 1.2 is required");
            ts->set_failed_test_info(CvTS::FAIL_GENERIC);
            return;
        }

        int spatialRad = 30;
        int colorRad = 30;
...
...
tests/gpu/src/mssegmentation.cpp
View file @
deac5d97
...
...
@@ -54,6 +54,14 @@ struct CV_GpuMeanShiftSegmentationTest : public CvTest {
    {
        try
        {
            bool cc12_ok = TargetArchs::builtWith(COMPUTE_12) && DeviceInfo().supports(COMPUTE_12);
            if (!cc12_ok)
            {
                ts->printf(CvTS::CONSOLE, "\nCompute capability 1.2 is required");
                ts->set_failed_test_info(CvTS::FAIL_GENERIC);
                return;
            }

            Mat img_rgb = imread(string(ts->get_data_path()) + "meanshift/cones.png");
            if (img_rgb.empty())
            {
...
...
tests/gpu/src/operator_convert_to.cpp
View file @
deac5d97
...
...
@@ -91,14 +91,14 @@ void CV_GpuMatOpConvertToTest::run(int /* start_from */)
        Mat cpumatdst;
        GpuMat gpumatdst;

        cpumatsrc.convertTo(cpumatdst, dst_type);
        gpumatsrc.convertTo(gpumatdst, dst_type);
        cpumatsrc.convertTo(cpumatdst, dst_type, 0.5, 3.0);
        gpumatsrc.convertTo(gpumatdst, dst_type, 0.5, 3.0);

        double r = norm(cpumatdst, gpumatdst, NORM_INF);
        if (r > 1)
        {
            ts->printf(CvTS::LOG,
                       "\nFAILED: SRC_TYPE=%sC%d DST_TYPE=%s NORM = %d\n",
                       "\nFAILED: SRC_TYPE=%sC%d DST_TYPE=%s NORM = %f\n",
                       types_str[i], c, types_str[j], r);
            passed = false;
        }
...
...