submodule / opencv / Commits / d3c4e907

Commit d3c4e907, authored Oct 12, 2011 by Vladislav Vinogradov
new optimized implementation of BruteForceMatcher_GPU (~2-3x faster)
parent 89be84a3
Showing 10 changed files with 2562 additions and 1066 deletions (+2562 -1066)
modules/gpu/include/opencv2/gpu/gpu.hpp  +45 -43
modules/gpu/perf/perf_features2d.cpp  +7 -7
modules/gpu/src/brute_force_matcher.cpp  +384 -231
modules/gpu/src/cuda/bf_knnmatch.cu  +902 -222
modules/gpu/src/cuda/bf_match.cu  +583 -223
modules/gpu/src/cuda/bf_radius_match.cu  +274 -330
modules/gpu/src/opencv2/gpu/device/detail/utility_detail.hpp  +334 -0
modules/gpu/src/opencv2/gpu/device/utility.hpp  +7 -1
modules/stitching/src/matchers.cpp  +2 -2
samples/gpu/performance/tests.cpp  +24 -7
modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -1221,26 +1221,24 @@ namespace cv
     explicit BruteForceMatcher_GPU_base(DistType distType = L2Dist);

-    // Add descriptors to train descriptor collection.
+    // Add descriptors to train descriptor collection
     void add(const std::vector<GpuMat>& descCollection);

-    // Get train descriptors collection.
+    // Get train descriptors collection
     const std::vector<GpuMat>& getTrainDescriptors() const;

-    // Clear train descriptors collection.
+    // Clear train descriptors collection
     void clear();

-    // Return true if there are not train descriptors in collection.
+    // Return true if there are not train descriptors in collection
     bool empty() const;

-    // Return true if the matcher supports mask in match methods.
+    // Return true if the matcher supports mask in match methods
     bool isMaskSupported() const;

-    // Find one best match for each query descriptor.
-    // trainIdx.at<int>(0, queryIdx) will contain best train index for queryIdx
-    // distance.at<float>(0, queryIdx) will contain distance
-    void matchSingle(const GpuMat& queryDescs, const GpuMat& trainDescs, GpuMat& trainIdx, GpuMat& distance,
+    // Find one best match for each query descriptor
+    void matchSingle(const GpuMat& query, const GpuMat& train, GpuMat& trainIdx, GpuMat& distance,
         const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());

     // Download trainIdx and distance and convert it to CPU vector with DMatch
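For orientation, a minimal usage sketch of the single-image path declared above (the data sizes and randu fill are hypothetical; matchSingle and the static matchDownload are taken from the declarations in this hunk):

    // Sketch: match 1000 query descriptors against 5000 train descriptors (64-dim, CV_32F).
    cv::Mat queryCPU(1000, 64, CV_32F), trainCPU(5000, 64, CV_32F);
    cv::randu(queryCPU, cv::Scalar::all(0), cv::Scalar::all(1));
    cv::randu(trainCPU, cv::Scalar::all(0), cv::Scalar::all(1));

    cv::gpu::GpuMat query(queryCPU), train(trainCPU);      // upload to the device

    cv::gpu::BruteForceMatcher_GPU_base matcher;            // defaults to L2Dist
    cv::gpu::GpuMat trainIdx, distance;                     // filled as 1 x nQuery (CV_32S / CV_32F)
    matcher.matchSingle(query, train, trainIdx, distance);

    std::vector<cv::DMatch> matches;
    cv::gpu::BruteForceMatcher_GPU_base::matchDownload(trainIdx, distance, matches);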
@@ -1248,21 +1246,16 @@ namespace cv
     // Convert trainIdx and distance to vector with DMatch
     static void matchConvert(const Mat& trainIdx, const Mat& distance, std::vector<DMatch>& matches);

-    // Find one best match for each query descriptor.
-    void match(const GpuMat& queryDescs, const GpuMat& trainDescs, std::vector<DMatch>& matches, const GpuMat& mask = GpuMat());
+    // Find one best match for each query descriptor
+    void match(const GpuMat& query, const GpuMat& train, std::vector<DMatch>& matches, const GpuMat& mask = GpuMat());

     // Make gpu collection of trains and masks in suitable format for matchCollection function
-    void makeGpuCollection(GpuMat& trainCollection, GpuMat& maskCollection, const vector<GpuMat>& masks = std::vector<GpuMat>());
+    void makeGpuCollection(GpuMat& trainCollection, GpuMat& maskCollection, const std::vector<GpuMat>& masks = std::vector<GpuMat>());

-    // Find one best match from train collection for each query descriptor.
-    // trainIdx.at<int>(0, queryIdx) will contain best train index for queryIdx
-    // imgIdx.at<int>(0, queryIdx) will contain best image index for queryIdx
-    // distance.at<float>(0, queryIdx) will contain distance
-    void matchCollection(const GpuMat& queryDescs, const GpuMat& trainCollection,
+    // Find one best match from train collection for each query descriptor
+    void matchCollection(const GpuMat& query, const GpuMat& trainCollection,
         GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,
-        const GpuMat& maskCollection, Stream& stream = Stream::Null());
+        const GpuMat& masks = GpuMat(), Stream& stream = Stream::Null());

     // Download trainIdx, imgIdx and distance and convert it to vector with DMatch
     static void matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, std::vector<DMatch>& matches);
@@ -1270,17 +1263,12 @@ namespace cv
     static void matchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, std::vector<DMatch>& matches);

     // Find one best match from train collection for each query descriptor.
-    void match(const GpuMat& queryDescs, std::vector<DMatch>& matches, const std::vector<GpuMat>& masks = std::vector<GpuMat>());
+    void match(const GpuMat& query, std::vector<DMatch>& matches, const std::vector<GpuMat>& masks = std::vector<GpuMat>());

-    // Find k best matches for each query descriptor (in increasing order of distances).
-    // trainIdx.at<int>(queryIdx, i) will contain index of i'th best trains (i < k).
-    // distance.at<float>(queryIdx, i) will contain distance.
-    // allDist is a buffer to store all distance between query descriptors and train descriptors
-    // it have size (nQuery,nTrain) and CV_32F type
-    // allDist.at<float>(queryIdx, trainIdx) will contain FLT_MAX, if trainIdx is one from k best,
-    // otherwise it will contain distance between queryIdx and trainIdx descriptors
-    void knnMatch(const GpuMat& queryDescs, const GpuMat& trainDescs,
+    // Find k best matches for each query descriptor (in increasing order of distances)
+    void knnMatchSingle(const GpuMat& query, const GpuMat& train,
         GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k,
         const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());

     // Download trainIdx and distance and convert it to vector with DMatch
     // compactResult is used when mask is not empty. If compactResult is false matches
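Continuing the sketch above for the renamed k-NN entry point: knnMatchSingle fills GPU-side buffers (allDist is the nQuery x nTrain scratch buffer described in the removed comments), and the static knnMatchDownload declared around this hunk converts them on the host. A hedged sketch:

    cv::gpu::GpuMat trainIdx, distance, allDist;
    matcher.knnMatchSingle(query, train, trainIdx, distance, allDist, 2);   // k = 2

    std::vector< std::vector<cv::DMatch> > knnMatches;
    cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(trainIdx, distance, knnMatches);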
@@ -1296,27 +1284,40 @@ namespace cv
     // compactResult is used when mask is not empty. If compactResult is false matches
     // vector will have the same size as queryDescriptors rows. If compactResult is true
     // matches vector will not contain matches for fully masked out query descriptors.
-    void knnMatch(const GpuMat& queryDescs, const GpuMat& trainDescs,
+    void knnMatch(const GpuMat& query, const GpuMat& train,
         std::vector< std::vector<DMatch> >& matches, int k,
         const GpuMat& mask = GpuMat(), bool compactResult = false);

+    // Find k best matches from train collection for each query descriptor (in increasing order of distances)
+    void knnMatch2Collection(const GpuMat& query, const GpuMat& trainCollection,
+        GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,
+        const GpuMat& maskCollection = GpuMat(), Stream& stream = Stream::Null());
+
+    // Download trainIdx and distance and convert it to vector with DMatch
+    // compactResult is used when mask is not empty. If compactResult is false matches
+    // vector will have the same size as queryDescriptors rows. If compactResult is true
+    // matches vector will not contain matches for fully masked out query descriptors.
+    static void knnMatch2Download(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance,
+        std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
+
+    // Convert trainIdx and distance to vector with DMatch
+    static void knnMatch2Convert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance,
+        std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
+
     // Find k best matches for each query descriptor (in increasing order of distances).
     // compactResult is used when mask is not empty. If compactResult is false matches
     // vector will have the same size as queryDescriptors rows. If compactResult is true
     // matches vector will not contain matches for fully masked out query descriptors.
-    void knnMatch(const GpuMat& queryDescs,
-        std::vector< std::vector<DMatch> >& matches, int knn,
+    void knnMatch(const GpuMat& query,
+        std::vector< std::vector<DMatch> >& matches, int k,
         const std::vector<GpuMat>& masks = std::vector<GpuMat>(), bool compactResult = false);

     // Find best matches for each query descriptor which have distance less than maxDistance.
     // nMatches.at<int>(0, queryIdx) will contain matches count for queryIdx.
     // carefully nMatches can be greater than trainIdx.cols - it means that matcher didn't find all matches,
     // because it didn't have enough memory.
     // trainIdx.at<int>(queruIdx, i) will contain ith train index (i < min(nMatches.at<int>(0, queruIdx), trainIdx.cols))
     // distance.at<int>(queruIdx, i) will contain ith distance (i < min(nMatches.at<int>(0, queruIdx), trainIdx.cols))
-    // If trainIdx is empty, then trainIdx and distance will be created with size nQuery x (nTrain / 2),
+    // If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nTrain / 100), 10),
     // otherwize user can pass own allocated trainIdx and distance with size nQuery x nMaxMatches
     // Matches doesn't sorted.
-    void radiusMatchSingle(const GpuMat& queryDescs, const GpuMat& trainDescs,
+    void radiusMatchSingle(const GpuMat& query, const GpuMat& train,
         GpuMat& trainIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance,
         const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());
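The k = 2 case this commit special-cases is what a Lowe-style ratio test consumes; a sketch of that consumer (the 0.8 threshold is an assumption of the example, not part of this commit):

    std::vector< std::vector<cv::DMatch> > knn;
    matcher.knnMatch(query, train, knn, 2);

    std::vector<cv::DMatch> good;
    for (size_t i = 0; i < knn.size(); ++i)
    {
        // keep a match only when the best candidate is clearly better than the runner-up
        if (knn[i].size() == 2 && knn[i][0].distance < 0.8f * knn[i][1].distance)
            good.push_back(knn[i][0]);
    }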
@@ -1333,15 +1334,16 @@ namespace cv
     // Find best matches for each query descriptor which have distance less than maxDistance
     // in increasing order of distances).
-    void radiusMatch(const GpuMat& queryDescs, const GpuMat& trainDescs,
+    void radiusMatch(const GpuMat& query, const GpuMat& train,
         std::vector< std::vector<DMatch> >& matches, float maxDistance,
         const GpuMat& mask = GpuMat(), bool compactResult = false);

+    // Find best matches for each query descriptor which have distance less than maxDistance.
+    // If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nQuery / 100), 10),
+    // otherwize user can pass own allocated trainIdx and distance with size nQuery x nMaxMatches
+    // Matches doesn't sorted.
-    void radiusMatchCollection(const GpuMat& queryDescs, const GpuMat& trainCollection,
-        GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance,
-        const GpuMat& maskCollection, Stream& stream = Stream::Null());
+    void radiusMatchCollection(const GpuMat& query,
+        GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance,
+        const std::vector<GpuMat>& masks = std::vector<GpuMat>(), Stream& stream = Stream::Null());

     // Download trainIdx, imgIdx, nMatches and distance and convert it to vector with DMatch.
     // matches will be sorted in increasing order of distances.
@@ -1356,7 +1358,7 @@ namespace cv
     // Find best matches from train collection for each query descriptor which have distance less than
     // maxDistance (in increasing order of distances).
-    void radiusMatch(const GpuMat& queryDescs, std::vector< std::vector<DMatch> >& matches, float maxDistance,
+    void radiusMatch(const GpuMat& query, std::vector< std::vector<DMatch> >& matches, float maxDistance,
         const std::vector<GpuMat>& masks = std::vector<GpuMat>(), bool compactResult = false);

     DistType distType;
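And the radius path, for completeness. Note that the perf test in the next file gates it on atomics support, and the comments above state the raw per-query results are not sorted; a sketch continuing the hypothetical query/train pair:

    std::vector< std::vector<cv::DMatch> > radiusMatches;
    matcher.radiusMatch(query, train, radiusMatches, 0.5f);   // all matches with distance < 0.5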
modules/gpu/perf/perf_features2d.cpp
 #include "perf_precomp.hpp"

 PERF_TEST_P(DevInfo_DescSize, BruteForceMatcher_match, testing::Combine(testing::ValuesIn(devices()),
-    testing::Values(64, 128)))
+    testing::Values(64, 128, 256)))
 {
     DeviceInfo devInfo = std::tr1::get<0>(GetParam());
     int desc_size = std::tr1::get<1>(GetParam());
@@ -19,7 +19,7 @@ PERF_TEST_P(DevInfo_DescSize, BruteForceMatcher_match, testing::Combine(testing:
     BruteForceMatcher_GPU< L2<float> > matcher;

-    declare.time(0.5).iterations(100);
+    declare.time(3.0);

     SIMPLE_TEST_CYCLE()
     {
@@ -35,7 +35,7 @@ PERF_TEST_P(DevInfo_DescSize, BruteForceMatcher_match, testing::Combine(testing:
 PERF_TEST_P(DevInfo_K_DescSize, BruteForceMatcher_knnMatch, testing::Combine(testing::ValuesIn(devices()),
     testing::Values(2, 3),
-    testing::Values(64, 128)))
+    testing::Values(64, 128, 256)))
 {
     DeviceInfo devInfo = std::tr1::get<0>(GetParam());
     int k = std::tr1::get<1>(GetParam());
@@ -54,11 +54,11 @@ PERF_TEST_P(DevInfo_K_DescSize, BruteForceMatcher_knnMatch, testing::Combine(tes
     BruteForceMatcher_GPU< L2<float> > matcher;

-    declare.time(0.5).iterations(100);
+    declare.time(3.0);

     SIMPLE_TEST_CYCLE()
     {
-        matcher.knnMatch(query, train, trainIdx, distance, allDist, k);
+        matcher.knnMatchSingle(query, train, trainIdx, distance, allDist, k);
     }

     Mat trainIdx_host(trainIdx);
@@ -69,7 +69,7 @@ PERF_TEST_P(DevInfo_K_DescSize, BruteForceMatcher_knnMatch, testing::Combine(tes
 }

 PERF_TEST_P(DevInfo_DescSize, BruteForceMatcher_radiusMatch, testing::Combine(testing::ValuesIn(devices(SHARED_ATOMICS)),
-    testing::Values(64, 128)))
+    testing::Values(64, 128, 256)))
 {
     DeviceInfo devInfo = std::tr1::get<0>(GetParam());
     int desc_size = std::tr1::get<1>(GetParam());
@@ -85,7 +85,7 @@ PERF_TEST_P(DevInfo_DescSize, BruteForceMatcher_radiusMatch, testing::Combine(te
     BruteForceMatcher_GPU< L2<float> > matcher;

-    declare.time(0.5).iterations(100);
+    declare.time(3.0);

     SIMPLE_TEST_CYCLE()
     {
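Outside this perf harness, the same comparison can be timed with OpenCV's tick counter, reusing the names from the sketches above (a sketch, not part of this commit):

    int64 t0 = cv::getTickCount();
    matcher.match(query, train, matches);
    double ms = (cv::getTickCount() - t0) * 1000.0 / cv::getTickFrequency();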
modules/gpu/src/brute_force_matcher.cpp
@@ -56,86 +56,101 @@ bool cv::gpu::BruteForceMatcher_GPU_base::empty() const { throw_nogpu(); return
 bool cv::gpu::BruteForceMatcher_GPU_base::isMaskSupported() const { throw_nogpu(); return true; }

 void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat&, const GpuMat&, vector<DMatch>&) { throw_nogpu(); }
-void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat&, const Mat&, std::vector<DMatch>&) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat&, const Mat&, vector<DMatch>&) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat&, const GpuMat&, vector<DMatch>&, const GpuMat&) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::makeGpuCollection(GpuMat&, GpuMat&, const vector<GpuMat>&) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); }
-void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat&, const GpuMat&, const GpuMat&, std::vector<DMatch>&) { throw_nogpu(); }
-void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat&, const Mat&, const Mat&, std::vector<DMatch>&) { throw_nogpu(); }
-void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat&, std::vector<DMatch>&, const std::vector<GpuMat>&) { throw_nogpu(); }
-void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, const GpuMat&, Stream&) { throw_nogpu(); }
-void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, bool) { throw_nogpu(); }
-void cv::gpu::BruteForceMatcher_GPU_base::knnMatchConvert(const Mat&, const Mat&, std::vector< std::vector<DMatch> >&, bool) { throw_nogpu(); }
-void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, int, const GpuMat&, bool) { throw_nogpu(); }
-void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, std::vector< std::vector<DMatch> >&, int, const std::vector<GpuMat>&, bool) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat&, const GpuMat&, const GpuMat&, vector<DMatch>&) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat&, const Mat&, const Mat&, vector<DMatch>&) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat&, vector<DMatch>&, const vector<GpuMat>&) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, const GpuMat&, Stream&) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat&, const GpuMat&, vector< vector<DMatch> >&, bool) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatchConvert(const Mat&, const Mat&, vector< vector<DMatch> >&, bool) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, const GpuMat&, vector< vector<DMatch> >&, int, const GpuMat&, bool) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Download(const GpuMat&, const GpuMat&, const GpuMat&, vector< vector<DMatch> >&, bool) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Convert(const Mat&, const Mat&, const Mat&, vector< vector<DMatch> >&, bool) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, vector< vector<DMatch> >&, int, const vector<GpuMat>&, bool) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, float, const GpuMat&, Stream&) { throw_nogpu(); }
-void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat&, const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, bool) { throw_nogpu(); }
-void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat&, const Mat&, const Mat&, std::vector< std::vector<DMatch> >&, bool) { throw_nogpu(); }
-void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, float, const GpuMat&, bool) { throw_nogpu(); }
-void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, float, const GpuMat&, Stream&) { throw_nogpu(); }
-void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, std::vector< std::vector<DMatch> >&, float, const std::vector<GpuMat>&, bool) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat&, const GpuMat&, const GpuMat&, vector< vector<DMatch> >&, bool) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat&, const Mat&, const Mat&, vector< vector<DMatch> >&, bool) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, const GpuMat&, vector< vector<DMatch> >&, float, const GpuMat&, bool) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, float, const vector<GpuMat>&, Stream&) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, vector< vector<DMatch> >&, bool) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat&, const Mat&, const Mat&, const Mat&, vector< vector<DMatch> >&, bool) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, vector< vector<DMatch> >&, float, const vector<GpuMat>&, bool) { throw_nogpu(); }

 #else /* !defined (HAVE_CUDA) */

 namespace cv { namespace gpu { namespace bf_match
 {
-    template <typename T> void matchSingleL1_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask,
-        const DevMem2D& trainIdx, const DevMem2D& distance,
+    template <typename T> void matchL1_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask,
+        const DevMem2Di& trainIdx, const DevMem2Df& distance,
         int cc, cudaStream_t stream);
-    template <typename T> void matchSingleL2_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask,
-        const DevMem2D& trainIdx, const DevMem2D& distance,
+    template <typename T> void matchL2_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask,
+        const DevMem2Di& trainIdx, const DevMem2Df& distance,
         int cc, cudaStream_t stream);
-    template <typename T> void matchSingleHamming_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask,
-        const DevMem2D& trainIdx, const DevMem2D& distance,
+    template <typename T> void matchHamming_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask,
+        const DevMem2Di& trainIdx, const DevMem2Df& distance,
         int cc, cudaStream_t stream);

-    template <typename T> void matchCollectionL1_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection,
-        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
+    template <typename T> void matchL1_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks,
+        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
         int cc, cudaStream_t stream);
-    template <typename T> void matchCollectionL2_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection,
-        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
+    template <typename T> void matchL2_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks,
+        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
         int cc, cudaStream_t stream);
-    template <typename T> void matchCollectionHamming_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection,
-        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
+    template <typename T> void matchHamming_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks,
+        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
         int cc, cudaStream_t stream);
 }}}

 namespace cv { namespace gpu { namespace bf_knnmatch
 {
-    template <typename T> void knnMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
-        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
+    template <typename T> void matchL1_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
+        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist,
         int cc, cudaStream_t stream);
-    template <typename T> void knnMatchL2_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
-        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
+    template <typename T> void matchL2_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
+        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist,
         int cc, cudaStream_t stream);
-    template <typename T> void knnMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
-        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
+    template <typename T> void matchHamming_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
+        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist,
         int cc, cudaStream_t stream);

+    template <typename T> void match2L1_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks,
+        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
+        int cc, cudaStream_t stream);
+    template <typename T> void match2L2_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks,
+        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
+        int cc, cudaStream_t stream);
+    template <typename T> void match2Hamming_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks,
+        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
+        int cc, cudaStream_t stream);
 }}}

 namespace cv { namespace gpu { namespace bf_radius_match
 {
-    template <typename T> void radiusMatchSingleL1_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
-        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream);
-    template <typename T> void radiusMatchSingleL2_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
-        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream);
-    template <typename T> void radiusMatchSingleHamming_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
-        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream);
-    template <typename T> void radiusMatchCollectionL1_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_<PtrStep>& maskCollection,
-        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream);
-    template <typename T> void radiusMatchCollectionL2_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_<PtrStep>& maskCollection,
-        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream);
-    template <typename T> void radiusMatchCollectionHamming_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_<PtrStep>& maskCollection,
-        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream);
+    template <typename T> void matchL1_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
+        const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+    template <typename T> void matchL2_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
+        const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+    template <typename T> void matchHamming_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
+        const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+    template <typename T> void matchL1_gpu(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks,
+        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+    template <typename T> void matchL2_gpu(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks,
+        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
+    template <typename T> void matchHamming_gpu(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks,
+        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
 }}}

 cv::gpu::BruteForceMatcher_GPU_base::BruteForceMatcher_GPU_base(DistType distType_) : distType(distType_)
@@ -173,52 +188,53 @@ bool cv::gpu::BruteForceMatcher_GPU_base::isMaskSupported() const
 ////////////////////////////////////////////////////////////////////
 // Match

-void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs, const GpuMat& trainDescs,
+void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& query, const GpuMat& train,
     GpuMat& trainIdx, GpuMat& distance, const GpuMat& mask, Stream& stream)
 {
-    if (queryDescs.empty() || trainDescs.empty())
+    if (query.empty() || train.empty())
         return;

     using namespace cv::gpu::bf_match;

-    typedef void (*match_caller_t)(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask,
-        const DevMem2D& trainIdx, const DevMem2D& distance,
-        int cc, cudaStream_t stream);
+    typedef void (*caller_t)(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask,
+        const DevMem2Di& trainIdx, const DevMem2Df& distance,
+        int cc, cudaStream_t stream);

-    static const match_caller_t match_callers[3][8] =
+    static const caller_t callers[3][6] =
     {
         {
-            matchSingleL1_gpu<unsigned char>, 0/*matchSingleL1_gpu<signed char>*/, matchSingleL1_gpu<unsigned short>,
-            matchSingleL1_gpu<short>, matchSingleL1_gpu<int>, matchSingleL1_gpu<float>, 0, 0
+            matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/, matchL1_gpu<unsigned short>,
+            matchL1_gpu<short>, matchL1_gpu<int>, matchL1_gpu<float>
         },
         {
-            0/*matchSingleL2_gpu<unsigned char>*/, 0/*matchSingleL2_gpu<signed char>*/, 0/*matchSingleL2_gpu<unsigned short>*/,
-            0/*matchSingleL2_gpu<short>*/, 0/*matchSingleL2_gpu<int>*/, matchSingleL2_gpu<float>, 0, 0
+            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/, 0/*matchL2_gpu<unsigned short>*/,
+            0/*matchL2_gpu<short>*/, 0/*matchL2_gpu<int>*/, matchL2_gpu<float>
         },
         {
-            matchSingleHamming_gpu<unsigned char>, 0/*matchSingleHamming_gpu<signed char>*/, matchSingleHamming_gpu<unsigned short>,
-            0/*matchSingleHamming_gpu<short>*/, matchSingleHamming_gpu<int>, 0, 0, 0
+            matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/, matchHamming_gpu<unsigned short>,
+            0/*matchHamming_gpu<short>*/, matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
         }
     };

-    CV_Assert(queryDescs.channels() == 1 && queryDescs.depth() < CV_64F);
-    CV_Assert(trainDescs.cols == queryDescs.cols && trainDescs.type() == queryDescs.type());
+    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
+    CV_Assert(train.cols == query.cols && train.type() == query.type());

-    const int nQuery = queryDescs.rows;
+    const int nQuery = query.rows;

     ensureSizeIsEnough(1, nQuery, CV_32S, trainIdx);
     ensureSizeIsEnough(1, nQuery, CV_32F, distance);

-    match_caller_t func = match_callers[distType][queryDescs.depth()];
+    caller_t func = callers[distType][query.depth()];
     CV_Assert(func != 0);

     DeviceInfo info;
     int cc = info.majorVersion() * 10 + info.minorVersion();

-    func(queryDescs, trainDescs, mask, trainIdx, distance, cc, StreamAccessor::getStream(stream));
+    func(query, train, mask, trainIdx, distance, cc, StreamAccessor::getStream(stream));
 }

 void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& distance, vector<DMatch>& matches)
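The callers[3][6] table above is a plain function-pointer dispatch: the row selects the distance type (L1/L2/Hamming), the column the descriptor depth, and 0 marks unsupported combinations that the CV_Assert(func != 0) guard rejects. A stripped-down sketch of the same pattern, with hypothetical names:

    #include <cassert>

    typedef void (*kernel_t)(int n);          // hypothetical kernel signature

    void kernelL1(int) { /* ... */ }
    void kernelHamming(int) { /* ... */ }

    static const kernel_t table[2][3] =
    {
        { kernelL1, 0 /*unsupported*/, kernelL1 },
        { kernelHamming, kernelHamming, 0 /*unsupported*/ }
    };

    void dispatch(int dist, int depth, int n)
    {
        kernel_t func = table[dist][depth];
        assert(func != 0);                    // mirrors CV_Assert(func != 0) above
        func(n);
    }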
@@ -232,13 +248,13 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx,
     matchConvert(trainIdxCPU, distanceCPU, matches);
 }

-void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat& trainIdx, const Mat& distance, std::vector<DMatch>& matches)
+void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat& trainIdx, const Mat& distance, vector<DMatch>& matches)
 {
     if (trainIdx.empty() || distance.empty())
         return;

-    CV_Assert(trainIdx.type() == CV_32SC1 && trainIdx.isContinuous());
-    CV_Assert(distance.type() == CV_32FC1 && distance.isContinuous() && distance.cols == trainIdx.cols);
+    CV_Assert(trainIdx.type() == CV_32SC1);
+    CV_Assert(distance.type() == CV_32FC1 && distance.cols == trainIdx.cols);

     const int nQuery = trainIdx.cols;
@@ -250,6 +266,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat& trainIdx, cons
     for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++distance_ptr)
     {
         int trainIdx = *trainIdx_ptr;
+
         if (trainIdx == -1)
             continue;
@@ -261,11 +278,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat& trainIdx, cons
     }
 }

-void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat& queryDescs, const GpuMat& trainDescs,
+void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat& query, const GpuMat& train,
     vector<DMatch>& matches, const GpuMat& mask)
 {
     GpuMat trainIdx, distance;
-    matchSingle(queryDescs, trainDescs, trainIdx, distance, mask);
+    matchSingle(query, train, trainIdx, distance, mask);
     matchDownload(trainIdx, distance, matches);
 }
@@ -279,14 +296,13 @@ void cv::gpu::BruteForceMatcher_GPU_base::makeGpuCollection(GpuMat& trainCollect
 {
     Mat trainCollectionCPU(1, static_cast<int>(trainDescCollection.size()), CV_8UC(sizeof(DevMem2D)));

-    for (size_t i = 0; i < trainDescCollection.size(); ++i)
-    {
-        const GpuMat& trainDescs = trainDescCollection[i];
-
-        trainCollectionCPU.ptr<DevMem2D>(0)[i] = trainDescs;
-    }
+    DevMem2D* trainCollectionCPU_ptr = trainCollectionCPU.ptr<DevMem2D>();
+
+    for (size_t i = 0, size = trainDescCollection.size(); i < size; ++i, ++trainCollectionCPU_ptr)
+        *trainCollectionCPU_ptr = trainDescCollection[i];

     trainCollection.upload(trainCollectionCPU);
     maskCollection.release();
 }
 else
 {
@@ -295,16 +311,18 @@ void cv::gpu::BruteForceMatcher_GPU_base::makeGpuCollection(GpuMat& trainCollect
     Mat trainCollectionCPU(1, static_cast<int>(trainDescCollection.size()), CV_8UC(sizeof(DevMem2D)));
     Mat maskCollectionCPU(1, static_cast<int>(trainDescCollection.size()), CV_8UC(sizeof(PtrStep)));

-    for (size_t i = 0; i < trainDescCollection.size(); ++i)
+    DevMem2D* trainCollectionCPU_ptr = trainCollectionCPU.ptr<DevMem2D>();
+    PtrStep* maskCollectionCPU_ptr = maskCollectionCPU.ptr<PtrStep>();
+
+    for (size_t i = 0, size = trainDescCollection.size(); i < size; ++i, ++trainCollectionCPU_ptr, ++maskCollectionCPU_ptr)
     {
-        const GpuMat& trainDescs = trainDescCollection[i];
+        const GpuMat& train = trainDescCollection[i];
         const GpuMat& mask = masks[i];

-        CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.cols == trainDescs.rows));
+        CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.cols == train.rows));

-        trainCollectionCPU.ptr<DevMem2D>(0)[i] = trainDescs;
-        maskCollectionCPU.ptr<PtrStep>(0)[i] = mask;
+        *trainCollectionCPU_ptr = train;
+        *maskCollectionCPU_ptr = mask;
     }

     trainCollection.upload(trainCollectionCPU);
@@ -312,52 +330,53 @@ void cv::gpu::BruteForceMatcher_GPU_base::makeGpuCollection(GpuMat& trainCollect
     }
 }

-void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDescs, const GpuMat& trainCollection,
+void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& query, const GpuMat& trainCollection,
     GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,
-    const GpuMat& maskCollection, Stream& stream)
+    const GpuMat& masks, Stream& stream)
 {
-    if (queryDescs.empty() || trainCollection.empty())
+    if (query.empty() || trainCollection.empty())
         return;

     using namespace cv::gpu::bf_match;

-    typedef void (*match_caller_t)(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection,
-        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
-        int cc, cudaStream_t stream);
+    typedef void (*caller_t)(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks,
+        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+        int cc, cudaStream_t stream);

-    static const match_caller_t match_callers[3][8] =
+    static const caller_t callers[3][6] =
     {
         {
-            matchCollectionL1_gpu<unsigned char>, 0/*matchCollectionL1_gpu<signed char>*/, matchCollectionL1_gpu<unsigned short>,
-            matchCollectionL1_gpu<short>, matchCollectionL1_gpu<int>, matchCollectionL1_gpu<float>, 0, 0
+            matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/, matchL1_gpu<unsigned short>,
+            matchL1_gpu<short>, matchL1_gpu<int>, matchL1_gpu<float>
         },
         {
-            0/*matchCollectionL2_gpu<unsigned char>*/, 0/*matchCollectionL2_gpu<signed char>*/, 0/*matchCollectionL2_gpu<unsigned short>*/,
-            0/*matchCollectionL2_gpu<short>*/, 0/*matchCollectionL2_gpu<int>*/, matchCollectionL2_gpu<float>, 0, 0
+            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/, 0/*matchL2_gpu<unsigned short>*/,
+            0/*matchL2_gpu<short>*/, 0/*matchL2_gpu<int>*/, matchL2_gpu<float>
         },
         {
-            matchCollectionHamming_gpu<unsigned char>, 0/*matchCollectionHamming_gpu<signed char>*/, matchCollectionHamming_gpu<unsigned short>,
-            0/*matchCollectionHamming_gpu<short>*/, matchCollectionHamming_gpu<int>, 0, 0, 0
+            matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/, matchHamming_gpu<unsigned short>,
+            0/*matchHamming_gpu<short>*/, matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
         }
     };

-    CV_Assert(queryDescs.channels() == 1 && queryDescs.depth() < CV_64F);
+    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);

-    const int nQuery = queryDescs.rows;
+    const int nQuery = query.rows;

     ensureSizeIsEnough(1, nQuery, CV_32S, trainIdx);
     ensureSizeIsEnough(1, nQuery, CV_32S, imgIdx);
     ensureSizeIsEnough(1, nQuery, CV_32F, distance);

-    match_caller_t func = match_callers[distType][queryDescs.depth()];
+    caller_t func = callers[distType][query.depth()];
     CV_Assert(func != 0);

     DeviceInfo info;
     int cc = info.majorVersion() * 10 + info.minorVersion();

-    func(queryDescs, trainCollection, maskCollection, trainIdx, imgIdx, distance, cc, StreamAccessor::getStream(stream));
+    func(query, trainCollection, masks, trainIdx, imgIdx, distance, cc, StreamAccessor::getStream(stream));
 }

 void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, vector<DMatch>& matches)
if
(
trainIdx
.
empty
()
||
imgIdx
.
empty
()
||
distance
.
empty
())
return
;
CV_Assert
(
trainIdx
.
type
()
==
CV_32SC1
&&
trainIdx
.
isContinuous
()
);
CV_Assert
(
imgIdx
.
type
()
==
CV_32SC1
&&
imgIdx
.
isContinuous
()
&&
imgIdx
.
cols
==
trainIdx
.
cols
);
CV_Assert
(
distance
.
type
()
==
CV_32FC1
&&
distance
.
isContinuous
()
&&
imgIdx
.
cols
==
trainIdx
.
cols
);
CV_Assert
(
trainIdx
.
type
()
==
CV_32SC1
);
CV_Assert
(
imgIdx
.
type
()
==
CV_32SC1
&&
imgIdx
.
cols
==
trainIdx
.
cols
);
CV_Assert
(
distance
.
type
()
==
CV_32FC1
&&
distance
.
cols
==
trainIdx
.
cols
);
const
int
nQuery
=
trainIdx
.
cols
;
@@ -392,6 +411,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat& trainIdx, cons
     for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
     {
         int trainIdx = *trainIdx_ptr;
+
         if (trainIdx == -1)
             continue;
@@ -405,7 +425,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat& trainIdx, cons
     }
 }

-void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat& queryDescs, vector<DMatch>& matches, const vector<GpuMat>& masks)
+void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat& query, vector<DMatch>& matches, const vector<GpuMat>& masks)
 {
     GpuMat trainCollection;
     GpuMat maskCollection;
@@ -414,46 +434,50 @@ void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat& queryDescs, vector
     GpuMat trainIdx, imgIdx, distance;

-    matchCollection(queryDescs, trainCollection, trainIdx, imgIdx, distance, maskCollection);
+    matchCollection(query, trainCollection, trainIdx, imgIdx, distance, maskCollection);
     matchDownload(trainIdx, imgIdx, distance, matches);
 }

 ////////////////////////////////////////////////////////////////////
 // KnnMatch

-void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, const GpuMat& trainDescs,
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, const GpuMat& train,
     GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k,
     const GpuMat& mask, Stream& stream)
 {
-    if (queryDescs.empty() || trainDescs.empty())
+    if (query.empty() || train.empty())
         return;

     using namespace cv::gpu::bf_knnmatch;

-    typedef void (*match_caller_t)(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
-        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
-        int cc, cudaStream_t stream);
+    typedef void (*caller_t)(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
+        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist,
+        int cc, cudaStream_t stream);

-    static const match_caller_t match_callers[3][8] =
+    static const caller_t callers[3][6] =
     {
         {
-            knnMatchL1_gpu<unsigned char>, 0/*knnMatchL1_gpu<signed char>*/, knnMatchL1_gpu<unsigned short>,
-            knnMatchL1_gpu<short>, knnMatchL1_gpu<int>, knnMatchL1_gpu<float>, 0, 0
+            matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/, matchL1_gpu<unsigned short>,
+            matchL1_gpu<short>, matchL1_gpu<int>, matchL1_gpu<float>
         },
         {
-            0/*knnMatchL2_gpu<unsigned char>*/, 0/*knnMatchL2_gpu<signed char>*/, 0/*knnMatchL2_gpu<unsigned short>*/,
-            0/*knnMatchL2_gpu<short>*/, 0/*knnMatchL2_gpu<int>*/, knnMatchL2_gpu<float>, 0, 0
+            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/, 0/*matchL2_gpu<unsigned short>*/,
+            0/*matchL2_gpu<short>*/, 0/*matchL2_gpu<int>*/, matchL2_gpu<float>
         },
         {
-            knnMatchHamming_gpu<unsigned char>, 0/*knnMatchHamming_gpu<signed char>*/, knnMatchHamming_gpu<unsigned short>,
-            0/*knnMatchHamming_gpu<short>*/, knnMatchHamming_gpu<int>, 0, 0, 0
+            matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/, matchHamming_gpu<unsigned short>,
+            0/*matchHamming_gpu<short>*/, matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
         }
     };

-    CV_Assert(queryDescs.channels() == 1 && queryDescs.depth() < CV_64F);
-    CV_Assert(trainDescs.type() == queryDescs.type() && trainDescs.cols == queryDescs.cols);
+    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
+    CV_Assert(train.type() == query.type() && train.cols == query.cols);

-    const int nQuery = queryDescs.rows;
-    const int nTrain = trainDescs.rows;
+    const int nQuery = query.rows;
+    const int nTrain = train.rows;

     if (k == 2)
     {
@@ -468,25 +492,17 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con
     }

     if (stream)
     {
         stream.enqueueMemSet(trainIdx, Scalar::all(-1));
         if (k != 2)
             stream.enqueueMemSet(allDist, Scalar::all(numeric_limits<float>::max()));
     }
     else
     {
         trainIdx.setTo(Scalar::all(-1));
         if (k != 2)
             allDist.setTo(Scalar::all(numeric_limits<float>::max()));
     }

-    match_caller_t func = match_callers[distType][queryDescs.depth()];
+    caller_t func = callers[distType][query.depth()];
     CV_Assert(func != 0);

     DeviceInfo info;
     int cc = info.majorVersion() * 10 + info.minorVersion();

-    func(queryDescs, trainDescs, k, mask, trainIdx, distance, allDist, cc, StreamAccessor::getStream(stream));
+    func(query, train, k, mask, trainIdx, distance, allDist, cc, StreamAccessor::getStream(stream));
 }

 void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance,
@@ -502,7 +518,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat& trainId
 }

 void cv::gpu::BruteForceMatcher_GPU_base::knnMatchConvert(const Mat& trainIdx, const Mat& distance,
-    std::vector< std::vector<DMatch> >& matches, bool compactResult)
+    vector< vector<DMatch> >& matches, bool compactResult)
 {
     if (trainIdx.empty() || distance.empty())
         return;
@@ -546,14 +562,127 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchConvert(const Mat& trainIdx, c
     }
 }

-void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, const GpuMat& trainDescs,
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& query, const GpuMat& train,
     vector< vector<DMatch> >& matches, int k, const GpuMat& mask, bool compactResult)
 {
     GpuMat trainIdx, distance, allDist;
-    knnMatch(queryDescs, trainDescs, trainIdx, distance, allDist, k, mask);
+    knnMatchSingle(query, train, trainIdx, distance, allDist, k, mask);
     knnMatchDownload(trainIdx, distance, matches, compactResult);
 }

+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat& query, const GpuMat& trainCollection,
+    GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,
+    const GpuMat& maskCollection, Stream& stream)
+{
+    if (query.empty() || trainCollection.empty())
+        return;
+
+    using namespace cv::gpu::bf_knnmatch;
+
+    typedef void (*caller_t)(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks,
+        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
+        int cc, cudaStream_t stream);
+
+    static const caller_t callers[3][6] =
+    {
+        {
+            match2L1_gpu<unsigned char>, 0/*match2L1_gpu<signed char>*/, match2L1_gpu<unsigned short>,
+            match2L1_gpu<short>, match2L1_gpu<int>, match2L1_gpu<float>
+        },
+        {
+            0/*match2L2_gpu<unsigned char>*/, 0/*match2L2_gpu<signed char>*/, 0/*match2L2_gpu<unsigned short>*/,
+            0/*match2L2_gpu<short>*/, 0/*match2L2_gpu<int>*/, match2L2_gpu<float>
+        },
+        {
+            match2Hamming_gpu<unsigned char>, 0/*match2Hamming_gpu<signed char>*/, match2Hamming_gpu<unsigned short>,
+            0/*match2Hamming_gpu<short>*/, match2Hamming_gpu<int>, 0/*match2Hamming_gpu<float>*/
+        }
+    };
+
+    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
+
+    const int nQuery = query.rows;
+
+    ensureSizeIsEnough(1, nQuery, CV_32SC2, trainIdx);
+    ensureSizeIsEnough(1, nQuery, CV_32SC2, imgIdx);
+    ensureSizeIsEnough(1, nQuery, CV_32FC2, distance);
+
+    if (stream)
+        stream.enqueueMemSet(trainIdx, Scalar::all(-1));
+    else
+        trainIdx.setTo(Scalar::all(-1));
+
+    caller_t func = callers[distType][query.depth()];
+    CV_Assert(func != 0);
+
+    DeviceInfo info;
+    int cc = info.majorVersion() * 10 + info.minorVersion();
+
+    func(query, trainCollection, maskCollection, trainIdx, imgIdx, distance, cc, StreamAccessor::getStream(stream));
+}
+
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Download(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance,
+    vector< vector<DMatch> >& matches, bool compactResult)
+{
+    if (trainIdx.empty() || imgIdx.empty() || distance.empty())
+        return;
+
+    Mat trainIdxCPU = trainIdx;
+    Mat imgIdxCPU = imgIdx;
+    Mat distanceCPU = distance;
+
+    knnMatch2Convert(trainIdxCPU, imgIdxCPU, distanceCPU, matches, compactResult);
+}
+
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Convert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance,
+    vector< vector<DMatch> >& matches, bool compactResult)
+{
+    if (trainIdx.empty() || imgIdx.empty() || distance.empty())
+        return;
+
+    CV_Assert(trainIdx.type() == CV_32SC2);
+    CV_Assert(imgIdx.type() == CV_32SC2 && imgIdx.cols == trainIdx.cols);
+    CV_Assert(distance.type() == CV_32FC2 && distance.cols == trainIdx.cols);
+
+    const int nQuery = trainIdx.cols;
+
+    matches.clear();
+    matches.reserve(nQuery);
+
+    const int* trainIdx_ptr = trainIdx.ptr<int>();
+    const int* imgIdx_ptr = imgIdx.ptr<int>();
+    const float* distance_ptr = distance.ptr<float>();
+
+    for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx)
+    {
+        matches.push_back(vector<DMatch>());
+        vector<DMatch>& curMatches = matches.back();
+        curMatches.reserve(2);
+
+        for (int i = 0; i < 2; ++i, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
+        {
+            int trainIdx = *trainIdx_ptr;
+
+            if (trainIdx != -1)
+            {
+                int imgIdx = *imgIdx_ptr;
+                float distance = *distance_ptr;
+
+                DMatch m(queryIdx, trainIdx, imgIdx, distance);
+
+                curMatches.push_back(m);
+            }
+        }
+
+        if (compactResult && curMatches.empty())
+            matches.pop_back();
+    }
+}
+
 namespace
 {
     struct ImgIdxSetter
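The knnMatch2* buffers are 1 x nQuery two-channel Mats: element q packs the best and second-best candidates for query q, which is why knnMatch2Convert above advances its pointers two entries per query. A sketch of reading one query's pair directly (q is a hypothetical query index):

    cv::Mat trainIdxCPU = trainIdx;                        // 1 x nQuery, CV_32SC2
    cv::Mat distanceCPU = distance;                        // 1 x nQuery, CV_32FC2

    cv::Vec2i idx = trainIdxCPU.at<cv::Vec2i>(0, q);       // train indices of 1st/2nd best (-1 = none)
    cv::Vec2f dist = distanceCPU.at<cv::Vec2f>(0, q);      // their distances, in increasing order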
@@ -564,103 +693,123 @@ namespace
};
}
void
cv
::
gpu
::
BruteForceMatcher_GPU_base
::
knnMatch
(
const
GpuMat
&
query
Descs
,
vector
<
vector
<
DMatch
>
>&
matches
,
int
knn
,
const
vector
<
GpuMat
>&
masks
,
bool
compactResult
)
void
cv
::
gpu
::
BruteForceMatcher_GPU_base
::
knnMatch
(
const
GpuMat
&
query
,
vector
<
vector
<
DMatch
>
>&
matches
,
int
k
,
const
vector
<
GpuMat
>&
masks
,
bool
compactResult
)
{
if
(
queryDescs
.
empty
()
||
empty
())
return
;
if
(
k
==
2
)
{
GpuMat
trainCollection
;
GpuMat
maskCollection
;
vector
<
vector
<
DMatch
>
>
curMatches
;
vector
<
DMatch
>
temp
;
temp
.
reserve
(
2
*
knn
);
makeGpuCollection
(
trainCollection
,
maskCollection
,
masks
);
matches
.
resize
(
queryDescs
.
rows
);
for_each
(
matches
.
begin
(),
matches
.
end
(),
bind2nd
(
mem_fun_ref
(
&
vector
<
DMatch
>::
reserve
),
knn
));
GpuMat
trainIdx
,
imgIdx
,
distance
;
for
(
size_t
imgIdx
=
0
;
imgIdx
<
trainDescCollection
.
size
();
++
imgIdx
)
knnMatch2Collection
(
query
,
trainCollection
,
trainIdx
,
imgIdx
,
distance
,
maskCollection
);
knnMatch2Download
(
trainIdx
,
imgIdx
,
distance
,
matches
);
}
else
{
knnMatch
(
queryDescs
,
trainDescCollection
[
imgIdx
],
curMatches
,
knn
,
masks
.
empty
()
?
GpuMat
()
:
masks
[
imgIdx
]);
if
(
query
.
empty
()
||
empty
())
return
;
vector
<
vector
<
DMatch
>
>
curMatches
;
vector
<
DMatch
>
temp
;
temp
.
reserve
(
2
*
k
);
matches
.
resize
(
query
.
rows
);
for_each
(
matches
.
begin
(),
matches
.
end
(),
bind2nd
(
mem_fun_ref
(
&
vector
<
DMatch
>::
reserve
),
k
));
for
(
int
queryIdx
=
0
;
queryIdx
<
queryDescs
.
rows
;
++
query
Idx
)
for
(
size_t
imgIdx
=
0
,
size
=
trainDescCollection
.
size
();
imgIdx
<
size
;
++
img
Idx
)
{
vector
<
DMatch
>&
localMatch
=
curMatches
[
queryIdx
];
vector
<
DMatch
>&
globalMatch
=
matches
[
queryIdx
];
knnMatch
(
query
,
trainDescCollection
[
imgIdx
],
curMatches
,
k
,
masks
.
empty
()
?
GpuMat
()
:
masks
[
imgIdx
]);
for_each
(
localMatch
.
begin
(),
localMatch
.
end
(),
ImgIdxSetter
(
static_cast
<
int
>
(
imgIdx
)));
for
(
int
queryIdx
=
0
;
queryIdx
<
query
.
rows
;
++
queryIdx
)
{
vector
<
DMatch
>&
localMatch
=
curMatches
[
queryIdx
];
vector
<
DMatch
>&
globalMatch
=
matches
[
queryIdx
];
for_each
(
localMatch
.
begin
(),
localMatch
.
end
(),
ImgIdxSetter
(
static_cast
<
int
>
(
imgIdx
)));
temp
.
clear
();
merge
(
globalMatch
.
begin
(),
globalMatch
.
end
(),
localMatch
.
begin
(),
localMatch
.
end
(),
back_inserter
(
temp
));
temp
.
clear
();
merge
(
globalMatch
.
begin
(),
globalMatch
.
end
(),
localMatch
.
begin
(),
localMatch
.
end
(),
back_inserter
(
temp
));
globalMatch
.
clear
();
const
size_t
count
=
std
::
min
((
size_t
)
knn
,
temp
.
size
());
copy
(
temp
.
begin
(),
temp
.
begin
()
+
count
,
back_inserter
(
globalMatch
));
globalMatch
.
clear
();
const
size_t
count
=
std
::
min
((
size_t
)
k
,
temp
.
size
());
copy
(
temp
.
begin
(),
temp
.
begin
()
+
count
,
back_inserter
(
globalMatch
));
}
}
}
if
(
compactResult
)
{
vector
<
vector
<
DMatch
>
>::
iterator
new_end
=
remove_if
(
matches
.
begin
(),
matches
.
end
(),
m
em_fun_ref
(
&
vector
<
DMatch
>::
empty
));
matches
.
erase
(
new_end
,
matches
.
end
());
if
(
compactResult
)
{
vector
<
vector
<
DMatch
>
>::
iterator
new_end
=
remove_if
(
matches
.
begin
(),
matches
.
end
(),
mem_fun_ref
(
&
vector
<
DMatch
>::
empty
));
m
atches
.
erase
(
new_end
,
matches
.
end
(
));
}
}
}
////////////////////////////////////////////////////////////////////
// RadiusMatch
void
cv
::
gpu
::
BruteForceMatcher_GPU_base
::
radiusMatchSingle
(
const
GpuMat
&
queryDescs
,
const
GpuMat
&
trainDescs
,
GpuMat
&
trainIdx
,
GpuMat
&
distance
,
GpuMat
&
nMatches
,
float
maxDistance
,
const
GpuMat
&
mask
,
Stream
&
stream
)
void
cv
::
gpu
::
BruteForceMatcher_GPU_base
::
radiusMatchSingle
(
const
GpuMat
&
query
,
const
GpuMat
&
train
,
GpuMat
&
trainIdx
,
GpuMat
&
distance
,
GpuMat
&
nMatches
,
float
maxDistance
,
const
GpuMat
&
mask
,
Stream
&
stream
)
{
if
(
query
Descs
.
empty
()
||
trainDescs
.
empty
())
if
(
query
.
empty
()
||
train
.
empty
())
return
;
using
namespace
cv
::
gpu
::
bf_radius_match
;
typedef
void
(
*
radiusMatch_
caller_t
)(
const
DevMem2D
&
query
,
const
DevMem2D
&
train
,
float
maxDistance
,
const
DevMem2D
&
mask
,
const
DevMem2D
&
trainIdx
,
const
DevMem2D
&
distance
,
const
DevMem2D
&
nMatches
,
cudaStream_t
stream
);
typedef
void
(
*
caller_t
)(
const
DevMem2D
&
query
,
const
DevMem2D
&
train
,
float
maxDistance
,
const
DevMem2D
&
mask
,
const
DevMem2Di
&
trainIdx
,
const
DevMem2Df
&
distance
,
const
DevMem2D_
<
unsigned
int
>
&
nMatches
,
int
cc
,
cudaStream_t
stream
);
static
const
radiusMatch_caller_t
radiusMatch_callers
[
3
][
8
]
=
static
const
caller_t
callers
[
3
][
6
]
=
    {
        {
            radiusMatchSingleL1_gpu<unsigned char>, 0/*radiusMatchSingleL1_gpu<signed char>*/, radiusMatchSingleL1_gpu<unsigned short>,
            radiusMatchSingleL1_gpu<short>, radiusMatchSingleL1_gpu<int>, radiusMatchSingleL1_gpu<float>, 0, 0
            matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/, matchL1_gpu<unsigned short>,
            matchL1_gpu<short>, matchL1_gpu<int>, matchL1_gpu<float>
        },
        {
            0/*radiusMatchSingleL2_gpu<unsigned char>*/, 0/*radiusMatchSingleL2_gpu<signed char>*/, 0/*radiusMatchSingleL2_gpu<unsigned short>*/,
            0/*radiusMatchSingleL2_gpu<short>*/, 0/*radiusMatchSingleL2_gpu<int>*/, radiusMatchSingleL2_gpu<float>, 0, 0
            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/, 0/*matchL2_gpu<unsigned short>*/,
            0/*matchL2_gpu<short>*/, 0/*matchL2_gpu<int>*/, matchL2_gpu<float>
        },
        {
            radiusMatchSingleHamming_gpu<unsigned char>, 0/*radiusMatchSingleHamming_gpu<signed char>*/, radiusMatchSingleHamming_gpu<unsigned short>,
            0/*radiusMatchSingleHamming_gpu<short>*/, radiusMatchSingleHamming_gpu<int>, 0, 0, 0
            matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/, matchHamming_gpu<unsigned short>,
            0/*matchHamming_gpu<short>*/, matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
        }
    };
    CV_Assert(TargetArchs::builtWith(SHARED_ATOMICS) && DeviceInfo().supports(SHARED_ATOMICS));
    DeviceInfo info;
    int cc = info.majorVersion() * 10 + info.minorVersion();

    const int nQuery = queryDescs.rows;
    const int nTrain = trainDescs.rows;
    CV_Assert(TargetArchs::builtWith(GLOBAL_ATOMICS) && info.supports(GLOBAL_ATOMICS));

    CV_Assert(queryDescs.channels() == 1 && queryDescs.depth() < CV_64F);
    CV_Assert(trainDescs.type() == queryDescs.type() && trainDescs.cols == queryDescs.cols);
    const int nQuery = query.rows;
    const int nTrain = train.rows;

    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
    CV_Assert(train.type() == query.type() && train.cols == query.cols);
    CV_Assert(trainIdx.empty() || (trainIdx.rows == nQuery && trainIdx.size() == distance.size()));

    ensureSizeIsEnough(1, nQuery, CV_32SC1, nMatches);
    if (trainIdx.empty())
    {
        ensureSizeIsEnough(nQuery, nTrain / 2, CV_32SC1, trainIdx);
        ensureSizeIsEnough(nQuery, nTrain / 2, CV_32FC1, distance);
        ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32SC1, trainIdx);
        ensureSizeIsEnough(nQuery, std::max((nTrain / 100), 10), CV_32FC1, distance);
    }
    radiusMatch_caller_t func = radiusMatch_callers[distType][queryDescs.depth()];
    CV_Assert(func != 0);
    caller_t func = callers[distType][query.depth()];
    CV_Assert(func != 0);

    func(queryDescs, trainDescs, maxDistance, mask, trainIdx, distance, nMatches, StreamAccessor::getStream(stream));
    func(query, train, maxDistance, mask, trainIdx, distance, nMatches, cc, StreamAccessor::getStream(stream));
}
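For context, a minimal host-side sketch of how this radius-match path is typically driven through the public wrapper (illustrative only, not part of the commit; radiusMatchExample is a hypothetical name, and it assumes the templated cv::gpu::BruteForceMatcher_GPU< cv::L2<float> > wrapper and CV_32F descriptors, one per row):

    #include <opencv2/gpu/gpu.hpp>
    #include <opencv2/features2d/features2d.hpp>
    #include <vector>

    void radiusMatchExample(const cv::Mat& queryCpu, const cv::Mat& trainCpu)
    {
        cv::gpu::GpuMat query(queryCpu), train(trainCpu);        // upload descriptors
        cv::gpu::BruteForceMatcher_GPU< cv::L2<float> > matcher; // L2 distance
        std::vector< std::vector<cv::DMatch> > matches;
        matcher.radiusMatch(query, train, matches, 0.8f);        // all neighbours within 0.8
        // matches[q] lists every train descriptor within maxDistance of query row q
    }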
void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, const GpuMat& nMatches,
    vector<vector<DMatch> >& matches, bool compactResult)
    vector<vector<DMatch> >& matches, bool compactResult)
{
    if (trainIdx.empty() || distance.empty() || nMatches.empty())
        return;
...
...
@@ -673,14 +822,14 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trai
}

void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx, const Mat& distance, const Mat& nMatches,
    vector<vector<DMatch> >& matches, bool compactResult)
    vector<vector<DMatch> >& matches, bool compactResult)
{
    if (trainIdx.empty() || distance.empty() || nMatches.empty())
        return;
    CV_Assert(trainIdx.type() == CV_32SC1);
    CV_Assert(nMatches.type() == CV_32SC1 && nMatches.isContinuous() && nMatches.cols >= trainIdx.rows);
    CV_Assert(distance.type() == CV_32FC1 && distance.size() == trainIdx.size());
    CV_Assert(nMatches.type() == CV_32SC1 && nMatches.cols == trainIdx.rows);

    const int nQuery = trainIdx.rows;
...
...
@@ -688,6 +837,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx
    matches.reserve(nQuery);

    const int* nMatches_ptr = nMatches.ptr<int>();
    for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx)
    {
        const int* trainIdx_ptr = trainIdx.ptr<int>(queryIdx);
...
...
@@ -720,66 +870,75 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx
    }
}

void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, const GpuMat& trainDescs,
void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& query, const GpuMat& train,
    vector<vector<DMatch> >& matches, float maxDistance, const GpuMat& mask, bool compactResult)
{
    GpuMat trainIdx, distance, nMatches;
    radiusMatchSingle(queryDescs, trainDescs, trainIdx, distance, nMatches, maxDistance, mask);
    radiusMatchSingle(query, train, trainIdx, distance, nMatches, maxDistance, mask);
    radiusMatchDownload(trainIdx, distance, nMatches, matches, compactResult);
}
void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& queryDescs, const GpuMat& trainCollection,
    GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance,
    const GpuMat& maskCollection, Stream& stream)
void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& query, GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches,
    float maxDistance, const vector<GpuMat>& masks, Stream& stream)
{
    if (queryDescs.empty() || trainCollection.empty())
    if (query.empty() || empty())
        return;
    using namespace cv::gpu::bf_radius_match;

    typedef void (*radiusMatch_caller_t)(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_<PtrStep>& maskCollection,
        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches, cudaStream_t stream);
    typedef void (*caller_t)(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks,
        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);

    static const radiusMatch_caller_t radiusMatch_callers[3][8] =
    static const caller_t callers[3][6] =
    {
        {
            radiusMatchCollectionL1_gpu<unsigned char>, 0/*radiusMatchCollectionL1_gpu<signed char>*/, radiusMatchCollectionL1_gpu<unsigned short>,
            radiusMatchCollectionL1_gpu<short>, radiusMatchCollectionL1_gpu<int>, radiusMatchCollectionL1_gpu<float>, 0, 0
            matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/, matchL1_gpu<unsigned short>,
            matchL1_gpu<short>, matchL1_gpu<int>, matchL1_gpu<float>
        },
        {
            0/*radiusMatchCollectionL2_gpu<unsigned char>*/, 0/*radiusMatchCollectionL2_gpu<signed char>*/, 0/*radiusMatchCollectionL2_gpu<unsigned short>*/,
            0/*radiusMatchCollectionL2_gpu<short>*/, 0/*radiusMatchCollectionL2_gpu<int>*/, radiusMatchCollectionL2_gpu<float>, 0, 0
            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/, 0/*matchL2_gpu<unsigned short>*/,
            0/*matchL2_gpu<short>*/, 0/*matchL2_gpu<int>*/, matchL2_gpu<float>
        },
        {
            radiusMatchCollectionHamming_gpu<unsigned char>, 0/*radiusMatchCollectionHamming_gpu<signed char>*/, radiusMatchCollectionHamming_gpu<unsigned short>,
            0/*radiusMatchCollectionHamming_gpu<short>*/, radiusMatchCollectionHamming_gpu<int>, 0, 0, 0
            matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/, matchHamming_gpu<unsigned short>,
            0/*matchHamming_gpu<short>*/, matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
        }
    };
    CV_Assert(TargetArchs::builtWith(SHARED_ATOMICS) && DeviceInfo().supports(SHARED_ATOMICS));
    DeviceInfo info;
    int cc = info.majorVersion() * 10 + info.minorVersion();

    const int nQuery = queryDescs.rows;
    CV_Assert(TargetArchs::builtWith(GLOBAL_ATOMICS) && info.supports(GLOBAL_ATOMICS));

    CV_Assert(queryDescs.channels() == 1 && queryDescs.depth() < CV_64F);
    const int nQuery = query.rows;

    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
    CV_Assert(trainIdx.empty() || (trainIdx.rows == nQuery && trainIdx.size() == distance.size() && trainIdx.size() == imgIdx.size()));

    ensureSizeIsEnough(1, nQuery, CV_32SC1, nMatches);
    if (trainIdx.empty())
    {
        ensureSizeIsEnough(nQuery, nQuery / 2, CV_32SC1, trainIdx);
        ensureSizeIsEnough(nQuery, nQuery / 2, CV_32SC1, imgIdx);
        ensureSizeIsEnough(nQuery, nQuery / 2, CV_32FC1, distance);
        ensureSizeIsEnough(nQuery, std::max((nQuery / 100), 10), CV_32SC1, trainIdx);
        ensureSizeIsEnough(nQuery, std::max((nQuery / 100), 10), CV_32SC1, imgIdx);
        ensureSizeIsEnough(nQuery, std::max((nQuery / 100), 10), CV_32FC1, distance);
    }
    radiusMatch_caller_t func = radiusMatch_callers[distType][queryDescs.depth()];
    caller_t func = callers[distType][query.depth()];
    CV_Assert(func != 0);

    func(queryDescs, trainCollection, maxDistance, maskCollection, trainIdx, imgIdx, distance, nMatches, StreamAccessor::getStream(stream));
    vector<DevMem2D> trains_(trainDescCollection.begin(), trainDescCollection.end());
    vector<DevMem2D> masks_(masks.begin(), masks.end());

    func(query, &trains_[0], static_cast<int>(trains_.size()), maxDistance, masks_.size() == 0 ? 0 : &masks_[0],
        trainIdx, imgIdx, distance, nMatches, cc, StreamAccessor::getStream(stream));
}
void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, const GpuMat& nMatches,
    vector<vector<DMatch> >& matches, bool compactResult)
    vector<vector<DMatch> >& matches, bool compactResult)
{
    if (trainIdx.empty() || imgIdx.empty() || distance.empty() || nMatches.empty())
        return;
...
...
@@ -801,7 +960,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx
    CV_Assert(trainIdx.type() == CV_32SC1);
    CV_Assert(imgIdx.type() == CV_32SC1 && imgIdx.size() == trainIdx.size());
    CV_Assert(distance.type() == CV_32FC1 && distance.size() == trainIdx.size());
    CV_Assert(nMatches.type() == CV_32SC1 && nMatches.isContinuous() && nMatches.cols >= trainIdx.rows);
    CV_Assert(nMatches.type() == CV_32SC1 && nMatches.cols == trainIdx.rows);

    const int nQuery = trainIdx.rows;
...
...
@@ -809,6 +968,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx
    matches.reserve(nQuery);

    const int* nMatches_ptr = nMatches.ptr<int>();
    for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx)
    {
        const int* trainIdx_ptr = trainIdx.ptr<int>(queryIdx);
...
...
@@ -843,18 +1003,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx
    }
}

void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, vector<vector<DMatch> >& matches,
void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& query, vector<vector<DMatch> >& matches,
    float maxDistance, const vector<GpuMat>& masks, bool compactResult)
{
    GpuMat trainCollection;
    GpuMat maskCollection;
    makeGpuCollection(trainCollection, maskCollection, masks);
    GpuMat trainIdx, imgIdx, distance, nMatches;
    radiusMatchCollection(queryDescs, trainCollection, trainIdx, imgIdx, distance, nMatches, maxDistance, maskCollection);
    radiusMatchCollection(query, trainIdx, imgIdx, distance, nMatches, maxDistance, masks);
    radiusMatchDownload(trainIdx, imgIdx, distance, nMatches, matches, compactResult);
}
...
...
modules/gpu/src/cuda/bf_knnmatch.cu
...
...
@@ -49,153 +49,677 @@ using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace bf_knnmatch
{
template <typename VecDiff, typename Dist, typename T, typename Mask>
__device__ void distanceCalcLoop(const PtrStep_<T>& query, const DevMem2D_<T>& train, const Mask& m, int queryIdx,
typename Dist::result_type& distMin1, typename Dist::result_type& distMin2, int& bestTrainIdx1, int& bestTrainIdx2,
typename Dist::result_type* smem)
///////////////////////////////////////////////////////////////////////////////
// Reduction
template <int BLOCK_SIZE>
__device__ void findBestMatch(float& bestDistance1, float& bestDistance2,
int& bestTrainIdx1, int& bestTrainIdx2,
float* s_distance, int* s_trainIdx)
{
const VecDiff vecDiff(query.ptr(queryIdx), train.cols, (typename Dist::value_type*)smem, threadIdx.y * blockDim.x + threadIdx.x, threadIdx.x);
typename Dist::result_type* sdiffRow = smem + blockDim.x * threadIdx.y;
distMin1 = numeric_limits<typename Dist::result_type>::max();
distMin2 = numeric_limits<typename Dist::result_type>::max();
float myBestDistance1 = numeric_limits<float>::max();
float myBestDistance2 = numeric_limits<float>::max();
int myBestTrainIdx1 = -1;
int myBestTrainIdx2 = -1;
s_distance += threadIdx.y * BLOCK_SIZE;
s_trainIdx += threadIdx.y * BLOCK_SIZE;
s_distance[threadIdx.x] = bestDistance1;
s_trainIdx[threadIdx.x] = bestTrainIdx1;
__syncthreads();
if (threadIdx.x == 0)
{
#pragma unroll
for (int i = 0; i < BLOCK_SIZE; ++i)
{
float val = s_distance[i];
if (val < myBestDistance1)
{
myBestDistance2 = myBestDistance1;
myBestTrainIdx2 = myBestTrainIdx1;
myBestDistance1 = val;
myBestTrainIdx1 = s_trainIdx[i];
}
else if (val < myBestDistance2)
{
myBestDistance2 = val;
myBestTrainIdx2 = s_trainIdx[i];
}
}
}
__syncthreads();
bestTrainIdx1 = -1;
bestTrainIdx2 = -1;
s_distance[threadIdx.x] = bestDistance2;
s_trainIdx[threadIdx.x] = bestTrainIdx2;
for (int trainIdx = threadIdx.y; trainIdx < train.rows; trainIdx += blockDim.y)
__syncthreads();
if (threadIdx.x == 0)
{
if (m(queryIdx, trainIdx))
#pragma unroll
for (int i = 0; i < BLOCK_SIZE; ++i)
{
float val = s_distance[i];
if (val < myBestDistance2)
{
myBestDistance2 = val;
myBestTrainIdx2 = s_trainIdx[i];
}
}
}
bestDistance1 = myBestDistance1;
bestDistance2 = myBestDistance2;
bestTrainIdx1 = myBestTrainIdx1;
bestTrainIdx2 = myBestTrainIdx2;
}
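The findBestMatch reduction above keeps the two smallest distances per query: a first scan over the per-thread winners establishes the global best (demoting the old best to runner-up as it goes), then a second scan over the per-thread second-best values refines the runner-up. A host-side analogue for illustration (a sketch, not part of the commit; bestTwo is a hypothetical name):

    #include <limits>
    #include <utility>

    // best1[i]/best2[i]: thread i's best and second-best candidate distances.
    std::pair<float, float> bestTwo(const float* best1, const float* best2, int n)
    {
        float d1 = std::numeric_limits<float>::max();
        float d2 = std::numeric_limits<float>::max();
        for (int i = 0; i < n; ++i)              // scan 1: first-place candidates
        {
            if (best1[i] < d1) { d2 = d1; d1 = best1[i]; }
            else if (best1[i] < d2) d2 = best1[i];
        }
        for (int i = 0; i < n; ++i)              // scan 2: second-place candidates
            if (best2[i] < d2) d2 = best2[i];
        return std::make_pair(d1, d2);
    }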
template <int BLOCK_SIZE>
__device__ void findBestMatch(float& bestDistance1, float& bestDistance2,
int& bestTrainIdx1, int& bestTrainIdx2,
int& bestImgIdx1, int& bestImgIdx2,
float* s_distance, int* s_trainIdx, int* s_imgIdx)
{
float myBestDistance1 = numeric_limits<float>::max();
float myBestDistance2 = numeric_limits<float>::max();
int myBestTrainIdx1 = -1;
int myBestTrainIdx2 = -1;
int myBestImgIdx1 = -1;
int myBestImgIdx2 = -1;
s_distance += threadIdx.y * BLOCK_SIZE;
s_trainIdx += threadIdx.y * BLOCK_SIZE;
s_imgIdx += threadIdx.y * BLOCK_SIZE;
s_distance[threadIdx.x] = bestDistance1;
s_trainIdx[threadIdx.x] = bestTrainIdx1;
s_imgIdx[threadIdx.x] = bestImgIdx1;
__syncthreads();
if (threadIdx.x == 0)
{
#pragma unroll
for (int i = 0; i < BLOCK_SIZE; ++i)
{
Dist dist;
float val = s_distance[i];
if (val < myBestDistance1)
{
myBestDistance2 = myBestDistance1;
myBestTrainIdx2 = myBestTrainIdx1;
myBestImgIdx2 = myBestImgIdx1;
myBestDistance1 = val;
myBestTrainIdx1 = s_trainIdx[i];
myBestImgIdx1 = s_imgIdx[i];
}
else if (val < myBestDistance2)
{
myBestDistance2 = val;
myBestTrainIdx2 = s_trainIdx[i];
myBestImgIdx2 = s_imgIdx[i];
}
}
}
__syncthreads();
s_distance[threadIdx.x] = bestDistance2;
s_trainIdx[threadIdx.x] = bestTrainIdx2;
s_imgIdx[threadIdx.x] = bestImgIdx2;
__syncthreads();
if (threadIdx.x == 0)
{
#pragma unroll
for (int i = 0; i < BLOCK_SIZE; ++i)
{
float val = s_distance[i];
if (val < myBestDistance2)
{
myBestDistance2 = val;
myBestTrainIdx2 = s_trainIdx[i];
myBestImgIdx2 = s_imgIdx[i];
}
}
}
bestDistance1 = myBestDistance1;
bestDistance2 = myBestDistance2;
bestTrainIdx1 = myBestTrainIdx1;
bestTrainIdx2 = myBestTrainIdx2;
bestImgIdx1 = myBestImgIdx1;
bestImgIdx2 = myBestImgIdx2;
}
///////////////////////////////////////////////////////////////////////////////
// Match Unrolled Cached
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U>
__device__ void loadQueryToSmem(int queryIdx, const DevMem2D_<T>& query, U* s_query)
{
#pragma unroll
for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
{
const int loadX = threadIdx.x + i * BLOCK_SIZE;
s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(min(queryIdx, query.rows - 1))[loadX] : 0;
}
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__device__ void loopUnrolledCached(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask,
typename Dist::value_type* s_query, typename Dist::value_type* s_train,
float& bestDistance1, float& bestDistance2,
int& bestTrainIdx1, int& bestTrainIdx2,
int& bestImgIdx1, int& bestImgIdx2)
{
for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
{
Dist dist;
#pragma unroll
for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
{
const int loadX = threadIdx.x + i * BLOCK_SIZE;
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = loadX < train.cols ? train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX] : 0;
__syncthreads();
#pragma unroll
for (int j = 0; j < BLOCK_SIZE; ++j)
dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
__syncthreads();
}
const T* trainRow = train.ptr(trainIdx);
vecDiff.calc(trainRow, train.cols, dist, sdiffRow, threadIdx.x);
typename Dist::result_type distVal = dist;
const typename Dist::result_type val = dist;
const int trainIdx = t * BLOCK_SIZE + threadIdx.x;
if (val < distMin1)
if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
{
if (distVal < bestDistance1)
{
distMin1 = val;
bestImgIdx2 = bestImgIdx1;
bestDistance2 = bestDistance1;
bestTrainIdx2 = bestTrainIdx1;
bestImgIdx1 = imgIdx;
bestDistance1 = distVal;
bestTrainIdx1 = trainIdx;
}
else if (val < distMin2)
else if (distVal < bestDistance2)
{
distMin2 = val;
bestImgIdx2 = imgIdx;
bestDistance2 = distVal;
bestTrainIdx2 = trainIdx;
}
}
}
}
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename VecDiff, typename Dist, typename T, typename Mask>
__global__ void knnMatch2(const PtrStep_<T> query, const DevMem2D_<T> train, const Mask m, int2* trainIdx, float2* distance)
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
{
typedef typename Dist::result_type result_type;
typedef typename Dist::value_type value_type;
extern __shared__ int smem[];
__shared__ result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y];
const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
const int queryIdx = blockIdx.x;
typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);
result_type distMin1;
result_type distMin2;
loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);
int bestTrainIdx1;
int bestTrainIdx2;
float myBestDistance1 = numeric_limits<float>::max();
float myBestDistance2 = numeric_limits<float>::max();
int myBestTrainIdx1 = -1;
int myBestTrainIdx2 = -1;
loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);
distanceCalcLoop<VecDiff, Dist>(query, train, m, queryIdx, distMin1, distMin2, bestTrainIdx1, bestTrainIdx2, smem);
__syncthreads();
volatile result_type* sdistMinRow = smem;
volatile int* sbestTrainIdxRow = (int*)(sdistMinRow + 2 * BLOCK_DIM_Y);
float* s_distance = (float*)(smem);
int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
if (threadIdx.x == 0)
findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);
if (queryIdx < query.rows && threadIdx.x == 0)
{
sdistMinRow[threadIdx.y] = distMin1;
sdistMinRow[threadIdx.y + BLOCK_DIM_Y] = distMin2;
bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
}
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance,
cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(query.rows, BLOCK_SIZE));
const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
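As a worked example of the shared-memory budget computed above (assuming BLOCK_SIZE = 16, MAX_DESC_LEN = 64, and 4-byte elements):

    // query cache : 16 rows * 64 values        = 1024 words
    // train tile  : 16 * 16                    =  256 words
    // smemSize    : (1024 + 256) * sizeof(int) = 5120 bytes per block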
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
{
extern __shared__ int smem[];
const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);
loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);
float myBestDistance1 = numeric_limits<float>::max();
float myBestDistance2 = numeric_limits<float>::max();
int myBestTrainIdx1 = -1;
int myBestTrainIdx2 = -1;
int myBestImgIdx1 = -1;
int myBestImgIdx2 = -1;
sbestTrainIdxRow[threadIdx.y] = bestTrainIdx1;
sbestTrainIdxRow[threadIdx.y + BLOCK_DIM_Y] = bestTrainIdx2;
Mask m = mask;
for (int imgIdx = 0; imgIdx < n; ++imgIdx)
{
const DevMem2D_<T> train = trains[imgIdx];
m.next();
loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
}
__syncthreads();
if (threadIdx.x == 0 && threadIdx.y == 0)
float* s_distance = (float*)(smem);
int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);
findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);
if (queryIdx < query.rows && threadIdx.x == 0)
{
distMin1 = numeric_limits<result_type>::max();
distMin2 = numeric_limits<result_type>::max();
bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
}
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
const DevMem2D_<int2>& trainIdx, const DevMem2D_<int2>& imgIdx, const DevMem2D_<float2>& distance,
cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(query.rows, BLOCK_SIZE));
bestTrainIdx1 = -1;
bestTrainIdx2 = -1;
const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////////////////////
// Match Unrolled
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__device__ void loopUnrolled(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask,
typename Dist::value_type* s_query, typename Dist::value_type* s_train,
float& bestDistance1, float& bestDistance2,
int& bestTrainIdx1, int& bestTrainIdx2,
int& bestImgIdx1, int& bestImgIdx2)
{
for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
{
Dist dist;
#pragma unroll
for (int i = 0; i < BLOCK_DIM_Y; ++i)
for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
{
result_type val = sdistMinRow[i];
const int loadX = threadIdx.x + i * BLOCK_SIZE;
if (val < distMin1)
if (loadX < query.cols)
{
distMin1 = val;
bestTrainIdx1 = sbestTrainIdxRow[i];
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(min(queryIdx, query.rows - 1))[loadX];
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
}
else if (val < distMin2)
{
distMin2 = val;
bestTrainIdx2 = sbestTrainIdxRow[i];
else
{
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
}
__syncthreads();
#pragma unroll
for (int j = 0; j < BLOCK_SIZE; ++j)
dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
__syncthreads();
}
#pragma unroll
for (int i = BLOCK_DIM_Y; i < 2 * BLOCK_DIM_Y; ++i)
typename Dist::result_type distVal = dist;
const int trainIdx = t * BLOCK_SIZE + threadIdx.x;
if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
{
result_type val = sdistMinRow[i];
if (distVal < bestDistance1)
{
bestImgIdx2 = bestImgIdx1;
bestDistance2 = bestDistance1;
bestTrainIdx2 = bestTrainIdx1;
if (val < distMin2)
bestImgIdx1 = imgIdx;
bestDistance1 = distVal;
bestTrainIdx1 = trainIdx;
}
else if (distVal < bestDistance2)
{
distMin2 = val;
bestTrainIdx2 = sbestTrainIdxRow[i];
bestImgIdx2 = imgIdx;
bestDistance2 = distVal;
bestTrainIdx2 = trainIdx;
}
}
}
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
{
extern __shared__ int smem[];
const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
float myBestDistance1 = numeric_limits<float>::max();
float myBestDistance2 = numeric_limits<float>::max();
int myBestTrainIdx1 = -1;
int myBestTrainIdx2 = -1;
loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);
__syncthreads();
float* s_distance = (float*)(smem);
int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);
if (queryIdx < query.rows && threadIdx.x == 0)
{
bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
}
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance,
cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(query.rows, BLOCK_SIZE));
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
{
extern __shared__ int smem[];
const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
float myBestDistance1 = numeric_limits<float>::max();
float myBestDistance2 = numeric_limits<float>::max();
int myBestTrainIdx1 = -1;
int myBestTrainIdx2 = -1;
int myBestImgIdx1 = -1;
int myBestImgIdx2 = -1;
Mask m = mask;
for (int imgIdx = 0; imgIdx < n; ++imgIdx)
{
const DevMem2D_<T> train = trains[imgIdx];
m.next();
loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
}
__syncthreads();
float* s_distance = (float*)(smem);
int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);
findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);
trainIdx[queryIdx] = make_int2(bestTrainIdx1, bestTrainIdx2);
distance[queryIdx] = make_float2(distMin1, distMin2);
if (queryIdx < query.rows && threadIdx.x == 0)
{
bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
}
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
const DevMem2D_<int2>& trainIdx, const DevMem2D_<int2>& imgIdx, const DevMem2D_<float2>& distance,
cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(query.rows, BLOCK_SIZE));
const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////////////////////
// Knn 2 Match kernel caller
// Match
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__device__ void loop(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask,
typename Dist::value_type* s_query, typename Dist::value_type* s_train,
float& bestDistance1, float& bestDistance2,
int& bestTrainIdx1, int& bestTrainIdx2,
int& bestImgIdx1, int& bestImgIdx2)
{
for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
{
Dist dist;
for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
{
const int loadX = threadIdx.x + i * BLOCK_SIZE;
if (loadX < query.cols)
{
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(min(queryIdx, query.rows - 1))[loadX];
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
}
else
{
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
}
__syncthreads();
#pragma unroll
for (int j = 0; j < BLOCK_SIZE; ++j)
dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
__syncthreads();
}
typename Dist::result_type distVal = dist;
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
void knnMatch2Simple_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance,
cudaStream_t stream)
const int trainIdx = t * BLOCK_SIZE + threadIdx.x;
if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
{
if (distVal < bestDistance1)
{
bestImgIdx2 = bestImgIdx1;
bestDistance2 = bestDistance1;
bestTrainIdx2 = bestTrainIdx1;
bestImgIdx1 = imgIdx;
bestDistance1 = distVal;
bestTrainIdx1 = trainIdx;
}
else if (distVal < bestDistance2)
{
bestImgIdx2 = imgIdx;
bestDistance2 = distVal;
bestTrainIdx2 = trainIdx;
}
}
}
}
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__global__ void match(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
{
const dim3 grid(query.rows, 1, 1);
const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
extern __shared__ int smem[];
const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
knnMatch2<BLOCK_DIM_X, BLOCK_DIM_Y, VecDiffGlobal<BLOCK_DIM_X, T>, Dist, T>
<<<grid, threads, 0, stream>>>(query, train, mask, trainIdx, distance);
float myBestDistance1 = numeric_limits<float>::max();
float myBestDistance2 = numeric_limits<float>::max();
int myBestTrainIdx1 = -1;
int myBestTrainIdx2 = -1;
loop<BLOCK_SIZE, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);
__syncthreads();
float* s_distance = (float*)(smem);
int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);
if (queryIdx < query.rows && threadIdx.x == 0)
{
bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
}
}
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
void match(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance,
cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(query.rows, BLOCK_SIZE));
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T, typename Mask>
void knnMatch2Cached_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance,
cudaStream_t stream)
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__global__ void match(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
{
StaticAssert<BLOCK_DIM_X * BLOCK_DIM_Y >= MAX_LEN>::check(); // block size must be greater than descriptor length
StaticAssert<MAX_LEN % BLOCK_DIM_X == 0>::check(); // max descriptor length must be divisible by blockDimX
extern __shared__ int smem[];
const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
const dim3 grid(query.rows, 1, 1);
const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
knnMatch2<BLOCK_DIM_X, BLOCK_DIM_Y, VecDiffCachedRegister<BLOCK_DIM_X, MAX_LEN, LEN_EQ_MAX_LEN, typename Dist::value_type>, Dist, T>
<<<grid, threads, 0, stream>>>(query, train, mask, trainIdx.data, distance.data);
float myBestDistance1 = numeric_limits<float>::max();
float myBestDistance2 = numeric_limits<float>::max();
int myBestTrainIdx1 = -1;
int myBestTrainIdx2 = -1;
int myBestImgIdx1 = -1;
int myBestImgIdx2 = -1;
Mask m = mask;
for (int imgIdx = 0; imgIdx < n; ++imgIdx)
{
const DevMem2D_<T> train = trains[imgIdx];
m.next();
loop<BLOCK_SIZE, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
}
__syncthreads();
float* s_distance = (float*)(smem);
int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);
findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);
if (queryIdx < query.rows && threadIdx.x == 0)
{
bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
}
}
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
void match(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
const DevMem2D_<int2>& trainIdx, const DevMem2D_<int2>& imgIdx, const DevMem2D_<float2>& distance,
cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(query.rows, BLOCK_SIZE));
const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
...
...
@@ -203,142 +727,254 @@ namespace cv { namespace gpu { namespace bf_knnmatch
}
///////////////////////////////////////////////////////////////////////////////
// Knn 2 Match Dispatcher
template <typename Dist, typename T, typename Mask>
void knnMatch2Dispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
const DevMem2D& trainIdx, const DevMem2D& distance,
int cc, cudaStream_t stream)
// knnMatch 2 dispatcher
template <typename Dist, typename T, typename Mask>
void match2Dispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
const DevMem2D& trainIdx, const DevMem2D& distance,
int cc, cudaStream_t stream)
{
if (query.cols < 64)
if (query.cols <= 64)
{
knnMatch2Cached_caller<16, 16, 64, false, Dist>(
query, train, mask,
static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
stream);
matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);
}
else if (query.cols == 64)
else if (query.cols <= 128)
{
knnMatch2Cached_caller<16, 16, 64, true, Dist>(
query, train, mask,
static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
stream);
matchUnrolledCached<16, 128, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);
}
else if (query.cols < 128)
else if (query.cols <= 256)
{
knnMatch2Cached_caller<16, 16, 128, false, Dist>(
query, train, mask,
static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
stream);
matchUnrolled<16, 256, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);
}
else if (query.cols == 128 && cc >= 12)
else if (query.cols <= 512)
{
matchUnrolled<16, 512, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);
}
else if (query.cols <= 1024)
{
matchUnrolled<16, 1024, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);
}
else
{
match<16, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> > (distance), stream);
}
}
template <typename Dist, typename T, typename Mask>
void match2Dispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
int cc, cudaStream_t stream)
{
if (query.cols <= 64)
{
knnMatch2Cached_caller<16, 16, 128, true, Dist>(
query, train, mask,
static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
stream);
matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);
}
else if (query.cols < 256 && cc >= 12)
else if (query.cols <= 128)
{
knnMatch2Cached_caller<16, 16, 256, false, Dist>(
query, train, mask,
static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
stream);
matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);
}
else if (query.cols == 256 && cc >= 12)
else if (query.cols <= 256)
{
knnMatch2Cached_caller<16, 16, 256, true, Dist>(
query, train, mask,
static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
stream);
matchUnrolled<16, 256, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);
}
else if (query.cols <= 512)
{
matchUnrolled<16, 512, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);
}
else if (query.cols <= 1024)
{
matchUnrolled<16, 1024, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);
}
else
{
knnMatch2Simple_caller<16, 16, Dist>(
query, train, mask,
static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance),
stream);
match<16, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> > (distance), stream);
}
}
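Both dispatchers above bucket the runtime descriptor length at fixed boundaries so that each bucket can instantiate a kernel whose per-descriptor loop is fully unrolled at compile time. A self-contained sketch of the pattern (illustrative only; runUnrolled and dispatch are hypothetical names standing in for the kernel launches):

    #include <cstdio>

    // Each instantiation stands in for an unrolled kernel specialized
    // for descriptors of at most MAX_DESC_LEN elements.
    template <int MAX_DESC_LEN>
    void runUnrolled(int cols) { std::printf("unrolled<%d> kernel for %d cols\n", MAX_DESC_LEN, cols); }

    void dispatch(int cols)
    {
        if      (cols <=   64) runUnrolled<  64>(cols);
        else if (cols <=  128) runUnrolled< 128>(cols);
        else if (cols <=  256) runUnrolled< 256>(cols);
        else if (cols <=  512) runUnrolled< 512>(cols);
        else if (cols <= 1024) runUnrolled<1024>(cols);
        else                   std::printf("generic kernel for %d cols\n", cols);
    }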
///////////////////////////////////////////////////////////////////////////////
// Calc distance kernel
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
__global__ void calcDistance(const PtrStep_<T> query, const DevMem2D_<T> train, const Mask mask, PtrStepf distance)
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__global__ void calcDistanceUnrolled(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, PtrStepf allDist)
{
__shared__ typename Dist::result_type sdiff[BLOCK_DIM_X * BLOCK_DIM_Y];
extern __shared__ int smem[];
typename Dist::result_type* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y;
const int queryIdx = blockIdx.x;
const T* queryDescs = query.ptr(queryIdx);
const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
const int trainIdx = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;
typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
if (trainIdx < train.rows)
Dist dist;
#pragma unroll
for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
{
const T* trainDescs = train.ptr(trainIdx);
const int loadX = threadIdx.x + i * BLOCK_SIZE;
if (loadX < query.cols)
{
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(min(queryIdx, query.rows - 1))[loadX];
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
}
else
{
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
}
typename Dist::result_type myDist = numeric_limits<typename Dist::result_type>::max();
__syncthreads();
#pragma unroll
for (int j = 0; j < BLOCK_SIZE; ++j)
dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
__syncthreads();
}
if (queryIdx < query.rows && trainIdx < train.rows)
{
float distVal = numeric_limits<float>::max();
if (mask(queryIdx, trainIdx))
{
Dist dist;
distVal = (typename Dist::result_type)dist;
allDist.ptr(queryIdx)[trainIdx] = distVal;
}
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void calcDistanceUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream)
{
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
calcDistanceUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__global__ void calcDistance(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, PtrStepf allDist)
{
extern __shared__ int smem[];
const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;
typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
calcVecDiffGlobal<BLOCK_DIM_X>(queryDescs, trainDescs, train.cols, dist, sdiff_row, threadIdx.x);
Dist dist;
myDist = dist;
for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
{
const int loadX = threadIdx.x + i * BLOCK_SIZE;
if (loadX < query.cols)
{
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(min(queryIdx, query.rows - 1))[loadX];
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
}
if (threadIdx.x == 0)
distance.ptr(queryIdx)[trainIdx] = myDist;
else
{
s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
}
__syncthreads();
#pragma unroll
for (int j = 0; j < BLOCK_SIZE; ++j)
dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
__syncthreads();
}
}
///////////////////////////////////////////////////////////////////////////////
// Calc distance kernel caller
if (queryIdx < query.rows && trainIdx < train.rows)
{
float distVal = numeric_limits<float>::max();
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
void calcDistance_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2Df& distance, cudaStream_t stream)
if (mask(queryIdx, trainIdx))
distVal = (typename Dist::result_type)dist;
allDist.ptr(queryIdx)[trainIdx] = distVal;
}
}
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
void calcDistance(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream)
{
const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
const dim3 grid(query.rows, divUp(train.rows, BLOCK_DIM_Y), 1);
const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));
const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
calcDistance<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads, 0, stream>>>(query, train, mask, distance);
calcDistance<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);
cudaSafeCall( cudaGetLastError() );
if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
template <typename Dist, typename T, typename Mask>
void calcDistanceDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2D& allDist, cudaStream_t stream)
///////////////////////////////////////////////////////////////////////////////
// Calc Distance dispatcher
template <typename Dist, typename T, typename Mask>
void calcDistanceDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
const DevMem2Df& allDist,
int cc, cudaStream_t stream)
{
calcDistance_caller<16, 16, Dist>(query, train, mask, static_cast<DevMem2Df>(allDist), stream);
if (query.cols <= 64)
{
calcDistanceUnrolled<16, 64, Dist>(query, train, mask, allDist, stream);
}
else if (query.cols <= 128)
{
calcDistanceUnrolled<16, 128, Dist>(query, train, mask, allDist, stream);
}
else if (query.cols <= 256)
{
calcDistanceUnrolled<16, 256, Dist>(query, train, mask, allDist, stream);
}
else if (query.cols <= 512)
{
calcDistanceUnrolled<16, 512, Dist>(query, train, mask, allDist, stream);
}
else if (query.cols <= 1024)
{
calcDistanceUnrolled<16, 1024, Dist>(query, train, mask, allDist, stream);
}
else
{
calcDistance<16, Dist>(query, train, mask, allDist, stream);
}
}
///////////////////////////////////////////////////////////////////////////////
// find knn match kernel
template <int BLOCK_SIZE> __global__ void findBestMatch(DevMem2Df allDist_, int i, PtrStepi trainIdx_, PtrStepf distance_)
template <int BLOCK_SIZE>
__global__ void findBestMatch(DevMem2Df allDist, int i, PtrStepi trainIdx, PtrStepf distance)
{
const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;
__shared__ float sdist[SMEM_SIZE];
__shared__ int strainIdx[SMEM_SIZE];
__shared__ float s_dist[SMEM_SIZE];
__shared__ int s_trainIdx[SMEM_SIZE];
const int queryIdx = blockIdx.x;
float* allDist = allDist_.ptr(queryIdx);
int* trainIdx = trainIdx_.ptr(queryIdx);
float* distance = distance_.ptr(queryIdx);
float* allDistRow = allDist.ptr(queryIdx);
float dist = numeric_limits<float>::max();
int bestIdx = -1;
for (int i = threadIdx.x; i < allDist_.cols; i += BLOCK_SIZE)
for (int i = threadIdx.x; i < allDist.cols; i += BLOCK_SIZE)
{
float reg = allDist[i];
float reg = allDistRow[i];
if (reg < dist)
{
dist = reg;
...
...
@@ -346,34 +982,32 @@ namespace cv { namespace gpu { namespace bf_knnmatch
}
}
sdist[threadIdx.x] = dist;
strainIdx[threadIdx.x] = bestIdx;
s_dist[threadIdx.x] = dist;
s_trainIdx[threadIdx.x] = bestIdx;
__syncthreads();
reducePredVal<BLOCK_SIZE>(sdist, dist, strainIdx, bestIdx, threadIdx.x, less<volatile float>());
reducePredVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<volatile float>());
if (threadIdx.x == 0)
{
if (dist < numeric_limits<float>::max())
{
allDist[bestIdx] = numeric_limits<float>::max();
trainIdx[i] = bestIdx;
distance[i] = dist;
allDistRow[bestIdx] = numeric_limits<float>::max();
trainIdx.ptr(queryIdx)[i] = bestIdx;
distance.ptr(queryIdx)[i] = dist;
}
}
}
///////////////////////////////////////////////////////////////////////////////
// find knn match kernel caller
template <int BLOCK_SIZE> void findKnnMatch_caller(int k, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)
template <int BLOCK_SIZE>
void findKnnMatch(int k, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)
{
const dim3 threads(BLOCK_SIZE, 1, 1);
const dim3 block(BLOCK_SIZE, 1, 1);
const dim3 grid(trainIdx.rows, 1, 1);
for (int i = 0; i < k; ++i)
{
findBestMatch<BLOCK_SIZE><<<grid, threads, 0, stream>>>(allDist, i, trainIdx, distance);
findBestMatch<BLOCK_SIZE><<<grid, block, 0, stream>>>(allDist, i, trainIdx, distance);
cudaSafeCall( cudaGetLastError() );
}
...
...
@@ -381,84 +1015,130 @@ namespace cv { namespace gpu { namespace bf_knnmatch
cudaSafeCall( cudaDeviceSynchronize() );
}
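findBestMatch is launched k times per query row: each pass extracts the current minimum of the distance row and overwrites it with FLT_MAX so the next pass finds the next-best. A host-side sketch of the same extraction (illustrative only, not part of the commit; kSmallest is a hypothetical name):

    #include <limits>

    // Writes the k smallest entries of dist (and their indices) in ascending
    // order; dist is modified in place by the masking step.
    void kSmallest(float* dist, int n, int k, int* idxOut, float* distOut)
    {
        for (int pass = 0; pass < k; ++pass)
        {
            int best = -1;
            float bestDist = std::numeric_limits<float>::max();
            for (int i = 0; i < n; ++i)
                if (dist[i] < bestDist) { bestDist = dist[i]; best = i; }
            idxOut[pass]  = best;
            distOut[pass] = bestDist;
            if (best >= 0) dist[best] = std::numeric_limits<float>::max(); // mask out
        }
    }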
void findKnnMatchDispatcher(int k, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, cudaStream_t stream)
void findKnnMatchDispatcher(int k, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream)
{
findKnnMatch_caller<256>(k, static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Df>(distance), static_cast<DevMem2Df>(allDist), stream);
findKnnMatch<256>(k, static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Df>(distance), allDist, stream);
}
///////////////////////////////////////////////////////////////////////////////
// knn match Dispatcher
template <typename Dist, typename T>
void knnMatchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, int k, const DevMem2D& mask,
const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
template <typename Dist, typename T, typename Mask>
void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, int k, const Mask& mask,
const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist,
int cc, cudaStream_t stream)
{
if (mask.data)
if (k == 2)
{
if (k == 2)
{
knnMatch2Dispatcher<Dist>(query, train, SingleMask(mask), trainIdx, distance, cc, stream);
return;
}
calcDistanceDispatcher<Dist>(query, train, SingleMask(mask), allDist, stream);
match2Dispatcher<Dist>(query, train, mask, trainIdx, distance, cc, stream);
}
else
{
if (k == 2)
{
knnMatch2Dispatcher<Dist>(query, train, WithOutMask(), trainIdx, distance, cc, stream);
return;
}
calcDistanceDispatcher<Dist>(query, train, WithOutMask(), allDist, stream);
calcDistanceDispatcher<Dist>(query, train, mask, allDist, cc, stream);
findKnnMatchDispatcher(k, trainIdx, distance, allDist, cc, stream);
}
findKnnMatchDispatcher(k, trainIdx, distance, allDist, stream);
}
}
///////////////////////////////////////////////////////////////////////////////
// knn match caller
template <typename T> void knnMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
template <typename T> void matchL1_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist,
int cc, cudaStream_t stream)
{
knnMatchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, mask, trainIdx, distance, allDist, cc, stream);
if (mask.data)
matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
else
matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
}
template void knnMatchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
//template void knnMatchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
template void knnMatchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
template void knnMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
template void knnMatchL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
template void knnMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist, int cc, cudaStream_t stream);
template void matchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
//template void matchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
template void matchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
template void matchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
template void matchL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
template void matchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
template <typename T> void matchL2_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, 
    const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, 
    int cc, cudaStream_t stream)
{
    if (mask.data)
        matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
    else
        matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
}

//template void matchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
//template void matchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
//template void matchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
//template void matchL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
template void matchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
template <typename T> void matchHamming_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask, 
    const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, 
    int cc, cudaStream_t stream)
{
    if (mask.data)
        matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
    else
        matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
}

template void matchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
//template void matchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
template void matchHamming_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
template <typename T> void match2L1_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, 
    const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, 
    int cc, cudaStream_t stream)
{
    if (masks.data)
        match2Dispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
    else
        match2Dispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
}
template void match2L1_gpu<uchar >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
//template void match2L1_gpu<schar >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void match2L1_gpu<ushort>(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void match2L1_gpu<short >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void match2L1_gpu<int >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void match2L1_gpu<float >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template <typename T> void match2L2_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, 
    const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, 
    int cc, cudaStream_t stream)
{
    if (masks.data)
        match2Dispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
    else
        match2Dispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
}
//template void match2L2_gpu<uchar >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
//template void match2L2_gpu<schar >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
//template void match2L2_gpu<ushort>(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
//template void match2L2_gpu<short >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
//template void match2L2_gpu<int   >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void match2L2_gpu<float >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template <typename T> void match2Hamming_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, 
    const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, 
    int cc, cudaStream_t stream)
{
    if (masks.data)
        match2Dispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
    else
        match2Dispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
}
template void match2Hamming_gpu<uchar >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
//template void match2Hamming_gpu<schar >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void match2Hamming_gpu<ushort>(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
//template void match2Hamming_gpu<short >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void match2Hamming_gpu<int >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
}}}
modules/gpu/src/cuda/bf_match.cu
View file @ d3c4e907
@@ -49,355 +49,715 @@ using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace bf_match
{
///////////////////////////////////////////////////////////////////////////////
// Reduction

template <int BLOCK_SIZE> 
__device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, float* s_distance, int* s_trainIdx)
{
    s_distance += threadIdx.y * BLOCK_SIZE;
    s_trainIdx += threadIdx.y * BLOCK_SIZE;

    s_distance[threadIdx.x] = bestDistance;
    s_trainIdx[threadIdx.x] = bestTrainIdx;

    __syncthreads();

    reducePredVal<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less<volatile float>());
}

template <int BLOCK_SIZE> 
__device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, int& bestImgIdx, float* s_distance, int* s_trainIdx, int* s_imgIdx)
{
    s_distance += threadIdx.y * BLOCK_SIZE;
    s_trainIdx += threadIdx.y * BLOCK_SIZE;
    s_imgIdx   += threadIdx.y * BLOCK_SIZE;

    s_distance[threadIdx.x] = bestDistance;
    s_trainIdx[threadIdx.x] = bestTrainIdx;
    s_imgIdx  [threadIdx.x] = bestImgIdx;

    __syncthreads();

    reducePredVal2<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, s_imgIdx, bestImgIdx, threadIdx.x, less<volatile float>());
}
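reducePredVal and reducePredVal2 live in the device utility headers that this commit extends (utility.hpp / detail/utility_detail.hpp). The underlying pattern is an ordinary shared-memory tree reduction that drags payload values (train index, image index) along with the key being minimized. A stripped-down sketch of the idea, assuming a power-of-two BLOCK_SIZE; this is an illustration of the pattern, not the actual reducePredVal implementation:

// Sketch of an argmin-style reduction: keep the smallest distance and the
// index that produced it. Not the real reducePredVal -- just the pattern.
template <int BLOCK_SIZE>
__device__ void argminReduceSketch(volatile float* s_dist, volatile int* s_idx, int tid)
{
    #pragma unroll
    for (int offset = BLOCK_SIZE / 2; offset > 0; offset /= 2)
    {
        if (tid < offset && s_dist[tid + offset] < s_dist[tid])
        {
            s_dist[tid] = s_dist[tid + offset];
            s_idx[tid]  = s_idx[tid + offset];
        }
        __syncthreads();
    }
    // After the loop, s_dist[0] / s_idx[0] hold the row's best match.
}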
///////////////////////////////////////////////////////////////////////////////
// Match Unrolled Cached

template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U> 
__device__ void loadQueryToSmem(int queryIdx, const DevMem2D_<T>& query, U* s_query)
{
    #pragma unroll
    for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
    {
        const int loadX = threadIdx.x + i * BLOCK_SIZE;
        s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(min(queryIdx, query.rows - 1))[loadX] : 0;
    }
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> 
__device__ void loopUnrolledCached(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask, 
                                   typename Dist::value_type* s_query, typename Dist::value_type* s_train, 
                                   float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
{
    for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
    {
        Dist dist;

        #pragma unroll
        for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
        {
            const int loadX = threadIdx.x + i * BLOCK_SIZE;

            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = loadX < train.cols ? train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX] : 0;

            __syncthreads();

            #pragma unroll
            for (int j = 0; j < BLOCK_SIZE; ++j)
                dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

            __syncthreads();
        }

        typename Dist::result_type distVal = dist;

        const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

        if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))
        {
            bestImgIdx = imgIdx;
            bestDistance = distVal;
            bestTrainIdx = trainIdx;
        }
    }
}
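This loop is the same blocking scheme as a tiled matrix multiply: threadIdx.y fixes one query descriptor, threadIdx.x fixes one candidate train descriptor, and the i/j loops sweep BLOCK_SIZE-wide slices of the descriptor through shared memory, accumulating into Dist via reduceIter. Stripped of the tiling, each thread is effectively computing the following (a scalar model for L1 on float descriptors, illustrative only):

// Scalar model of one thread's work in loopUnrolledCached, shown for L1Dist
// on float descriptors (illustration; the kernel stages this through smem tiles).
#include <cmath>

float l1DistanceSketch(const float* queryRow, const float* trainRow, int descLen)
{
    float acc = 0.f;                                  // Dist dist;
    for (int x = 0; x < descLen; ++x)                 // the unrolled i/j slice loops
        acc += std::fabs(queryRow[x] - trainRow[x]);  // dist.reduceIter(q, t) for L1
    return acc;                                       // typename Dist::result_type distVal = dist;
}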
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> 
__global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);

    loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);

    float myBestDistance = numeric_limits<float>::max();
    int myBestTrainIdx = -1;

    loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);

    __syncthreads();

    float* s_distance = (float*)(smem);
    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);

    if (queryIdx < query.rows && threadIdx.x == 0)
    {
        bestTrainIdx[queryIdx] = myBestTrainIdx;
        bestDistance[queryIdx] = myBestDistance;
    }
}

template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> 
void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, 
                         const DevMem2Di& trainIdx, const DevMem2Df& distance, 
                         cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(query.rows, BLOCK_SIZE));

    const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
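The shared-memory request covers the cached query rows (BLOCK_SIZE * MAX_DESC_LEN elements) plus one BLOCK_SIZE x BLOCK_SIZE train tile, and the comparison against BLOCK_SIZE keeps the buffer large enough for the reduction that reuses it. For the BLOCK_SIZE = 16, MAX_DESC_LEN = 64 instantiation that works out to (16*64 + 16*16) * 4 = 5120 bytes, well under the 16 KB per-block limit of the compute 1.x devices this module still targets. A quick host-side check of that arithmetic (illustrative):

// Sanity-check of the launch's dynamic shared-memory budget (illustrative).
#include <cstdio>

int main()
{
    const int BLOCK_SIZE = 16, MAX_DESC_LEN = 64;
    const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE)
                             + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
    std::printf("%u bytes\n", (unsigned)smemSize);  // prints 5120
    return 0;
}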
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> 
__global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, 
                                    int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);

    loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);

    float myBestDistance = numeric_limits<float>::max();
    int myBestTrainIdx = -1;
    int myBestImgIdx = -1;

    Mask m = mask;

    for (int imgIdx = 0; imgIdx < n; ++imgIdx)
    {
        const DevMem2D_<T> train = trains[imgIdx];
        m.next();
        loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
    }

    __syncthreads();

    float* s_distance = (float*)(smem);
    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
    int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

    findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdx);

    if (queryIdx < query.rows && threadIdx.x == 0)
    {
        bestTrainIdx[queryIdx] = myBestTrainIdx;
        bestImgIdx[queryIdx] = myBestImgIdx;
        bestDistance[queryIdx] = myBestDistance;
    }
}

template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> 
void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, 
                         const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, 
                         cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(query.rows, BLOCK_SIZE));

    const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////////////////////
// Match Unrolled
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> 
__device__ void loopUnrolled(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask, 
                             typename Dist::value_type* s_query, typename Dist::value_type* s_train, 
                             float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
{
    for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
    {
        Dist dist;

        #pragma unroll
        for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
        {
            const int loadX = threadIdx.x + i * BLOCK_SIZE;

            if (loadX < query.cols)
            {
                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(min(queryIdx, query.rows - 1))[loadX];
                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
            }
            else
            {
                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
            }

            __syncthreads();

            #pragma unroll
            for (int j = 0; j < BLOCK_SIZE; ++j)
                dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

            __syncthreads();
        }

        typename Dist::result_type distVal = dist;

        const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

        if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))
        {
            bestImgIdx = imgIdx;
            bestDistance = distVal;
            bestTrainIdx = trainIdx;
        }
    }
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> 
__global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    float myBestDistance = numeric_limits<float>::max();
    int myBestTrainIdx = -1;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);

    __syncthreads();

    float* s_distance = (float*)(smem);
    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);

    if (queryIdx < query.rows && threadIdx.x == 0)
    {
        bestTrainIdx[queryIdx] = myBestTrainIdx;
        bestDistance[queryIdx] = myBestDistance;
    }
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> 
void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, 
                   const DevMem2Di& trainIdx, const DevMem2Df& distance, 
                   cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(query.rows, BLOCK_SIZE));

    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> 
__global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, 
                              int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    float myBestDistance = numeric_limits<float>::max();
    int myBestTrainIdx = -1;
    int myBestImgIdx = -1;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    Mask m = mask;

    for (int imgIdx = 0; imgIdx < n; ++imgIdx)
    {
        const DevMem2D_<T> train = trains[imgIdx];
        m.next();
        loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
    }

    __syncthreads();

    float* s_distance = (float*)(smem);
    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
    int* s_imgIdxIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

    findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdxIdx);

    if (queryIdx < query.rows && threadIdx.x == 0)
    {
        bestTrainIdx[queryIdx] = myBestTrainIdx;
        bestImgIdx[queryIdx] = myBestImgIdx;
        bestDistance[queryIdx] = myBestDistance;
    }
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> 
void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, 
                   const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, 
                   cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(query.rows, BLOCK_SIZE));

    const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////////////////////
// Match
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> 
__device__ void loop(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask, 
                     typename Dist::value_type* s_query, typename Dist::value_type* s_train, 
                     float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
{
    for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
    {
        Dist dist;

        for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
        {
            const int loadX = threadIdx.x + i * BLOCK_SIZE;

            if (loadX < query.cols)
            {
                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(min(queryIdx, query.rows - 1))[loadX];
                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
            }
            else
            {
                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
            }

            __syncthreads();

            #pragma unroll
            for (int j = 0; j < BLOCK_SIZE; ++j)
                dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

            __syncthreads();
        }

        typename Dist::result_type distVal = dist;

        const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

        if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))
        {
            bestImgIdx = imgIdx;
            bestDistance = distVal;
            bestTrainIdx = trainIdx;
        }
    }
}
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> 
__global__ void match(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    float myBestDistance = numeric_limits<float>::max();
    int myBestTrainIdx = -1;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    loop<BLOCK_SIZE, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);

    __syncthreads();

    float* s_distance = (float*)(smem);
    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);

    if (queryIdx < query.rows && threadIdx.x == 0)
    {
        bestTrainIdx[queryIdx] = myBestTrainIdx;
        bestDistance[queryIdx] = myBestDistance;
    }
}
///////////////////////////////////////////////////////////////////////////////
// Match kernel caller
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> 
void match(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, 
           const DevMem2Di& trainIdx, const DevMem2Df& distance, 
           cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(query.rows, BLOCK_SIZE));

    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> 
__global__ void match(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, 
                      int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
{
    extern __shared__ int smem[];

    const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

    float myBestDistance = numeric_limits<float>::max();
    int myBestTrainIdx = -1;
    int myBestImgIdx = -1;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    Mask m = mask;
    for (int imgIdx = 0; imgIdx < n; ++imgIdx)
    {
        const DevMem2D_<T> train = trains[imgIdx];
        m.next();
        loop<BLOCK_SIZE, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
    }

    __syncthreads();

    float* s_distance = (float*)(smem);
    int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
    int* s_imgIdxIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

    findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdxIdx);

    if (queryIdx < query.rows && threadIdx.x == 0)
    {
        bestTrainIdx[queryIdx] = myBestTrainIdx;
        bestImgIdx[queryIdx] = myBestImgIdx;
        bestDistance[queryIdx] = myBestDistance;
    }
}
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
void match(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(query.rows, BLOCK_SIZE));

    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
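Comparing the deleted and added callers explains much of the commit's claimed 2-3x speedup: the old kernels launched one block per query descriptor (grid(query.rows, 1, 1)) and went through the per-thread VecDiff abstraction, while the new ones pack BLOCK_SIZE queries into each block and share a single shared-memory tile of train data among them. An illustrative restatement of the new index mapping (descriptive helper, not part of the code):

// Illustrative: how the new launch shape maps threads to work.
// blockDim = (BLOCK_SIZE, BLOCK_SIZE); each block serves BLOCK_SIZE query rows,
// threadIdx.y selects the query, threadIdx.x walks candidate train descriptors.
int queryOfThread(int blockIdxX, int threadIdxY, int blockSize)
{
    return blockIdxX * blockSize + threadIdxY;  // 'queryIdx' in the kernels above
}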
///////////////////////////////////////////////////////////////////////////////
// Match dispatcher
template <typename Dist, typename T, typename Mask> 
void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, 
                     const DevMem2Di& trainIdx, const DevMem2Df& distance, 
                     int cc, cudaStream_t stream)
{
    if (query.cols <= 64)
    {
        matchUnrolledCached<16, 64, Dist>(query, train, mask, trainIdx, distance, stream);
    }
    else if (query.cols <= 128)
    {
        matchUnrolledCached<16, 128, Dist>(query, train, mask, trainIdx, distance, stream);
    }
    else if (query.cols <= 256)
    {
        matchUnrolled<16, 256, Dist>(query, train, mask, trainIdx, distance, stream);
    }
    else if (query.cols <= 512)
    {
        matchUnrolled<16, 512, Dist>(query, train, mask, trainIdx, distance, stream);
    }
    else if (query.cols <= 1024)
    {
        matchUnrolled<16, 1024, Dist>(query, train, mask, trainIdx, distance, stream);
    }
    else
    {
        match<16, Dist>(query, train, mask, trainIdx, distance, stream);
    }
}
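The dispatch is now purely by descriptor width; the old compute-capability branches (cc >= 12) are gone, and the cc parameter is still accepted but no longer drives the choice here. A compact restatement of the selection (illustrative; the SURF/SIFT width examples are common conventions, not taken from this file):

// Illustrative summary of the width-based dispatch (not part of the commit).
const char* pickedKernelSketch(int queryCols)
{
    if (queryCols <= 64)   return "matchUnrolledCached<16, 64>";   // e.g. SURF-64
    if (queryCols <= 128)  return "matchUnrolledCached<16, 128>";  // e.g. SIFT-128
    if (queryCols <= 256)  return "matchUnrolled<16, 256>";
    if (queryCols <= 512)  return "matchUnrolled<16, 512>";
    if (queryCols <= 1024) return "matchUnrolled<16, 1024>";
    return "match<16>";  // generic kernel with a runtime-bounded loop
}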
template <typename Dist, typename T, typename Mask> 
void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask, 
                     const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, 
                     int cc, cudaStream_t stream)
{
    if (query.cols <= 64)
    {
        matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
    }
    else if (query.cols <= 128)
    {
        matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
    }
    else if (query.cols <= 256)
    {
        matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
    }
    else if (query.cols <= 512)
    {
        matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
    }
    else if (query.cols <= 1024)
    {
        matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
    }
    else
    {
        match<16, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
    }
}
///////////////////////////////////////////////////////////////////////////////
// Match caller
template <typename T> void matchL1_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask, 
                                       const DevMem2Di& trainIdx, const DevMem2Df& distance, 
                                       int cc, cudaStream_t stream)
{
    if (mask.data)
    {
        matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), SingleMask(mask), 
            trainIdx, distance, 
            cc, stream);
    }
    else
    {
        matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), WithOutMask(), 
            trainIdx, distance, 
            cc, stream);
    }
}
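Branching on mask.data on the host and instantiating the dispatcher with either SingleMask or WithOutMask bakes the mask test into the kernel at compile time: with WithOutMask the mask(queryIdx, trainIdx) call in the inner loop inlines to a constant true and costs nothing. Roughly what such a functor looks like (a sketch; the real definitions live in opencv2/gpu/device/utility.hpp and may differ in detail):

// Sketch of a no-op mask functor in the style of WithOutMask (illustrative).
struct WithOutMaskSketch
{
    __device__ __forceinline__ void next() const {}  // per-image advance: nothing to do
    __device__ __forceinline__ bool operator()(int /*queryIdx*/, int /*trainIdx*/) const
    {
        return true;  // every (query, train) pair participates
    }
};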
template void matchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template <typename T> void matchL2_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask, 
                                       const DevMem2Di& trainIdx, const DevMem2Df& distance, 
                                       int cc, cudaStream_t stream)
{
    if (mask.data)
    {
        matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), SingleMask(mask), 
            trainIdx, distance, 
            cc, stream);
    }
    else
    {
        matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), WithOutMask(), 
            trainIdx, distance, 
            cc, stream);
    }
}
//template void matchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template <typename T> void matchHamming_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask, 
                                            const DevMem2Di& trainIdx, const DevMem2Df& distance, 
                                            int cc, cudaStream_t stream)
{
    if (mask.data)
    {
        matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), SingleMask(mask), 
            trainIdx, distance, 
            cc, stream);
    }
    else
    {
        matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), WithOutMask(), 
            trainIdx, distance, 
            cc, stream);
    }
}
template void matchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchHamming_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
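Note the instantiation pattern: matchHamming_gpu exists only for integer element types (uchar, ushort, int), since Hamming distance is a population count over XOR-ed bits and has no meaning for float descriptors. The accumulation step inside HammingDist's reduceIter boils down to something like the following (a sketch for int elements; the real functor lives in the device vec_distance header and may differ in detail):

// Sketch of the per-element Hamming accumulation for int descriptors (illustrative).
__device__ __forceinline__ void hammingReduceIterSketch(int q, int t, int& acc)
{
    acc += __popc(q ^ t);  // count of differing bits in this 32-bit chunk
}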
template <typename T> void matchL1_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, 
                                       const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, 
                                       int cc, cudaStream_t stream)
{
    if (masks.data)
    {
        matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), 
            trainIdx, imgIdx, distance, 
            cc, stream);
    }
    else
    {
        matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), 
            trainIdx, imgIdx, distance, 
            cc, stream);
    }
}
template void matchL1_gpu<uchar >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchL1_gpu<schar >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchL1_gpu<ushort>(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchL1_gpu<short >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchL1_gpu<int   >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchL1_gpu<float >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template <typename T> void matchL2_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, 
                                       const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, 
                                       int cc, cudaStream_t stream)
{
    if (masks.data)
    {
        matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), 
            trainIdx, imgIdx, distance, 
            cc, stream);
    }
    else
    {
        matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), 
            trainIdx, imgIdx, distance, 
            cc, stream);
    }
}
//template void matchL2_gpu<uchar >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchL2_gpu<schar >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchL2_gpu<short >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchL2_gpu<int   >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchL2_gpu<float >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
    template <typename T> void matchHamming_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks,
        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
        int cc, cudaStream_t stream)
    {
        if (masks.data)
        {
            matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
                trainIdx, imgIdx, distance,
                cc, stream);
        }
        else
        {
            matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(),
                trainIdx, imgIdx, distance,
                cc, stream);
        }
    }
    template void matchHamming_gpu<uchar >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
    //template void matchHamming_gpu<schar >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
    template void matchHamming_gpu<ushort>(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
    //template void matchHamming_gpu<short >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
    template void matchHamming_gpu<int   >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
}}}
modules/gpu/src/cuda/bf_radius_match.cu
@@ -49,466 +49,410 @@ using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace bf_radius_match
{
    ///////////////////////////////////////////////////////////////////////////////
    // Match Unrolled

    template <int BLOCK_SIZE, int MAX_DESC_LEN, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
    __global__ void matchUnrolled(const DevMem2D_<T> query, int imgIdx, const DevMem2D_<T> train, float maxDistance, const Mask mask,
        PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
    {
        #if __CUDA_ARCH__ >= 110

        extern __shared__ int smem[];

        const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
        const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;

        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

        Dist dist;

        #pragma unroll
        for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
        {
            const int loadX = threadIdx.x + i * BLOCK_SIZE;

            if (loadX < query.cols)
            {
                // row indices are clamped so out-of-range threads still issue valid
                // loads and can participate in the barriers below
                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(min(queryIdx, query.rows - 1))[loadX];
                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
            }
            else
            {
                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
            }

            __syncthreads();

            #pragma unroll
            for (int j = 0; j < BLOCK_SIZE; ++j)
                dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

            __syncthreads();
        }

        float distVal = (typename Dist::result_type)dist;

        if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)
        {
            unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1);
            if (ind < maxCount)
            {
                bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;
                if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx;
                bestDistance.ptr(queryIdx)[ind] = distVal;
            }
        }

        #endif
    }
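The kernel above trades the old one-block-per-query layout for a classic tiled scheme: each 16x16 block computes all 256 query/train distance pairs of a tile, staging BLOCK_SIZE columns of both descriptor matrices in shared memory per iteration. Below is a minimal standalone sketch of the same tiling for squared-L2 distance over plain float descriptors; all names are illustrative, and the mask, the atomic result compaction, and the OpenCV device wrappers are left out:

#include <cuda_runtime.h>

#define BLOCK 16

// Each 16x16 thread block computes the distances between a 16-row tile of
// query descriptors and a 16-row tile of train descriptors.
__global__ void tiledL2(const float* query, const float* train, float* dist,
                        int nQuery, int nTrain, int len)
{
    __shared__ float sq[BLOCK][BLOCK]; // sq[row][dim]: query tile
    __shared__ float st[BLOCK][BLOCK]; // st[dim][row]: train tile, transposed on load

    const int q = blockIdx.y * BLOCK + threadIdx.y;
    const int t = blockIdx.x * BLOCK + threadIdx.x;

    float acc = 0.f;
    for (int i = 0; i < (len + BLOCK - 1) / BLOCK; ++i)
    {
        const int x    = i * BLOCK + threadIdx.x;
        const int tRow = blockIdx.x * BLOCK + threadIdx.y;
        // clamp rows so out-of-range threads still load valid memory and can
        // reach the barriers; pad missing columns with zeros (no effect on L2)
        sq[threadIdx.y][threadIdx.x] = (x < len) ? query[min(q, nQuery - 1) * len + x] : 0.f;
        st[threadIdx.x][threadIdx.y] = (x < len) ? train[min(tRow, nTrain - 1) * len + x] : 0.f;
        __syncthreads();

        for (int j = 0; j < BLOCK; ++j)
        {
            const float d = sq[threadIdx.y][j] - st[j][threadIdx.x];
            acc += d * d;
        }
        __syncthreads();
    }

    if (q < nQuery && t < nTrain)
        dist[q * nTrain + t] = acc; // squared L2 distance
}

The transposed store of the train tile makes the inner-loop read st[j][threadIdx.x] stride across shared-memory banks while the query read is a broadcast, which is the usual reason for this layout.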
    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
    void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
        const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, cudaStream_t stream)
    {
        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
        const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));

        const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

        matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask,
            trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T>
    void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2D* masks,
        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
        cudaStream_t stream)
    {
        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);

        const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

        for (int i = 0; i < n; ++i)
        {
            const DevMem2D_<T> train = trains[i];

            const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));

            if (masks != 0 && masks[i].data)
            {
                matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]),
                    trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
            }
            else
            {
                matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(),
                    trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
            }
            cudaSafeCall( cudaGetLastError() );
        }

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
    ///////////////////////////////////////////////////////////////////////////////
    // Match

    template <int BLOCK_SIZE, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
    __global__ void match(const DevMem2D_<T> query, int imgIdx, const DevMem2D_<T> train, float maxDistance, const Mask mask,
        PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
    {
        #if __CUDA_ARCH__ >= 110

        extern __shared__ int smem[];

        const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
        const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;

        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

        Dist dist;

        // same tiling as matchUnrolled, but the loop bound is computed at run time
        for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
        {
            const int loadX = threadIdx.x + i * BLOCK_SIZE;

            if (loadX < query.cols)
            {
                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(min(queryIdx, query.rows - 1))[loadX];
                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
            }
            else
            {
                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
            }

            __syncthreads();

            #pragma unroll
            for (int j = 0; j < BLOCK_SIZE; ++j)
                dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

            __syncthreads();
        }

        float distVal = (typename Dist::result_type)dist;

        if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)
        {
            unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1);
            if (ind < maxCount)
            {
                bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;
                if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx;
                bestDistance.ptr(queryIdx)[ind] = distVal;
            }
        }

        #endif
    }
    template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
    void match(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
        const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
        cudaStream_t stream)
    {
        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
        const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));

        const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

        match<BLOCK_SIZE, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask,
            trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
    template <int BLOCK_SIZE, typename Dist, typename T>
    void match(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2D* masks,
        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
        cudaStream_t stream)
    {
        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);

        const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

        for (int i = 0; i < n; ++i)
        {
            const DevMem2D_<T> train = trains[i];

            const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));

            if (masks != 0 && masks[i].data)
            {
                match<BLOCK_SIZE, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]),
                    trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
            }
            else
            {
                match<BLOCK_SIZE, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(),
                    trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
            }
            cudaSafeCall( cudaGetLastError() );
        }

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
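Both host callers size the dynamic shared-memory allocation identically; a small runnable sketch of that arithmetic follows (the function name is invented, and using sizeof(int) as the element size rests on the assumption that every instantiated Dist::value_type is at most 4 bytes, which holds for the uchar through float specializations in this file):

#include <cassert>
#include <cstddef>

// two BLOCK_SIZE x BLOCK_SIZE staging tiles: one for query rows, one for train rows
std::size_t smemSize(int blockSize)
{
    return 2 * blockSize * blockSize * sizeof(int);
}

int main()
{
    assert(smemSize(16) == 2048); // a 16x16 block stages 2 KB of shared memory
    return 0;
}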
    ///////////////////////////////////////////////////////////////////////////////
    // Match dispatcher

    template <typename Dist, typename T, typename Mask>
    void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
        const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
        int cc, cudaStream_t stream)
    {
        if (query.cols <= 64)
        {
            matchUnrolled<16, 64, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
        }
        else if (query.cols <= 128)
        {
            matchUnrolled<16, 128, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
        }
        else if (query.cols <= 256)
        {
            matchUnrolled<16, 256, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
        }
        else if (query.cols <= 512)
        {
            matchUnrolled<16, 512, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
        }
        else if (query.cols <= 1024)
        {
            matchUnrolled<16, 1024, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
        }
        else
        {
            match<16, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
        }
    }
    template <typename Dist, typename T>
    void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2D* masks,
        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
        int cc, cudaStream_t stream)
    {
        if (query.cols <= 64)
        {
            matchUnrolled<16, 64, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
        }
        else if (query.cols <= 128)
        {
            matchUnrolled<16, 128, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
        }
        else if (query.cols <= 256)
        {
            matchUnrolled<16, 256, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
        }
        else if (query.cols <= 512)
        {
            matchUnrolled<16, 512, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
        }
        else if (query.cols <= 1024)
        {
            matchUnrolled<16, 1024, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
        }
        else
        {
            match<16, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
        }
    }
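The dispatchers round the descriptor length up to the next supported MAX_DESC_LEN so that the unrolled kernel's outer loop gets the compile-time trip count MAX_DESC_LEN / BLOCK_SIZE; padded columns load as zero and add nothing to an L1, L2, or Hamming accumulator, so over-provisioning is harmless, and anything past 1024 columns falls back to the runtime-bounded match kernel. A runnable sketch of the dispatch shape, with invented stand-ins for the launchers:

#include <cstdio>

template <int MAX_LEN> void runUnrolled(int len) { std::printf("unrolled to %4d (len = %d)\n", MAX_LEN, len); }
void runGeneric(int len)                         { std::printf("runtime loop   (len = %d)\n", len); }

void dispatchByLength(int len)
{
    if (len <= 64)        runUnrolled<64>(len);
    else if (len <= 128)  runUnrolled<128>(len);
    else if (len <= 256)  runUnrolled<256>(len);
    else if (len <= 512)  runUnrolled<512>(len);
    else if (len <= 1024) runUnrolled<1024>(len);
    else                  runGeneric(len);
}

int main()
{
    dispatchByLength(64);    // SURF-64 style descriptors
    dispatchByLength(128);   // SIFT/SURF-128
    dispatchByLength(4096);  // falls through to the generic kernel
    return 0;
}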
    ///////////////////////////////////////////////////////////////////////////////
    // Radius Match caller

    template <typename T> void matchL1_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
        const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
        int cc, cudaStream_t stream)
    {
        if (mask.data)
        {
            matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
                trainIdx, distance, nMatches,
                cc, stream);
        }
        else
        {
            matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
                trainIdx, distance, nMatches,
                cc, stream);
        }
    }
    template void matchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
    //template void matchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
    template void matchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
    template void matchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
    template void matchL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
    template void matchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
    template <typename T> void matchL2_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
        const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
        int cc, cudaStream_t stream)
    {
        if (mask.data)
        {
            matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
                trainIdx, distance, nMatches,
                cc, stream);
        }
        else
        {
            matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
                trainIdx, distance, nMatches,
                cc, stream);
        }
    }
    //template void matchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
    //template void matchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
    //template void matchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
    //template void matchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
    //template void matchL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
    template void matchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
    template <typename T> void matchHamming_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
        const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
        int cc, cudaStream_t stream)
    {
        if (mask.data)
        {
            matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
                trainIdx, distance, nMatches,
                cc, stream);
        }
        else
        {
            matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
                trainIdx, distance, nMatches,
                cc, stream);
        }
    }
    template void matchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
    //template void matchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
    template void matchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
    //template void matchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
    template void matchHamming_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
    template <typename T> void matchL1_gpu(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks,
        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
        int cc, cudaStream_t stream)
    {
        matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks,
            trainIdx, imgIdx, distance, nMatches,
            cc, stream);
    }
    template void matchL1_gpu<uchar >(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
    //template void matchL1_gpu<schar >(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
    template void matchL1_gpu<ushort>(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
    template void matchL1_gpu<short >(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
    template void matchL1_gpu<int   >(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
    template void matchL1_gpu<float >(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
    template <typename T> void matchL2_gpu(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks,
        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
        int cc, cudaStream_t stream)
    {
        matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks,
            trainIdx, imgIdx, distance, nMatches,
            cc, stream);
    }
    //template void matchL2_gpu<uchar >(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
    //template void matchL2_gpu<schar >(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
    //template void matchL2_gpu<ushort>(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
    //template void matchL2_gpu<short >(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
    //template void matchL2_gpu<int   >(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
    template void matchL2_gpu<float >(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
    template <typename T> void matchHamming_gpu(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks,
        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
        int cc, cudaStream_t stream)
    {
        matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks,
            trainIdx, imgIdx, distance, nMatches,
            cc, stream);
    }
    template void matchHamming_gpu<uchar >(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
    //template void matchHamming_gpu<schar >(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
    template void matchHamming_gpu<ushort>(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
    //template void matchHamming_gpu<short >(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
    template void matchHamming_gpu<int   >(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
}}}
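A detail worth noting in the kernels above: atomicInc keeps counting candidates for a query even after its output row is full, while the `ind < maxCount` guard suppresses the stores. Any consumer of these buffers therefore has to clamp the raw counter to the row capacity; a hedged host-side sketch with invented names:

#include <algorithm>
#include <cstddef>
#include <vector>

// RadiusRow models one query's slice of the result buffers; rawCount is the
// value the kernel left in nMatches and may exceed the row capacity.
struct RadiusRow
{
    std::vector<int>   trainIdx;  // one row of the trainIdx buffer
    std::vector<float> distance;  // matching row of the distance buffer
    unsigned int       rawCount;  // nMatches entry written by atomicInc

    std::size_t validMatches() const
    {
        return std::min<std::size_t>(rawCount, trainIdx.size());
    }
};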
modules/gpu/src/opencv2/gpu/device/detail/utility_detail.hpp
@@ -47,6 +47,9 @@ namespace cv { namespace gpu { namespace device
{
    namespace detail
    {
        ///////////////////////////////////////////////////////////////////////////////
        // Reduction

        template <int n> struct WarpReductor
        {
            template <typename T, typename Op> static __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
@@ -209,6 +212,8 @@ namespace cv { namespace gpu { namespace device
            }
        };

        ///////////////////////////////////////////////////////////////////////////////
        // PredValWarpReductor

        template <int n> struct PredValWarpReductor;
        template <> struct PredValWarpReductor<64>
@@ -501,6 +506,335 @@ namespace cv { namespace gpu { namespace device
                }
            }
        };

        ///////////////////////////////////////////////////////////////////////////////
        // PredVal2WarpReductor

        template <int n> struct PredVal2WarpReductor;
        template <> struct PredVal2WarpReductor<64>
        {
            template <typename T, typename V1, typename V2, typename Pred>
            static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
            {
                if (tid < 32)
                {
                    myData = sdata[tid];
                    myVal1 = sval1[tid];
                    myVal2 = sval2[tid];

                    T reg = sdata[tid + 32];
                    if (pred(reg, myData)) { sdata[tid] = myData = reg; sval1[tid] = myVal1 = sval1[tid + 32]; sval2[tid] = myVal2 = sval2[tid + 32]; }

                    reg = sdata[tid + 16];
                    if (pred(reg, myData)) { sdata[tid] = myData = reg; sval1[tid] = myVal1 = sval1[tid + 16]; sval2[tid] = myVal2 = sval2[tid + 16]; }

                    reg = sdata[tid + 8];
                    if (pred(reg, myData)) { sdata[tid] = myData = reg; sval1[tid] = myVal1 = sval1[tid + 8]; sval2[tid] = myVal2 = sval2[tid + 8]; }

                    reg = sdata[tid + 4];
                    if (pred(reg, myData)) { sdata[tid] = myData = reg; sval1[tid] = myVal1 = sval1[tid + 4]; sval2[tid] = myVal2 = sval2[tid + 4]; }

                    reg = sdata[tid + 2];
                    if (pred(reg, myData)) { sdata[tid] = myData = reg; sval1[tid] = myVal1 = sval1[tid + 2]; sval2[tid] = myVal2 = sval2[tid + 2]; }

                    reg = sdata[tid + 1];
                    if (pred(reg, myData)) { sdata[tid] = myData = reg; sval1[tid] = myVal1 = sval1[tid + 1]; sval2[tid] = myVal2 = sval2[tid + 1]; }
                }
            }
        };
        template <> struct PredVal2WarpReductor<32>
        {
            template <typename T, typename V1, typename V2, typename Pred>
            static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
            {
                if (tid < 16)
                {
                    myData = sdata[tid];
                    myVal1 = sval1[tid];
                    myVal2 = sval2[tid];

                    T reg = sdata[tid + 16];
                    if (pred(reg, myData)) { sdata[tid] = myData = reg; sval1[tid] = myVal1 = sval1[tid + 16]; sval2[tid] = myVal2 = sval2[tid + 16]; }

                    reg = sdata[tid + 8];
                    if (pred(reg, myData)) { sdata[tid] = myData = reg; sval1[tid] = myVal1 = sval1[tid + 8]; sval2[tid] = myVal2 = sval2[tid + 8]; }

                    reg = sdata[tid + 4];
                    if (pred(reg, myData)) { sdata[tid] = myData = reg; sval1[tid] = myVal1 = sval1[tid + 4]; sval2[tid] = myVal2 = sval2[tid + 4]; }

                    reg = sdata[tid + 2];
                    if (pred(reg, myData)) { sdata[tid] = myData = reg; sval1[tid] = myVal1 = sval1[tid + 2]; sval2[tid] = myVal2 = sval2[tid + 2]; }

                    reg = sdata[tid + 1];
                    if (pred(reg, myData)) { sdata[tid] = myData = reg; sval1[tid] = myVal1 = sval1[tid + 1]; sval2[tid] = myVal2 = sval2[tid + 1]; }
                }
            }
        };
        template <> struct PredVal2WarpReductor<16>
        {
            template <typename T, typename V1, typename V2, typename Pred>
            static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
            {
                if (tid < 8)
                {
                    myData = sdata[tid];
                    myVal1 = sval1[tid];
                    myVal2 = sval2[tid];

                    T reg = sdata[tid + 8];
                    if (pred(reg, myData)) { sdata[tid] = myData = reg; sval1[tid] = myVal1 = sval1[tid + 8]; sval2[tid] = myVal2 = sval2[tid + 8]; }

                    reg = sdata[tid + 4];
                    if (pred(reg, myData)) { sdata[tid] = myData = reg; sval1[tid] = myVal1 = sval1[tid + 4]; sval2[tid] = myVal2 = sval2[tid + 4]; }

                    reg = sdata[tid + 2];
                    if (pred(reg, myData)) { sdata[tid] = myData = reg; sval1[tid] = myVal1 = sval1[tid + 2]; sval2[tid] = myVal2 = sval2[tid + 2]; }

                    reg = sdata[tid + 1];
                    if (pred(reg, myData)) { sdata[tid] = myData = reg; sval1[tid] = myVal1 = sval1[tid + 1]; sval2[tid] = myVal2 = sval2[tid + 1]; }
                }
            }
        };
        template <> struct PredVal2WarpReductor<8>
        {
            template <typename T, typename V1, typename V2, typename Pred>
            static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
            {
                if (tid < 4)
                {
                    myData = sdata[tid];
                    myVal1 = sval1[tid];
                    myVal2 = sval2[tid];

                    T reg = sdata[tid + 4];
                    if (pred(reg, myData)) { sdata[tid] = myData = reg; sval1[tid] = myVal1 = sval1[tid + 4]; sval2[tid] = myVal2 = sval2[tid + 4]; }

                    reg = sdata[tid + 2];
                    if (pred(reg, myData)) { sdata[tid] = myData = reg; sval1[tid] = myVal1 = sval1[tid + 2]; sval2[tid] = myVal2 = sval2[tid + 2]; }

                    reg = sdata[tid + 1];
                    if (pred(reg, myData)) { sdata[tid] = myData = reg; sval1[tid] = myVal1 = sval1[tid + 1]; sval2[tid] = myVal2 = sval2[tid + 1]; }
                }
            }
        };

        template <bool warp> struct PredVal2ReductionDispatcher;
        template <> struct PredVal2ReductionDispatcher<true>
        {
            template <int n, typename T, typename V1, typename V2, typename Pred>
            static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
            {
                PredVal2WarpReductor<n>::reduce(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);
            }
        };
        template <> struct PredVal2ReductionDispatcher<false>
        {
            template <int n, typename T, typename V1, typename V2, typename Pred>
            static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
            {
                myData = sdata[tid];
                myVal1 = sval1[tid];
                myVal2 = sval2[tid];

                if (n >= 512 && tid < 256)
                {
                    T reg = sdata[tid + 256];
                    if (pred(reg, myData)) { sdata[tid] = myData = reg; sval1[tid] = myVal1 = sval1[tid + 256]; sval2[tid] = myVal2 = sval2[tid + 256]; }
                    __syncthreads();
                }
                if (n >= 256 && tid < 128)
                {
                    T reg = sdata[tid + 128];
                    if (pred(reg, myData)) { sdata[tid] = myData = reg; sval1[tid] = myVal1 = sval1[tid + 128]; sval2[tid] = myVal2 = sval2[tid + 128]; }
                    __syncthreads();
                }
                if (n >= 128 && tid < 64)
                {
                    T reg = sdata[tid + 64];
                    if (pred(reg, myData)) { sdata[tid] = myData = reg; sval1[tid] = myVal1 = sval1[tid + 64]; sval2[tid] = myVal2 = sval2[tid + 64]; }
                    __syncthreads();
                }

                if (tid < 32)
                {
                    if (n >= 64)
                    {
                        T reg = sdata[tid + 32];
                        if (pred(reg, myData)) { sdata[tid] = myData = reg; sval1[tid] = myVal1 = sval1[tid + 32]; sval2[tid] = myVal2 = sval2[tid + 32]; }
                    }
                    if (n >= 32)
                    {
                        T reg = sdata[tid + 16];
                        if (pred(reg, myData)) { sdata[tid] = myData = reg; sval1[tid] = myVal1 = sval1[tid + 16]; sval2[tid] = myVal2 = sval2[tid + 16]; }
                    }
                    if (n >= 16)
                    {
                        T reg = sdata[tid + 8];
                        if (pred(reg, myData)) { sdata[tid] = myData = reg; sval1[tid] = myVal1 = sval1[tid + 8]; sval2[tid] = myVal2 = sval2[tid + 8]; }
                    }
                    if (n >= 8)
                    {
                        T reg = sdata[tid + 4];
                        if (pred(reg, myData)) { sdata[tid] = myData = reg; sval1[tid] = myVal1 = sval1[tid + 4]; sval2[tid] = myVal2 = sval2[tid + 4]; }
                    }
                    if (n >= 4)
                    {
                        T reg = sdata[tid + 2];
                        if (pred(reg, myData)) { sdata[tid] = myData = reg; sval1[tid] = myVal1 = sval1[tid + 2]; sval2[tid] = myVal2 = sval2[tid + 2]; }
                    }
                    if (n >= 2)
                    {
                        T reg = sdata[tid + 1];
                        if (pred(reg, myData)) { sdata[tid] = myData = reg; sval1[tid] = myVal1 = sval1[tid + 1]; sval2[tid] = myVal2 = sval2[tid + 1]; }
                    }
                }
            }
        };
}
}}}
...
...
modules/gpu/src/opencv2/gpu/device/utility.hpp
View file @
d3c4e907
...
...
@@ -121,7 +121,6 @@ namespace cv { namespace gpu { namespace device
     ///////////////////////////////////////////////////////////////////////////////
     // Reduction
 
-    // reduction
     template <int n, typename T, typename Op> __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
     {
         StaticAssert<n >= 8 && n <= 512>::check();
...
...
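For context, the generic reduce() touched here folds each thread's partial value into shared memory under an arbitrary binary op. A hedged usage sketch (the Plus functor, the 256-thread block, and the buffer layout are my assumptions, not part of the commit):

    #include <opencv2/gpu/device/utility.hpp>  // internal OpenCV header

    // Illustrative binary op; any commutative functor with a __device__
    // operator() works.
    struct Plus
    {
        __device__ float operator()(float a, float b) const { return a + b; }
    };

    // Each 256-thread block sums a contiguous slice of 'in' into out[blockIdx.x].
    __global__ void blockSum(const float* in, float* out)
    {
        __shared__ float smem[256];

        float partial = in[blockIdx.x * 256 + threadIdx.x];
        smem[threadIdx.x] = partial;
        __syncthreads();

        cv::gpu::device::reduce<256>(smem, partial, threadIdx.x, Plus());

        if (threadIdx.x == 0)
            out[blockIdx.x] = smem[0];  // block total ends up in element 0
    }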
@@ -134,6 +133,13 @@ namespace cv { namespace gpu { namespace device
         StaticAssert<n >= 8 && n <= 512>::check();
         detail::PredValReductionDispatcher<n <= 64>::reduce<n>(myData, myVal, sdata, sval, tid, pred);
     }
 
+    template <int n, typename T, typename V1, typename V2, typename Pred>
+    __device__ __forceinline__ void reducePredVal2(volatile T* sdata, T& myData, V1* sval1, V1& myVal1, V2* sval2, V2& myVal2, int tid, const Pred& pred)
+    {
+        StaticAssert<n >= 8 && n <= 512>::check();
+        detail::PredVal2ReductionDispatcher<n <= 64>::reduce<n>(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);
+    }
+
     ///////////////////////////////////////////////////////////////////////////////
     // Solve linear system
...
...
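The new reducePredVal2 entry point carries two payload arrays alongside the reduced key, which is what a kNN matcher needs: a distance plus the train/image indices it belongs to. A sketch of a call site under assumptions of mine (the Less functor, the 256-thread block, and the flat input layout are illustrative, not from the commit):

    #include <cfloat>
    #include <opencv2/gpu/device/utility.hpp>  // internal OpenCV header

    struct Less
    {
        __device__ bool operator()(float a, float b) const { return a < b; }
    };

    // Per block: find the smallest distance in dist[0..n) together with the
    // two indices stored alongside it.
    __global__ void bestMatch(const float* dist, const int* trainIdx, const int* imgIdx, int n,
                              float* outDist, int* outTrain, int* outImg)
    {
        const int BLOCK_SIZE = 256;
        __shared__ float sdist[BLOCK_SIZE];
        __shared__ int strain[BLOCK_SIZE];
        __shared__ int simg[BLOCK_SIZE];

        const int tid = threadIdx.x;

        // Strided scan: each thread keeps its local best candidate.
        float myDist = FLT_MAX;
        int myTrain = -1, myImg = -1;
        for (int i = tid; i < n; i += BLOCK_SIZE)
        {
            if (dist[i] < myDist)
            {
                myDist = dist[i];
                myTrain = trainIdx[i];
                myImg = imgIdx[i];
            }
        }

        sdist[tid] = myDist;
        strain[tid] = myTrain;
        simg[tid] = myImg;
        __syncthreads();

        // pred(a, b) == true promotes a over b, so Less yields an argmin that
        // drags both payload values along with the winning distance.
        cv::gpu::device::reducePredVal2<BLOCK_SIZE>(sdist, myDist, strain, myTrain, simg, myImg, tid, Less());

        if (tid == 0)
        {
            *outDist = myDist;
            *outTrain = myTrain;
            *outImg = myImg;
        }
    }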
modules/stitching/src/matchers.cpp
View file @
d3c4e907
...
...
@@ -198,7 +198,7 @@ void GpuMatcher::match(const ImageFeatures &features1, const ImageFeatures &feat
     // Find 1->2 matches
     pair_matches.clear();
-    matcher.knnMatch(descriptors1_, descriptors2_, train_idx_, distance_, all_dist_, 2);
+    matcher.knnMatchSingle(descriptors1_, descriptors2_, train_idx_, distance_, all_dist_, 2);
     matcher.knnMatchDownload(train_idx_, distance_, pair_matches);
     for (size_t i = 0; i < pair_matches.size(); ++i)
     {
...
...
@@ -215,7 +215,7 @@ void GpuMatcher::match(const ImageFeatures &features1, const ImageFeatures &feat
     // Find 2->1 matches
     pair_matches.clear();
-    matcher.knnMatch(descriptors2_, descriptors1_, train_idx_, distance_, all_dist_, 2);
+    matcher.knnMatchSingle(descriptors2_, descriptors1_, train_idx_, distance_, all_dist_, 2);
     matcher.knnMatchDownload(train_idx_, distance_, pair_matches);
     for (size_t i = 0; i < pair_matches.size(); ++i)
     {
...
...
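The stitcher change is a rename: the GpuMat overload of knnMatch() becomes knnMatchSingle(), with knnMatchDownload() still pulling only the compact result matrices back to the host. A host-side sketch of the full pattern (the helper name and the 0.6f ratio threshold are mine; the matcher calls are the ones used above):

    #include <vector>
    #include <opencv2/gpu/gpu.hpp>

    using namespace cv;

    // Hypothetical helper: 2-NN match on the GPU followed by Lowe's ratio test.
    // d_query/d_train are descriptor matrices already uploaded to the device.
    void ratioMatchGPU(const gpu::GpuMat& d_query, const gpu::GpuMat& d_train,
                       std::vector<DMatch>& good, float ratio = 0.6f)
    {
        gpu::BruteForceMatcher_GPU_base matcher(gpu::BruteForceMatcher_GPU_base::L2Dist);

        gpu::GpuMat trainIdx, distance, allDist;
        matcher.knnMatchSingle(d_query, d_train, trainIdx, distance, allDist, 2);

        // Only the two compact result matrices cross the PCIe bus.
        std::vector< std::vector<DMatch> > knn;
        matcher.knnMatchDownload(trainIdx, distance, knn);

        for (size_t i = 0; i < knn.size(); ++i)
            if (knn[i].size() == 2 && knn[i][0].distance < ratio * knn[i][1].distance)
                good.push_back(knn[i][0]);
    }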
samples/gpu/performance/tests.cpp
View file @
d3c4e907
...
...
@@ -413,38 +413,55 @@ TEST(BruteForceMatcher)
     // Output
     vector< vector<DMatch> > matches(2);
-    vector< vector<DMatch> > d_matches(2);
+    gpu::GpuMat d_trainIdx, d_distance, d_allDist, d_nMatches;
 
     SUBTEST << "match";
 
     matcher.match(query, train, matches[0]);
     CPU_ON;
     matcher.match(query, train, matches[0]);
     CPU_OFF;
 
+    d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
     GPU_ON;
-    d_matcher.match(d_query, d_train, d_matches[0]);
+    d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
     GPU_OFF;
 
-    SUBTEST << "knnMatch";
-    int knn = 2;
+    SUBTEST << "knnMatch, 2";
 
+    matcher.knnMatch(query, train, matches, 2);
     CPU_ON;
-    matcher.knnMatch(query, train, matches, knn);
+    matcher.knnMatch(query, train, matches, 2);
     CPU_OFF;
 
+    d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 2);
     GPU_ON;
-    d_matcher.knnMatch(d_query, d_train, d_matches, knn);
+    d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 2);
     GPU_OFF;
 
+    SUBTEST << "knnMatch, 3";
+
+    matcher.knnMatch(query, train, matches, 3);
+    CPU_ON;
+    matcher.knnMatch(query, train, matches, 3);
+    CPU_OFF;
+
+    d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 3);
+    GPU_ON;
+    d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 3);
+    GPU_OFF;
+
     SUBTEST << "radiusMatch";
     float max_distance = 2.0f;
 
     matcher.radiusMatch(query, train, matches, max_distance);
     CPU_ON;
     matcher.radiusMatch(query, train, matches, max_distance);
     CPU_OFF;
 
+    d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, max_distance);
     GPU_ON;
-    d_matcher.radiusMatch(d_query, d_train, d_matches, max_distance);
+    d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, max_distance);
     GPU_OFF;
 }
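Note the pattern in this test: each measured call is issued once before GPU_ON so that one-time costs (module load, buffer allocation) stay outside the timed region. GPU_ON/GPU_OFF belong to this sample's harness; a rough stand-in using CUDA events (entirely my sketch, not the harness code):

    #include <cstdio>
    #include <cuda_runtime.h>

    // Minimal event-based timer; cudaEventElapsedTime reports milliseconds.
    struct GpuTimer
    {
        cudaEvent_t start, stop;

        GpuTimer()  { cudaEventCreate(&start); cudaEventCreate(&stop); }
        ~GpuTimer() { cudaEventDestroy(start); cudaEventDestroy(stop); }

        void begin() { cudaEventRecord(start, 0); }

        float end()
        {
            cudaEventRecord(stop, 0);
            cudaEventSynchronize(stop);   // wait for the enclosed work to finish
            float ms = 0.f;
            cudaEventElapsedTime(&ms, start, stop);
            return ms;
        }
    };

    // Usage mirroring the test above:
    //   d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance); // warm-up
    //   GpuTimer t; t.begin();
    //   d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
    //   printf("match: %.2f ms\n", t.end());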
...
...