opencv — commit d3c4e907, authored Oct 12, 2011 by Vladislav Vinogradov
new optimized implementation of BruteForceMatcher_GPU (~2-3x faster)
parent 89be84a3
Showing 10 changed files with 2562 additions and 1066 deletions (+2562, -1066):
    modules/gpu/include/opencv2/gpu/gpu.hpp                        (+45,  -43)
    modules/gpu/perf/perf_features2d.cpp                           (+7,    -7)
    modules/gpu/src/brute_force_matcher.cpp                        (+384, -231)
    modules/gpu/src/cuda/bf_knnmatch.cu                            (+902, -222)
    modules/gpu/src/cuda/bf_match.cu                               (+583, -223)
    modules/gpu/src/cuda/bf_radius_match.cu                        (+274, -330)
    modules/gpu/src/opencv2/gpu/device/detail/utility_detail.hpp   (+334,   -0)
    modules/gpu/src/opencv2/gpu/device/utility.hpp                 (+7,    -1)
    modules/stitching/src/matchers.cpp                             (+2,    -2)
    samples/gpu/performance/tests.cpp                              (+24,   -7)
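For orientation before the diffs, a minimal sketch of how the matcher is driven after this commit. The names follow the new gpu.hpp signatures below; the descriptor matrices here are illustrative stand-ins, not part of this change:

    #include <opencv2/features2d/features2d.hpp>
    #include <opencv2/gpu/gpu.hpp>

    using namespace cv;
    using namespace cv::gpu;

    int main()
    {
        // Hypothetical inputs: descriptor matrices already computed on the CPU.
        Mat queryCPU = Mat::ones(1000, 64, CV_32F);   // nQuery x descriptor size
        Mat trainCPU = Mat::ones(5000, 64, CV_32F);   // nTrain x descriptor size

        // Upload descriptors to the device.
        GpuMat query(queryCPU), train(trainCPU);

        // L2 brute-force matcher, as exercised by the perf tests in this commit.
        BruteForceMatcher_GPU< L2<float> > matcher;

        // One best match per query descriptor; the GPU stage, the download
        // and the DMatch conversion all happen inside match().
        std::vector<DMatch> matches;
        matcher.match(query, train, matches);

        return 0;
    }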
modules/gpu/include/opencv2/gpu/gpu.hpp
@@ -1221,26 +1221,24 @@ namespace cv
     explicit BruteForceMatcher_GPU_base(DistType distType = L2Dist);

-    // Add descriptors to train descriptor collection.
+    // Add descriptors to train descriptor collection
     void add(const std::vector<GpuMat>& descCollection);

-    // Get train descriptors collection.
+    // Get train descriptors collection
     const std::vector<GpuMat>& getTrainDescriptors() const;

-    // Clear train descriptors collection.
+    // Clear train descriptors collection
     void clear();

-    // Return true if there are not train descriptors in collection.
+    // Return true if there are not train descriptors in collection
     bool empty() const;

-    // Return true if the matcher supports mask in match methods.
+    // Return true if the matcher supports mask in match methods
     bool isMaskSupported() const;

-    // Find one best match for each query descriptor.
-    // trainIdx.at<int>(0, queryIdx) will contain best train index for queryIdx
-    // distance.at<float>(0, queryIdx) will contain distance
-    void matchSingle(const GpuMat& queryDescs, const GpuMat& trainDescs,
+    // Find one best match for each query descriptor
+    void matchSingle(const GpuMat& query, const GpuMat& train,
         GpuMat& trainIdx, GpuMat& distance,
         const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());

     // Download trainIdx and distance and convert it to CPU vector with DMatch
@@ -1248,21 +1246,16 @@ namespace cv
     // Convert trainIdx and distance to vector with DMatch
     static void matchConvert(const Mat& trainIdx, const Mat& distance, std::vector<DMatch>& matches);

-    // Find one best match for each query descriptor.
-    void match(const GpuMat& queryDescs, const GpuMat& trainDescs, std::vector<DMatch>& matches,
+    // Find one best match for each query descriptor
+    void match(const GpuMat& query, const GpuMat& train, std::vector<DMatch>& matches,
         const GpuMat& mask = GpuMat());

     // Make gpu collection of trains and masks in suitable format for matchCollection function
     void makeGpuCollection(GpuMat& trainCollection, GpuMat& maskCollection,
-        const std::vector<GpuMat>& masks = std::vector<GpuMat>());
+        const vector<GpuMat>& masks = std::vector<GpuMat>());

-    // Find one best match from train collection for each query descriptor.
-    // trainIdx.at<int>(0, queryIdx) will contain best train index for queryIdx
-    // imgIdx.at<int>(0, queryIdx) will contain best image index for queryIdx
-    // distance.at<float>(0, queryIdx) will contain distance
-    void matchCollection(const GpuMat& queryDescs, const GpuMat& trainCollection,
+    // Find one best match from train collection for each query descriptor
+    void matchCollection(const GpuMat& query, const GpuMat& trainCollection,
         GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,
-        const GpuMat& maskCollection, Stream& stream = Stream::Null());
+        const GpuMat& masks = GpuMat(), Stream& stream = Stream::Null());

     // Download trainIdx, imgIdx and distance and convert it to vector with DMatch
     static void matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, std::vector<DMatch>& matches);
@@ -1270,17 +1263,12 @@ namespace cv
     static void matchConvert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance, std::vector<DMatch>& matches);

     // Find one best match from train collection for each query descriptor.
-    void match(const GpuMat& queryDescs, std::vector<DMatch>& matches,
+    void match(const GpuMat& query, std::vector<DMatch>& matches,
         const std::vector<GpuMat>& masks = std::vector<GpuMat>());

-    // Find k best matches for each query descriptor (in increasing order of distances).
-    // trainIdx.at<int>(queryIdx, i) will contain index of i'th best trains (i < k).
-    // distance.at<float>(queryIdx, i) will contain distance.
-    // allDist is a buffer to store all distance between query descriptors and train descriptors
-    // it have size (nQuery,nTrain) and CV_32F type
-    // allDist.at<float>(queryIdx, trainIdx) will contain FLT_MAX, if trainIdx is one from k best,
-    // otherwise it will contain distance between queryIdx and trainIdx descriptors
-    void knnMatch(const GpuMat& queryDescs, const GpuMat& trainDescs,
+    // Find k best matches for each query descriptor (in increasing order of distances)
+    void knnMatchSingle(const GpuMat& query, const GpuMat& train,
         GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k,
         const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());

     // Download trainIdx and distance and convert it to vector with DMatch
     // compactResult is used when mask is not empty. If compactResult is false matches
@@ -1296,27 +1284,40 @@ namespace cv
     // compactResult is used when mask is not empty. If compactResult is false matches
     // vector will have the same size as queryDescriptors rows. If compactResult is true
     // matches vector will not contain matches for fully masked out query descriptors.
-    void knnMatch(const GpuMat& queryDescs, const GpuMat& trainDescs,
+    void knnMatch(const GpuMat& query, const GpuMat& train,
         std::vector< std::vector<DMatch> >& matches, int k, const GpuMat& mask = GpuMat(),
         bool compactResult = false);

+    // Find k best matches from train collection for each query descriptor (in increasing order of distances)
+    void knnMatch2Collection(const GpuMat& query, const GpuMat& trainCollection,
+        GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,
+        const GpuMat& maskCollection = GpuMat(), Stream& stream = Stream::Null());
+
+    // Download trainIdx and distance and convert it to vector with DMatch
+    // compactResult is used when mask is not empty. If compactResult is false matches
+    // vector will have the same size as queryDescriptors rows. If compactResult is true
+    // matches vector will not contain matches for fully masked out query descriptors.
+    static void knnMatch2Download(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance,
+        std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
+
+    // Convert trainIdx and distance to vector with DMatch
+    static void knnMatch2Convert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance,
+        std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
+
     // Find k best matches for each query descriptor (in increasing order of distances).
     // compactResult is used when mask is not empty. If compactResult is false matches
     // vector will have the same size as queryDescriptors rows. If compactResult is true
     // matches vector will not contain matches for fully masked out query descriptors.
-    void knnMatch(const GpuMat& queryDescs,
-        std::vector< std::vector<DMatch> >& matches, int knn,
+    void knnMatch(const GpuMat& query,
+        std::vector< std::vector<DMatch> >& matches, int k,
         const std::vector<GpuMat>& masks = std::vector<GpuMat>(),
         bool compactResult = false);

     // Find best matches for each query descriptor which have distance less than maxDistance.
     // nMatches.at<int>(0, queryIdx) will contain matches count for queryIdx.
     // carefully nMatches can be greater than trainIdx.cols - it means that matcher didn't find all matches,
     // because it didn't have enough memory.
-    // trainIdx.at<int>(queruIdx, i) will contain ith train index (i < min(nMatches.at<int>(0, queruIdx), trainIdx.cols))
-    // distance.at<int>(queruIdx, i) will contain ith distance (i < min(nMatches.at<int>(0, queruIdx), trainIdx.cols))
-    // If trainIdx is empty, then trainIdx and distance will be created with size nQuery x (nTrain / 2),
+    // If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nTrain / 100), 10),
     // otherwize user can pass own allocated trainIdx and distance with size nQuery x nMaxMatches
     // Matches doesn't sorted.
-    void radiusMatchSingle(const GpuMat& queryDescs, const GpuMat& trainDescs,
+    void radiusMatchSingle(const GpuMat& query, const GpuMat& train,
         GpuMat& trainIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance,
         const GpuMat& mask = GpuMat(), Stream& stream = Stream::Null());
@@ -1333,15 +1334,16 @@ namespace cv
     // Find best matches for each query descriptor which have distance less than maxDistance
     // in increasing order of distances).
-    void radiusMatch(const GpuMat& queryDescs, const GpuMat& trainDescs,
+    void radiusMatch(const GpuMat& query, const GpuMat& train,
         std::vector< std::vector<DMatch> >& matches, float maxDistance,
         const GpuMat& mask = GpuMat(), bool compactResult = false);

     // Find best matches for each query descriptor which have distance less than maxDistance.
+    // If trainIdx is empty, then trainIdx and distance will be created with size nQuery x max((nQuery / 100), 10),
+    // otherwize user can pass own allocated trainIdx and distance with size nQuery x nMaxMatches
     // Matches doesn't sorted.
-    void radiusMatchCollection(const GpuMat& queryDescs, const GpuMat& trainCollection,
+    void radiusMatchCollection(const GpuMat& query,
         GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches, float maxDistance,
-        const GpuMat& maskCollection, Stream& stream = Stream::Null());
+        const std::vector<GpuMat>& masks = std::vector<GpuMat>(), Stream& stream = Stream::Null());

     // Download trainIdx, imgIdx, nMatches and distance and convert it to vector with DMatch.
     // matches will be sorted in increasing order of distances.
@@ -1356,7 +1358,7 @@ namespace cv
     // Find best matches from train collection for each query descriptor which have distance less than
     // maxDistance (in increasing order of distances).
-    void radiusMatch(const GpuMat& queryDescs,
+    void radiusMatch(const GpuMat& query,
         std::vector< std::vector<DMatch> >& matches, float maxDistance,
         const std::vector<GpuMat>& masks = std::vector<GpuMat>(),
         bool compactResult = false);

     DistType distType;
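The header now splits every matching operation into a raw GPU stage (matchSingle/knnMatchSingle/…Collection), a download stage, and a CPU convert stage. A sketch of driving the k-NN path through those stages explicitly, with the usual ratio test on the CPU side (query/train are assumed to be descriptor matrices already on the device; the 0.8 threshold is an illustrative choice):

    #include <vector>
    #include <opencv2/features2d/features2d.hpp>
    #include <opencv2/gpu/gpu.hpp>

    void knnWithRatioTest(const cv::gpu::GpuMat& query, const cv::gpu::GpuMat& train,
                          std::vector<cv::DMatch>& good)
    {
        cv::gpu::BruteForceMatcher_GPU< cv::L2<float> > matcher;

        cv::gpu::GpuMat trainIdx, distance, allDist;     // reusable device buffers
        matcher.knnMatchSingle(query, train, trainIdx, distance, allDist, 2);

        std::vector< std::vector<cv::DMatch> > matches;  // download + convert
        matcher.knnMatchDownload(trainIdx, distance, matches);

        // Lowe-style ratio filter over the two nearest neighbours.
        for (size_t i = 0; i < matches.size(); ++i)
            if (matches[i].size() == 2 &&
                matches[i][0].distance < 0.8f * matches[i][1].distance)
                good.push_back(matches[i][0]);
    }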
modules/gpu/perf/perf_features2d.cpp
 #include "perf_precomp.hpp"

 PERF_TEST_P(DevInfo_DescSize, BruteForceMatcher_match, testing::Combine(testing::ValuesIn(devices()),
-    testing::Values(64, 128)))
+    testing::Values(64, 128, 256)))
 {
     DeviceInfo devInfo = std::tr1::get<0>(GetParam());
     int desc_size = std::tr1::get<1>(GetParam());
@@ -19,7 +19,7 @@ PERF_TEST_P(DevInfo_DescSize, BruteForceMatcher_match, testing::Combine(testing:
     BruteForceMatcher_GPU< L2<float> > matcher;

-    declare.time(0.5).iterations(100);
+    declare.time(3.0);

     SIMPLE_TEST_CYCLE()
     {
@@ -35,7 +35,7 @@ PERF_TEST_P(DevInfo_DescSize, BruteForceMatcher_match, testing::Combine(testing:
 PERF_TEST_P(DevInfo_K_DescSize, BruteForceMatcher_knnMatch, testing::Combine(testing::ValuesIn(devices()),
     testing::Values(2, 3),
-    testing::Values(64, 128)))
+    testing::Values(64, 128, 256)))
 {
     DeviceInfo devInfo = std::tr1::get<0>(GetParam());
     int k = std::tr1::get<1>(GetParam());
@@ -54,11 +54,11 @@ PERF_TEST_P(DevInfo_K_DescSize, BruteForceMatcher_knnMatch, testing::Combine(tes
     BruteForceMatcher_GPU< L2<float> > matcher;

-    declare.time(0.5).iterations(100);
+    declare.time(3.0);

     SIMPLE_TEST_CYCLE()
     {
-        matcher.knnMatch(query, train, trainIdx, distance, allDist, k);
+        matcher.knnMatchSingle(query, train, trainIdx, distance, allDist, k);
     }

     Mat trainIdx_host(trainIdx);
@@ -69,7 +69,7 @@ PERF_TEST_P(DevInfo_K_DescSize, BruteForceMatcher_knnMatch, testing::Combine(tes
 }

 PERF_TEST_P(DevInfo_DescSize, BruteForceMatcher_radiusMatch, testing::Combine(testing::ValuesIn(devices(SHARED_ATOMICS)),
-    testing::Values(64, 128)))
+    testing::Values(64, 128, 256)))
 {
     DeviceInfo devInfo = std::tr1::get<0>(GetParam());
     int desc_size = std::tr1::get<1>(GetParam());
@@ -85,7 +85,7 @@ PERF_TEST_P(DevInfo_DescSize, BruteForceMatcher_radiusMatch, testing::Combine(te
     BruteForceMatcher_GPU< L2<float> > matcher;

-    declare.time(0.5).iterations(100);
+    declare.time(3.0);

     SIMPLE_TEST_CYCLE()
     {
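The updated tests time only the kernel stage and keep the output GpuMats outside the loop, so ensureSizeIsEnough() inside the matcher allocates on the first call and later iterations reuse the memory. A sketch of the same pattern outside the perf harness (query/train assumed already on the device):

    #include <opencv2/features2d/features2d.hpp>
    #include <opencv2/gpu/gpu.hpp>

    void benchmarkKnn(const cv::gpu::GpuMat& query, const cv::gpu::GpuMat& train)
    {
        cv::gpu::BruteForceMatcher_GPU< cv::L2<float> > matcher;
        cv::gpu::GpuMat trainIdx, distance, allDist;   // allocated once, reused

        for (int iter = 0; iter < 100; ++iter)
            matcher.knnMatchSingle(query, train, trainIdx, distance, allDist, 2);

        cv::Mat trainIdx_host(trainIdx);   // download once at the end, as the test does
        (void)trainIdx_host;
    }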
modules/gpu/src/brute_force_matcher.cpp
@@ -56,86 +56,101 @@ bool cv::gpu::BruteForceMatcher_GPU_base::empty() const { throw_nogpu(); return
 bool cv::gpu::BruteForceMatcher_GPU_base::isMaskSupported() const { throw_nogpu(); return true; }
 void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat&, const GpuMat&, vector<DMatch>&) { throw_nogpu(); }
-void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat&, const Mat&, std::vector<DMatch>&) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat&, const Mat&, vector<DMatch>&) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat&, const GpuMat&, vector<DMatch>&, const GpuMat&) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::makeGpuCollection(GpuMat&, GpuMat&, const vector<GpuMat>&) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); }
-void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat&, const GpuMat&, const GpuMat&, std::vector<DMatch>&) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat&, const GpuMat&, const GpuMat&, vector<DMatch>&) { throw_nogpu(); }
-void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat&, const Mat&, const Mat&, std::vector<DMatch>&) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat&, const Mat&, const Mat&, vector<DMatch>&) { throw_nogpu(); }
-void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat&, std::vector<DMatch>&, const std::vector<GpuMat>&) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat&, vector<DMatch>&, const vector<GpuMat>&) { throw_nogpu(); }
-void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, const GpuMat&, Stream&) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, const GpuMat&, Stream&) { throw_nogpu(); }
-void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, bool) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat&, const GpuMat&, vector< vector<DMatch> >&, bool) { throw_nogpu(); }
-void cv::gpu::BruteForceMatcher_GPU_base::knnMatchConvert(const Mat&, const Mat&, std::vector< std::vector<DMatch> >&, bool) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatchConvert(const Mat&, const Mat&, vector< vector<DMatch> >&, bool) { throw_nogpu(); }
-void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, int, const GpuMat&, bool) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, const GpuMat&, vector< vector<DMatch> >&, int, const GpuMat&, bool) { throw_nogpu(); }
-void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, std::vector< std::vector<DMatch> >&, int, const std::vector<GpuMat>&, bool) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, const GpuMat&, Stream&) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Download(const GpuMat&, const GpuMat&, const GpuMat&, vector< vector<DMatch> >&, bool) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Convert(const Mat&, const Mat&, const Mat&, vector< vector<DMatch> >&, bool) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, vector< vector<DMatch> >&, int, const vector<GpuMat>&, bool) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchSingle(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, float, const GpuMat&, Stream&) { throw_nogpu(); }
-void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat&, const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, bool) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat&, const GpuMat&, const GpuMat&, vector< vector<DMatch> >&, bool) { throw_nogpu(); }
-void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat&, const Mat&, const Mat&, std::vector< std::vector<DMatch> >&, bool) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat&, const Mat&, const Mat&, vector< vector<DMatch> >&, bool) { throw_nogpu(); }
-void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, float, const GpuMat&, bool) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, const GpuMat&, vector< vector<DMatch> >&, float, const GpuMat&, bool) { throw_nogpu(); }
-void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, float, const GpuMat&, Stream&) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat&, GpuMat&, GpuMat&, GpuMat&, GpuMat&, float, const vector<GpuMat>&, Stream&) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat&, const GpuMat&, const GpuMat&, const GpuMat&, vector< vector<DMatch> >&, bool) { throw_nogpu(); }
 void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat&, const Mat&, const Mat&, const Mat&, vector< vector<DMatch> >&, bool) { throw_nogpu(); }
-void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, std::vector< std::vector<DMatch> >&, float, const std::vector<GpuMat>&, bool) { throw_nogpu(); }
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, vector< vector<DMatch> >&, float, const vector<GpuMat>&, bool) { throw_nogpu(); }

 #else /* !defined (HAVE_CUDA) */
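Everything above is the CPU-only fallback: when OpenCV is built without CUDA, each method is a stub that calls the file-local throw_nogpu() helper, and the real implementations below are compiled only under HAVE_CUDA. An illustrative skeleton of the guard, not the full file:

    #if !defined (HAVE_CUDA)

    void cv::gpu::BruteForceMatcher_GPU_base::clear() { throw_nogpu(); }
    // ... one stub per public method ...

    #else /* !defined (HAVE_CUDA) */

    // kernel declarations and the real implementations go here

    #endif /* !defined (HAVE_CUDA) */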
 namespace cv { namespace gpu { namespace bf_match
 {
-    template <typename T> void matchSingleL1_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask,
-        const DevMem2D& trainIdx, const DevMem2D& distance,
-        int cc, cudaStream_t stream);
-    template <typename T> void matchSingleL2_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask,
-        const DevMem2D& trainIdx, const DevMem2D& distance,
-        int cc, cudaStream_t stream);
-    template <typename T> void matchSingleHamming_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask,
-        const DevMem2D& trainIdx, const DevMem2D& distance,
-        int cc, cudaStream_t stream);
-    template <typename T> void matchCollectionL1_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection,
-        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
-        int cc, cudaStream_t stream);
-    template <typename T> void matchCollectionL2_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection,
-        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
-        int cc, cudaStream_t stream);
-    template <typename T> void matchCollectionHamming_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection,
-        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
-        int cc, cudaStream_t stream);
+    template <typename T> void matchL1_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask,
+        const DevMem2Di& trainIdx, const DevMem2Df& distance,
+        int cc, cudaStream_t stream);
+    template <typename T> void matchL2_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask,
+        const DevMem2Di& trainIdx, const DevMem2Df& distance,
+        int cc, cudaStream_t stream);
+    template <typename T> void matchHamming_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask,
+        const DevMem2Di& trainIdx, const DevMem2Df& distance,
+        int cc, cudaStream_t stream);
+    template <typename T> void matchL1_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks,
+        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+        int cc, cudaStream_t stream);
+    template <typename T> void matchL2_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks,
+        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+        int cc, cudaStream_t stream);
+    template <typename T> void matchHamming_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks,
+        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
+        int cc, cudaStream_t stream);
 }}}

 namespace cv { namespace gpu { namespace bf_knnmatch
 {
-    template <typename T> void knnMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
-        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
-        int cc, cudaStream_t stream);
-    template <typename T> void knnMatchL2_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
-        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
-        int cc, cudaStream_t stream);
-    template <typename T> void knnMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
-        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
-        int cc, cudaStream_t stream);
+    template <typename T> void matchL1_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
+        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist,
+        int cc, cudaStream_t stream);
+    template <typename T> void matchL2_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
+        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist,
+        int cc, cudaStream_t stream);
+    template <typename T> void matchHamming_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
+        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist,
+        int cc, cudaStream_t stream);
+    template <typename T> void match2L1_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks,
+        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
+        int cc, cudaStream_t stream);
+    template <typename T> void match2L2_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks,
+        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
+        int cc, cudaStream_t stream);
+    template <typename T> void match2Hamming_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks,
+        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
+        int cc, cudaStream_t stream);
 }}}

 namespace cv { namespace gpu { namespace bf_radius_match
 {
-    template <typename T> void radiusMatchSingleL1_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
-        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches,
-        cudaStream_t stream);
-    template <typename T> void radiusMatchSingleL2_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
-        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches,
-        cudaStream_t stream);
-    template <typename T> void radiusMatchSingleHamming_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
-        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches,
-        cudaStream_t stream);
-    template <typename T> void radiusMatchCollectionL1_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_<PtrStep>& maskCollection,
-        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches,
-        cudaStream_t stream);
-    template <typename T> void radiusMatchCollectionL2_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_<PtrStep>& maskCollection,
-        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches,
-        cudaStream_t stream);
-    template <typename T> void radiusMatchCollectionHamming_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_<PtrStep>& maskCollection,
-        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches,
-        cudaStream_t stream);
+    template <typename T> void matchL1_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
+        const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+        int cc, cudaStream_t stream);
+    template <typename T> void matchL2_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
+        const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+        int cc, cudaStream_t stream);
+    template <typename T> void matchHamming_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
+        const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+        int cc, cudaStream_t stream);
+    template <typename T> void matchL1_gpu(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks,
+        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+        int cc, cudaStream_t stream);
+    template <typename T> void matchL2_gpu(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks,
+        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+        int cc, cudaStream_t stream);
+    template <typename T> void matchHamming_gpu(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks,
+        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+        int cc, cudaStream_t stream);
 }}}
 cv::gpu::BruteForceMatcher_GPU_base::BruteForceMatcher_GPU_base(DistType distType_) : distType(distType_)
@@ -173,52 +188,53 @@ bool cv::gpu::BruteForceMatcher_GPU_base::isMaskSupported() const
 ////////////////////////////////////////////////////////////////////
 // Match

-void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs, const GpuMat& trainDescs,
+void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& query, const GpuMat& train,
     GpuMat& trainIdx, GpuMat& distance, const GpuMat& mask, Stream& stream)
 {
-    if (queryDescs.empty() || trainDescs.empty())
+    if (query.empty() || train.empty())
         return;

     using namespace cv::gpu::bf_match;

-    typedef void (*match_caller_t)(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask,
-        const DevMem2D& trainIdx, const DevMem2D& distance,
+    typedef void (*caller_t)(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask,
+        const DevMem2Di& trainIdx, const DevMem2Df& distance,
         int cc, cudaStream_t stream);

-    static const match_caller_t match_callers[3][8] =
+    static const caller_t callers[3][6] =
     {
         {
-            matchSingleL1_gpu<unsigned char>, 0/*matchSingleL1_gpu<signed char>*/, matchSingleL1_gpu<unsigned short>,
-            matchSingleL1_gpu<short>, matchSingleL1_gpu<int>, matchSingleL1_gpu<float>, 0, 0
+            matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/, matchL1_gpu<unsigned short>,
+            matchL1_gpu<short>, matchL1_gpu<int>, matchL1_gpu<float>
         },
         {
-            0/*matchSingleL2_gpu<unsigned char>*/, 0/*matchSingleL2_gpu<signed char>*/, 0/*matchSingleL2_gpu<unsigned short>*/,
-            0/*matchSingleL2_gpu<short>*/, 0/*matchSingleL2_gpu<int>*/, matchSingleL2_gpu<float>, 0, 0
+            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/, 0/*matchL2_gpu<unsigned short>*/,
+            0/*matchL2_gpu<short>*/, 0/*matchL2_gpu<int>*/, matchL2_gpu<float>
         },
         {
-            matchSingleHamming_gpu<unsigned char>, 0/*matchSingleHamming_gpu<signed char>*/, matchSingleHamming_gpu<unsigned short>,
-            0/*matchSingleHamming_gpu<short>*/, matchSingleHamming_gpu<int>, 0, 0, 0
+            matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/, matchHamming_gpu<unsigned short>,
+            0/*matchHamming_gpu<short>*/, matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
         }
     };

-    CV_Assert(queryDescs.channels() == 1 && queryDescs.depth() < CV_64F);
-    CV_Assert(trainDescs.cols == queryDescs.cols && trainDescs.type() == queryDescs.type());
+    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
+    CV_Assert(train.cols == query.cols && train.type() == query.type());

-    const int nQuery = queryDescs.rows;
+    const int nQuery = query.rows;

     ensureSizeIsEnough(1, nQuery, CV_32S, trainIdx);
     ensureSizeIsEnough(1, nQuery, CV_32F, distance);

-    match_caller_t func = match_callers[distType][queryDescs.depth()];
+    caller_t func = callers[distType][query.depth()];
     CV_Assert(func != 0);

     DeviceInfo info;
     int cc = info.majorVersion() * 10 + info.minorVersion();

-    func(queryDescs, trainDescs, mask, trainIdx, distance, cc, StreamAccessor::getStream(stream));
+    func(query, train, mask, trainIdx, distance, cc, StreamAccessor::getStream(stream));
 }

 void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& distance, vector<DMatch>& matches)
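A note on the table shape above: the second index of callers is the OpenCV depth code of the descriptors (CV_8U = 0, CV_8S = 1, CV_16U = 2, CV_16S = 3, CV_32S = 4, CV_32F = 5), so six columns cover every depth admitted by CV_Assert(query.depth() < CV_64F); the old [3][8] table carried always-zero padding columns. A hypothetical, compilable reduction of the dispatch idiom (the caller names are stand-ins, not the real kernels):

    #include <cassert>

    typedef void (*caller_t)();

    static void callerU8()  {}   // stand-in for an integer-depth kernel
    static void callerF32() {}   // stand-in for a float-depth kernel

    int main()
    {
        // Rows: L1Dist, L2Dist, HammingDist. Columns: depth codes 0..5.
        static const caller_t callers[3][6] =
        {
            { callerU8, 0, 0, 0, 0, callerF32 },   // L1: integer and float
            { 0,        0, 0, 0, 0, callerF32 },   // L2: float only
            { callerU8, 0, 0, 0, 0, 0         }    // Hamming: integer only
        };

        int distType = 1;                 // L2Dist
        int depth    = 5;                 // CV_32F
        caller_t func = callers[distType][depth];
        assert(func != 0);                // mirrors CV_Assert in matchSingle
        func();
        return 0;
    }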
@@ -232,13 +248,13 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx,
     matchConvert(trainIdxCPU, distanceCPU, matches);
 }

-void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat& trainIdx, const Mat& distance, std::vector<DMatch>& matches)
+void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat& trainIdx, const Mat& distance, vector<DMatch>& matches)
 {
     if (trainIdx.empty() || distance.empty())
         return;

-    CV_Assert(trainIdx.type() == CV_32SC1 && trainIdx.isContinuous());
-    CV_Assert(distance.type() == CV_32FC1 && distance.isContinuous() && distance.cols == trainIdx.cols);
+    CV_Assert(trainIdx.type() == CV_32SC1);
+    CV_Assert(distance.type() == CV_32FC1 && distance.cols == trainIdx.cols);

     const int nQuery = trainIdx.cols;
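The download/convert split means the device-to-host copy can be done by the caller and overlapped with other work, and the relaxed asserts (no isContinuous() requirement any more) admit matrices with padded rows. A sketch of using the two stages separately, assuming trainIdx/distance came from matchSingle():

    #include <vector>
    #include <opencv2/gpu/gpu.hpp>

    void downloadAndConvert(const cv::gpu::GpuMat& trainIdx,
                            const cv::gpu::GpuMat& distance,
                            std::vector<cv::DMatch>& matches)
    {
        cv::Mat trainIdxCPU(trainIdx);   // GpuMat -> Mat downloads device memory
        cv::Mat distanceCPU(distance);

        // matchConvert is a public static member, so the conversion can run
        // on matrices that were downloaded elsewhere.
        cv::gpu::BruteForceMatcher_GPU_base::matchConvert(trainIdxCPU, distanceCPU, matches);
    }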
@@ -250,6 +266,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat& trainIdx, cons
     for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++distance_ptr)
     {
         int trainIdx = *trainIdx_ptr;
+
         if (trainIdx == -1)
             continue;
@@ -261,11 +278,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat& trainIdx, cons
     }
 }

-void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat& queryDescs, const GpuMat& trainDescs,
+void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat& query, const GpuMat& train,
     vector<DMatch>& matches, const GpuMat& mask)
 {
     GpuMat trainIdx, distance;
-    matchSingle(queryDescs, trainDescs, trainIdx, distance, mask);
+    matchSingle(query, train, trainIdx, distance, mask);
     matchDownload(trainIdx, distance, matches);
 }
@@ -279,14 +296,13 @@ void cv::gpu::BruteForceMatcher_GPU_base::makeGpuCollection(GpuMat& trainCollect
 {
     Mat trainCollectionCPU(1, static_cast<int>(trainDescCollection.size()), CV_8UC(sizeof(DevMem2D)));

-    for (size_t i = 0; i < trainDescCollection.size(); ++i)
-    {
-        const GpuMat& trainDescs = trainDescCollection[i];
-        trainCollectionCPU.ptr<DevMem2D>(0)[i] = trainDescs;
-    }
+    DevMem2D* trainCollectionCPU_ptr = trainCollectionCPU.ptr<DevMem2D>();
+
+    for (size_t i = 0, size = trainDescCollection.size(); i < size; ++i, ++trainCollectionCPU_ptr)
+        *trainCollectionCPU_ptr = trainDescCollection[i];

     trainCollection.upload(trainCollectionCPU);
+    maskCollection.release();
 }
 else
 {
@@ -295,16 +311,18 @@ void cv::gpu::BruteForceMatcher_GPU_base::makeGpuCollection(GpuMat& trainCollect
     Mat trainCollectionCPU(1, static_cast<int>(trainDescCollection.size()), CV_8UC(sizeof(DevMem2D)));
     Mat maskCollectionCPU(1, static_cast<int>(trainDescCollection.size()), CV_8UC(sizeof(PtrStep)));

-    for (size_t i = 0; i < trainDescCollection.size(); ++i)
+    DevMem2D* trainCollectionCPU_ptr = trainCollectionCPU.ptr<DevMem2D>();
+    PtrStep* maskCollectionCPU_ptr = maskCollectionCPU.ptr<PtrStep>();
+
+    for (size_t i = 0, size = trainDescCollection.size(); i < size; ++i, ++trainCollectionCPU_ptr, ++maskCollectionCPU_ptr)
     {
-        const GpuMat& trainDescs = trainDescCollection[i];
+        const GpuMat& train = trainDescCollection[i];
         const GpuMat& mask = masks[i];

-        CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.cols == trainDescs.rows));
+        CV_Assert(mask.empty() || (mask.type() == CV_8UC1 && mask.cols == train.rows));

-        trainCollectionCPU.ptr<DevMem2D>(0)[i] = trainDescs;
-        maskCollectionCPU.ptr<PtrStep>(0)[i] = mask;
+        *trainCollectionCPU_ptr = train;
+        *maskCollectionCPU_ptr = mask;
     }

     trainCollection.upload(trainCollectionCPU);
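makeGpuCollection() relies on DevMem2D being a POD header (data pointer, step, size), so one header per training image can be stored contiguously in a byte Mat and shipped to the device in a single upload; the rewrite above just walks the row with a raw pointer instead of re-evaluating ptr<DevMem2D>(0)[i] each iteration. A reduced sketch of the packing trick (the collection contents are assumed to be filled elsewhere):

    #include <vector>
    #include <opencv2/gpu/gpu.hpp>

    void packCollection(const std::vector<cv::gpu::GpuMat>& trainDescCollection,
                        cv::gpu::GpuMat& trainCollection)
    {
        cv::Mat headers(1, static_cast<int>(trainDescCollection.size()),
                        CV_8UC(sizeof(cv::gpu::DevMem2D)));

        cv::gpu::DevMem2D* p = headers.ptr<cv::gpu::DevMem2D>();
        for (size_t i = 0; i < trainDescCollection.size(); ++i, ++p)
            *p = trainDescCollection[i];   // GpuMat converts to its DevMem2D header

        trainCollection.upload(headers);   // one host->device copy for everything
    }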
@@ -312,52 +330,53 @@ void cv::gpu::BruteForceMatcher_GPU_base::makeGpuCollection(GpuMat& trainCollect
     }
 }

-void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDescs, const GpuMat& trainCollection,
-    GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, const GpuMat& maskCollection, Stream& stream)
+void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& query, const GpuMat& trainCollection,
+    GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, const GpuMat& masks, Stream& stream)
 {
-    if (queryDescs.empty() || trainCollection.empty())
+    if (query.empty() || trainCollection.empty())
         return;

     using namespace cv::gpu::bf_match;

-    typedef void (*match_caller_t)(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection,
-        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
+    typedef void (*caller_t)(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks,
+        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
         int cc, cudaStream_t stream);

-    static const match_caller_t match_callers[3][8] =
+    static const caller_t callers[3][6] =
     {
         {
-            matchCollectionL1_gpu<unsigned char>, 0/*matchCollectionL1_gpu<signed char>*/, matchCollectionL1_gpu<unsigned short>,
-            matchCollectionL1_gpu<short>, matchCollectionL1_gpu<int>, matchCollectionL1_gpu<float>, 0, 0
+            matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/, matchL1_gpu<unsigned short>,
+            matchL1_gpu<short>, matchL1_gpu<int>, matchL1_gpu<float>
         },
         {
-            0/*matchCollectionL2_gpu<unsigned char>*/, 0/*matchCollectionL2_gpu<signed char>*/, 0/*matchCollectionL2_gpu<unsigned short>*/,
-            0/*matchCollectionL2_gpu<short>*/, 0/*matchCollectionL2_gpu<int>*/, matchCollectionL2_gpu<float>, 0, 0
+            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/, 0/*matchL2_gpu<unsigned short>*/,
+            0/*matchL2_gpu<short>*/, 0/*matchL2_gpu<int>*/, matchL2_gpu<float>
         },
         {
-            matchCollectionHamming_gpu<unsigned char>, 0/*matchCollectionHamming_gpu<signed char>*/, matchCollectionHamming_gpu<unsigned short>,
-            0/*matchCollectionHamming_gpu<short>*/, matchCollectionHamming_gpu<int>, 0, 0, 0
+            matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/, matchHamming_gpu<unsigned short>,
+            0/*matchHamming_gpu<short>*/, matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
         }
     };

-    CV_Assert(queryDescs.channels() == 1 && queryDescs.depth() < CV_64F);
+    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);

-    const int nQuery = queryDescs.rows;
+    const int nQuery = query.rows;

     ensureSizeIsEnough(1, nQuery, CV_32S, trainIdx);
     ensureSizeIsEnough(1, nQuery, CV_32S, imgIdx);
     ensureSizeIsEnough(1, nQuery, CV_32F, distance);

-    match_caller_t func = match_callers[distType][queryDescs.depth()];
+    caller_t func = callers[distType][query.depth()];
     CV_Assert(func != 0);

     DeviceInfo info;
     int cc = info.majorVersion() * 10 + info.minorVersion();

-    func(queryDescs, trainCollection, maskCollection, trainIdx, imgIdx, distance, cc, StreamAccessor::getStream(stream));
+    func(query, trainCollection, masks, trainIdx, imgIdx, distance, cc, StreamAccessor::getStream(stream));
 }

 void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, vector<DMatch>& matches)
@@ -377,9 +396,9 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat& trainIdx, cons
     if (trainIdx.empty() || imgIdx.empty() || distance.empty())
         return;

-    CV_Assert(trainIdx.type() == CV_32SC1 && trainIdx.isContinuous());
-    CV_Assert(imgIdx.type() == CV_32SC1 && imgIdx.isContinuous() && imgIdx.cols == trainIdx.cols);
-    CV_Assert(distance.type() == CV_32FC1 && distance.isContinuous() && imgIdx.cols == trainIdx.cols);
+    CV_Assert(trainIdx.type() == CV_32SC1);
+    CV_Assert(imgIdx.type() == CV_32SC1 && imgIdx.cols == trainIdx.cols);
+    CV_Assert(distance.type() == CV_32FC1 && distance.cols == trainIdx.cols);

     const int nQuery = trainIdx.cols;
@@ -392,6 +411,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat& trainIdx, cons
     for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
     {
         int trainIdx = *trainIdx_ptr;
+
         if (trainIdx == -1)
             continue;
@@ -405,7 +425,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchConvert(const Mat& trainIdx, cons
     }
 }

-void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat& queryDescs, vector<DMatch>& matches, const vector<GpuMat>& masks)
+void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat& query, vector<DMatch>& matches, const vector<GpuMat>& masks)
 {
     GpuMat trainCollection;
     GpuMat maskCollection;
@@ -414,46 +434,50 @@ void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat& queryDescs, vector
     GpuMat trainIdx, imgIdx, distance;

-    matchCollection(queryDescs, trainCollection, trainIdx, imgIdx, distance, maskCollection);
+    matchCollection(query, trainCollection, trainIdx, imgIdx, distance, maskCollection);
     matchDownload(trainIdx, imgIdx, distance, matches);
 }

 ////////////////////////////////////////////////////////////////////
 // KnnMatch

-void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, const GpuMat& trainDescs,
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatchSingle(const GpuMat& query, const GpuMat& train,
     GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k, const GpuMat& mask, Stream& stream)
 {
-    if (queryDescs.empty() || trainDescs.empty())
+    if (query.empty() || train.empty())
         return;

     using namespace cv::gpu::bf_knnmatch;

-    typedef void (*match_caller_t)(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
-        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
+    typedef void (*caller_t)(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
+        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist,
         int cc, cudaStream_t stream);

-    static const match_caller_t match_callers[3][8] =
+    static const caller_t callers[3][6] =
     {
         {
-            knnMatchL1_gpu<unsigned char>, 0/*knnMatchL1_gpu<signed char>*/, knnMatchL1_gpu<unsigned short>,
-            knnMatchL1_gpu<short>, knnMatchL1_gpu<int>, knnMatchL1_gpu<float>, 0, 0
+            matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/,
+            matchL1_gpu<unsigned short>, matchL1_gpu<short>,
+            matchL1_gpu<int>, matchL1_gpu<float>
         },
         {
-            0/*knnMatchL2_gpu<unsigned char>*/, 0/*knnMatchL2_gpu<signed char>*/, 0/*knnMatchL2_gpu<unsigned short>*/,
-            0/*knnMatchL2_gpu<short>*/, 0/*knnMatchL2_gpu<int>*/, knnMatchL2_gpu<float>, 0, 0
+            0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/,
+            0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/,
+            0/*matchL2_gpu<int>*/, matchL2_gpu<float>
         },
         {
-            knnMatchHamming_gpu<unsigned char>, 0/*knnMatchHamming_gpu<signed char>*/, knnMatchHamming_gpu<unsigned short>,
-            0/*knnMatchHamming_gpu<short>*/, knnMatchHamming_gpu<int>, 0, 0, 0
+            matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/,
+            matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/,
+            matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/
         }
     };

-    CV_Assert(queryDescs.channels() == 1 && queryDescs.depth() < CV_64F);
-    CV_Assert(trainDescs.type() == queryDescs.type() && trainDescs.cols == queryDescs.cols);
+    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
+    CV_Assert(train.type() == query.type() && train.cols == query.cols);

-    const int nQuery = queryDescs.rows;
-    const int nTrain = trainDescs.rows;
+    const int nQuery = query.rows;
+    const int nTrain = train.rows;

     if (k == 2)
     {
...
@@ -468,25 +492,17 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con
...
@@ -468,25 +492,17 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con
}
}
if
(
stream
)
if
(
stream
)
{
stream
.
enqueueMemSet
(
trainIdx
,
Scalar
::
all
(
-
1
));
stream
.
enqueueMemSet
(
trainIdx
,
Scalar
::
all
(
-
1
));
if
(
k
!=
2
)
stream
.
enqueueMemSet
(
allDist
,
Scalar
::
all
(
numeric_limits
<
float
>::
max
()));
}
else
else
{
trainIdx
.
setTo
(
Scalar
::
all
(
-
1
));
trainIdx
.
setTo
(
Scalar
::
all
(
-
1
));
if
(
k
!=
2
)
allDist
.
setTo
(
Scalar
::
all
(
numeric_limits
<
float
>::
max
()));
}
match_caller_t
func
=
match_callers
[
distType
][
queryDescs
.
depth
()];
caller_t
func
=
callers
[
distType
][
query
.
depth
()];
CV_Assert
(
func
!=
0
);
CV_Assert
(
func
!=
0
);
DeviceInfo
info
;
DeviceInfo
info
;
int
cc
=
info
.
majorVersion
()
*
10
+
info
.
minorVersion
();
int
cc
=
info
.
majorVersion
()
*
10
+
info
.
minorVersion
();
func
(
query
Descs
,
trainDescs
,
k
,
mask
,
trainIdx
,
distance
,
allDist
,
cc
,
StreamAccessor
::
getStream
(
stream
));
func
(
query
,
train
,
k
,
mask
,
trainIdx
,
distance
,
allDist
,
cc
,
StreamAccessor
::
getStream
(
stream
));
}
}
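The Single/Download split keeps the GPU work enqueueable; a sketch of driving knnMatchSingle on a stream (assuming GpuMat descriptors query and train already live on the device, as in the function above):

cv::gpu::BruteForceMatcher_GPU< cv::L2<float> > matcher;
cv::gpu::GpuMat trainIdx, distance, allDist;
cv::gpu::Stream stream;

// Enqueue the k-NN search (k = 2) without blocking the host.
matcher.knnMatchSingle(query, train, trainIdx, distance, allDist, 2, cv::gpu::GpuMat(), stream);

stream.waitForCompletion();

std::vector< std::vector<cv::DMatch> > matches;
matcher.knnMatchDownload(trainIdx, distance, matches);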
 void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance,
@@ -502,7 +518,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat& trainId
 }

 void cv::gpu::BruteForceMatcher_GPU_base::knnMatchConvert(const Mat& trainIdx, const Mat& distance,
-    std::vector< std::vector<DMatch> >& matches, bool compactResult)
+    vector< vector<DMatch> >& matches, bool compactResult)
 {
     if (trainIdx.empty() || distance.empty())
         return;
@@ -546,14 +562,127 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatchConvert(const Mat& trainIdx, c
 }
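knnMatchDownload and knnMatchConvert differ only in where the result matrices live: Download copies the GpuMats to the host and forwards to Convert. When the copy itself should be overlapped, the two steps can be separated by hand (a sketch, assuming the stream/matcher/result GpuMats from the previous example; for truly asynchronous copies the destination Mats should be page-locked):

cv::Mat trainIdxCPU, distanceCPU;
stream.enqueueDownload(trainIdx, trainIdxCPU);
stream.enqueueDownload(distance, distanceCPU);
stream.waitForCompletion();

std::vector< std::vector<cv::DMatch> > matches;
matcher.knnMatchConvert(trainIdxCPU, distanceCPU, matches);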
-void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, const GpuMat& trainDescs,
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& query, const GpuMat& train,
     vector< vector<DMatch> >& matches, int k, const GpuMat& mask, bool compactResult)
 {
     GpuMat trainIdx, distance, allDist;
-    knnMatch(queryDescs, trainDescs, trainIdx, distance, allDist, k, mask);
+    knnMatchSingle(query, train, trainIdx, distance, allDist, k, mask);
     knnMatchDownload(trainIdx, distance, matches, compactResult);
 }
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Collection(const GpuMat& query, const GpuMat& trainCollection,
+    GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,
+    const GpuMat& maskCollection, Stream& stream)
+{
+    if (query.empty() || trainCollection.empty())
+        return;
+
+    using namespace cv::gpu::bf_knnmatch;
+
+    typedef void (*caller_t)(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks,
+        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
+        int cc, cudaStream_t stream);
+
+    static const caller_t callers[3][6] =
+    {
+        { match2L1_gpu<unsigned char>, 0/*match2L1_gpu<signed char>*/, match2L1_gpu<unsigned short>, match2L1_gpu<short>, match2L1_gpu<int>, match2L1_gpu<float> },
+        { 0/*match2L2_gpu<unsigned char>*/, 0/*match2L2_gpu<signed char>*/, 0/*match2L2_gpu<unsigned short>*/, 0/*match2L2_gpu<short>*/, 0/*match2L2_gpu<int>*/, match2L2_gpu<float> },
+        { match2Hamming_gpu<unsigned char>, 0/*match2Hamming_gpu<signed char>*/, match2Hamming_gpu<unsigned short>, 0/*match2Hamming_gpu<short>*/, match2Hamming_gpu<int>, 0/*match2Hamming_gpu<float>*/ }
+    };
+
+    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
+
+    const int nQuery = query.rows;
+
+    ensureSizeIsEnough(1, nQuery, CV_32SC2, trainIdx);
+    ensureSizeIsEnough(1, nQuery, CV_32SC2, imgIdx);
+    ensureSizeIsEnough(1, nQuery, CV_32FC2, distance);
+
+    if (stream)
+        stream.enqueueMemSet(trainIdx, Scalar::all(-1));
+    else
+        trainIdx.setTo(Scalar::all(-1));
+
+    caller_t func = callers[distType][query.depth()];
+    CV_Assert(func != 0);
+
+    DeviceInfo info;
+    int cc = info.majorVersion() * 10 + info.minorVersion();
+
+    func(query, trainCollection, maskCollection, trainIdx, imgIdx, distance, cc, StreamAccessor::getStream(stream));
+}
+
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Download(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance,
+    vector< vector<DMatch> >& matches, bool compactResult)
+{
+    if (trainIdx.empty() || imgIdx.empty() || distance.empty())
+        return;
+
+    Mat trainIdxCPU = trainIdx;
+    Mat imgIdxCPU = imgIdx;
+    Mat distanceCPU = distance;
+
+    knnMatch2Convert(trainIdxCPU, imgIdxCPU, distanceCPU, matches, compactResult);
+}
+
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch2Convert(const Mat& trainIdx, const Mat& imgIdx, const Mat& distance,
+    vector< vector<DMatch> >& matches, bool compactResult)
+{
+    if (trainIdx.empty() || imgIdx.empty() || distance.empty())
+        return;
+
+    CV_Assert(trainIdx.type() == CV_32SC2);
+    CV_Assert(imgIdx.type() == CV_32SC2 && imgIdx.cols == trainIdx.cols);
+    CV_Assert(distance.type() == CV_32FC2 && distance.cols == trainIdx.cols);
+
+    const int nQuery = trainIdx.cols;
+
+    matches.clear();
+    matches.reserve(nQuery);
+
+    const int* trainIdx_ptr = trainIdx.ptr<int>();
+    const int* imgIdx_ptr = imgIdx.ptr<int>();
+    const float* distance_ptr = distance.ptr<float>();
+
+    for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx)
+    {
+        matches.push_back(vector<DMatch>());
+        vector<DMatch>& curMatches = matches.back();
+        curMatches.reserve(2);
+
+        for (int i = 0; i < 2; ++i, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
+        {
+            int trainIdx = *trainIdx_ptr;
+
+            if (trainIdx != -1)
+            {
+                int imgIdx = *imgIdx_ptr;
+                float distance = *distance_ptr;
+
+                DMatch m(queryIdx, trainIdx, imgIdx, distance);
+
+                curMatches.push_back(m);
+            }
+        }
+
+        if (compactResult && curMatches.empty())
+            matches.pop_back();
+    }
+}
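The paired layout produced by knnMatch2Convert (best and second-best per query) is exactly what a ratio test consumes; a minimal sketch of Lowe-style filtering over such a result (the 0.8 threshold is an arbitrary choice for illustration, not something this commit prescribes):

std::vector<cv::DMatch> good;

for (size_t i = 0; i < matches.size(); ++i)
{
    if (matches[i].size() < 2)
        continue;   // empty or single entry: no second-best to compare against

    const cv::DMatch& m1 = matches[i][0];
    const cv::DMatch& m2 = matches[i][1];

    if (m1.distance < 0.8f * m2.distance)
        good.push_back(m1);
}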
 namespace
 {
     struct ImgIdxSetter
@@ -564,103 +693,123 @@
     };
 }
-void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs,
-    vector< vector<DMatch> >& matches, int knn,
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& query,
+    vector< vector<DMatch> >& matches, int k,
     const vector<GpuMat>& masks, bool compactResult)
 {
-    if (queryDescs.empty() || empty())
-        return;
-
-    vector< vector<DMatch> > curMatches;
-    vector<DMatch> temp;
-    temp.reserve(2 * knn);
-
-    matches.resize(queryDescs.rows);
-    for_each(matches.begin(), matches.end(), bind2nd(mem_fun_ref(&vector<DMatch>::reserve), knn));
-
-    for (size_t imgIdx = 0; imgIdx < trainDescCollection.size(); ++imgIdx)
-    {
-        knnMatch(queryDescs, trainDescCollection[imgIdx], curMatches, knn,
-            masks.empty() ? GpuMat() : masks[imgIdx]);
-
-        for (int queryIdx = 0; queryIdx < queryDescs.rows; ++queryIdx)
-        {
-            vector<DMatch>& localMatch = curMatches[queryIdx];
-            vector<DMatch>& globalMatch = matches[queryIdx];
-
-            for_each(localMatch.begin(), localMatch.end(), ImgIdxSetter(static_cast<int>(imgIdx)));
-
-            temp.clear();
-            merge(globalMatch.begin(), globalMatch.end(), localMatch.begin(), localMatch.end(), back_inserter(temp));
-
-            globalMatch.clear();
-            const size_t count = std::min((size_t)knn, temp.size());
-            copy(temp.begin(), temp.begin() + count, back_inserter(globalMatch));
-        }
-    }
-
-    if (compactResult)
-    {
-        vector< vector<DMatch> >::iterator new_end = remove_if(matches.begin(), matches.end(),
-            mem_fun_ref(&vector<DMatch>::empty));
-        matches.erase(new_end, matches.end());
-    }
-}
+void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& query,
+    vector< vector<DMatch> >& matches, int k,
+    const vector<GpuMat>& masks, bool compactResult)
+{
+    if (k == 2)
+    {
+        GpuMat trainCollection;
+        GpuMat maskCollection;
+
+        makeGpuCollection(trainCollection, maskCollection, masks);
+
+        GpuMat trainIdx, imgIdx, distance;
+
+        knnMatch2Collection(query, trainCollection, trainIdx, imgIdx, distance, maskCollection);
+        knnMatch2Download(trainIdx, imgIdx, distance, matches);
+    }
+    else
+    {
+        if (query.empty() || empty())
+            return;
+
+        vector< vector<DMatch> > curMatches;
+        vector<DMatch> temp;
+        temp.reserve(2 * k);
+
+        matches.resize(query.rows);
+        for_each(matches.begin(), matches.end(), bind2nd(mem_fun_ref(&vector<DMatch>::reserve), k));
+
+        for (size_t imgIdx = 0, size = trainDescCollection.size(); imgIdx < size; ++imgIdx)
+        {
+            knnMatch(query, trainDescCollection[imgIdx], curMatches, k, masks.empty() ? GpuMat() : masks[imgIdx]);
+
+            for (int queryIdx = 0; queryIdx < query.rows; ++queryIdx)
+            {
+                vector<DMatch>& localMatch = curMatches[queryIdx];
+                vector<DMatch>& globalMatch = matches[queryIdx];
+
+                for_each(localMatch.begin(), localMatch.end(), ImgIdxSetter(static_cast<int>(imgIdx)));
+
+                temp.clear();
+                merge(globalMatch.begin(), globalMatch.end(), localMatch.begin(), localMatch.end(), back_inserter(temp));
+
+                globalMatch.clear();
+                const size_t count = std::min((size_t)k, temp.size());
+                copy(temp.begin(), temp.begin() + count, back_inserter(globalMatch));
+            }
+        }
+
+        if (compactResult)
+        {
+            vector< vector<DMatch> >::iterator new_end = remove_if(matches.begin(), matches.end(), mem_fun_ref(&vector<DMatch>::empty));
+            matches.erase(new_end, matches.end());
+        }
+    }
+}
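For k != 2 the per-image results are merged with std::merge and truncated to k; as long as each per-image list is sorted by distance, this keeps the globally best k matches. A self-contained illustration with plain distances:

#include <algorithm>
#include <iterator>
#include <vector>

int main()
{
    const size_t k = 3;

    float prev[]  = { 0.1f, 0.5f, 0.9f };       // best distances so far (sorted)
    float local[] = { 0.2f, 0.7f };             // sorted distances from one more image

    std::vector<float> globalMatch(prev, prev + 3);

    std::vector<float> temp;
    std::merge(globalMatch.begin(), globalMatch.end(), local, local + 2, std::back_inserter(temp));

    // Keep only the k best of the merged, still-sorted sequence.
    globalMatch.assign(temp.begin(), temp.begin() + std::min(k, temp.size()));
    // globalMatch == { 0.1, 0.2, 0.5 }
    return 0;
}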
}
////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////
// RadiusMatch
// RadiusMatch
void
cv
::
gpu
::
BruteForceMatcher_GPU_base
::
radiusMatchSingle
(
const
GpuMat
&
queryDescs
,
const
GpuMat
&
trainDescs
,
void
cv
::
gpu
::
BruteForceMatcher_GPU_base
::
radiusMatchSingle
(
const
GpuMat
&
query
,
const
GpuMat
&
train
,
GpuMat
&
trainIdx
,
GpuMat
&
distance
,
GpuMat
&
nMatches
,
float
maxDistance
,
const
GpuMat
&
mask
,
Stream
&
stream
)
GpuMat
&
trainIdx
,
GpuMat
&
distance
,
GpuMat
&
nMatches
,
float
maxDistance
,
const
GpuMat
&
mask
,
Stream
&
stream
)
{
{
if
(
query
Descs
.
empty
()
||
trainDescs
.
empty
())
if
(
query
.
empty
()
||
train
.
empty
())
return
;
return
;
using
namespace
cv
::
gpu
::
bf_radius_match
;
using
namespace
cv
::
gpu
::
bf_radius_match
;
typedef
void
(
*
radiusMatch_
caller_t
)(
const
DevMem2D
&
query
,
const
DevMem2D
&
train
,
float
maxDistance
,
const
DevMem2D
&
mask
,
typedef
void
(
*
caller_t
)(
const
DevMem2D
&
query
,
const
DevMem2D
&
train
,
float
maxDistance
,
const
DevMem2D
&
mask
,
const
DevMem2D
&
trainIdx
,
const
DevMem2D
&
distance
,
const
DevMem2D
&
nMatches
,
const
DevMem2Di
&
trainIdx
,
const
DevMem2Df
&
distance
,
const
DevMem2D_
<
unsigned
int
>
&
nMatches
,
cudaStream_t
stream
);
int
cc
,
cudaStream_t
stream
);
static
const
radiusMatch_caller_t
radiusMatch_callers
[
3
][
8
]
=
static
const
caller_t
callers
[
3
][
6
]
=
{
{
{
{
radiusMatchSingleL1_gpu
<
unsigned
char
>
,
0
/*radiusMatchSingleL1_gpu<signed char>*/
,
radiusMatchSingleL1_gpu
<
unsigned
short
>
,
matchL1_gpu
<
unsigned
char
>
,
0
/*matchL1_gpu<signed char>*/
,
radiusMatchSingleL1_gpu
<
short
>
,
radiusMatchSingleL1_gpu
<
int
>
,
radiusMatchSingleL1_gpu
<
float
>
,
0
,
0
matchL1_gpu
<
unsigned
short
>
,
matchL1_gpu
<
short
>
,
matchL1_gpu
<
int
>
,
matchL1_gpu
<
float
>
},
},
{
{
0
/*radiusMatchSingleL2_gpu<unsigned char>*/
,
0
/*radiusMatchSingleL2_gpu<signed char>*/
,
0
/*radiusMatchSingleL2_gpu<unsigned short>*/
,
0
/*matchL2_gpu<unsigned char>*/
,
0
/*matchL2_gpu<signed char>*/
,
0
/*radiusMatchSingleL2_gpu<short>*/
,
0
/*radiusMatchSingleL2_gpu<int>*/
,
radiusMatchSingleL2_gpu
<
float
>
,
0
,
0
0
/*matchL2_gpu<unsigned short>*/
,
0
/*matchL2_gpu<short>*/
,
0
/*matchL2_gpu<int>*/
,
matchL2_gpu
<
float
>
},
},
{
{
radiusMatchSingleHamming_gpu
<
unsigned
char
>
,
0
/*radiusMatchSingleHamming_gpu<signed char>*/
,
radiusMatchSingleHamming_gpu
<
unsigned
short
>
,
matchHamming_gpu
<
unsigned
char
>
,
0
/*matchHamming_gpu<signed char>*/
,
0
/*radiusMatchSingleHamming_gpu<short>*/
,
radiusMatchSingleHamming_gpu
<
int
>
,
0
,
0
,
0
matchHamming_gpu
<
unsigned
short
>
,
0
/*matchHamming_gpu<short>*/
,
matchHamming_gpu
<
int
>
,
0
/*matchHamming_gpu<float>*/
}
}
};
};
CV_Assert
(
TargetArchs
::
builtWith
(
SHARED_ATOMICS
)
&&
DeviceInfo
().
supports
(
SHARED_ATOMICS
));
DeviceInfo
info
;
int
cc
=
info
.
majorVersion
()
*
10
+
info
.
minorVersion
();
const
int
nQuery
=
queryDescs
.
rows
;
CV_Assert
(
TargetArchs
::
builtWith
(
GLOBAL_ATOMICS
)
&&
info
.
supports
(
GLOBAL_ATOMICS
));
const
int
nTrain
=
trainDescs
.
rows
;
CV_Assert
(
queryDescs
.
channels
()
==
1
&&
queryDescs
.
depth
()
<
CV_64F
);
const
int
nQuery
=
query
.
rows
;
CV_Assert
(
trainDescs
.
type
()
==
queryDescs
.
type
()
&&
trainDescs
.
cols
==
queryDescs
.
cols
);
const
int
nTrain
=
train
.
rows
;
CV_Assert
(
query
.
channels
()
==
1
&&
query
.
depth
()
<
CV_64F
);
CV_Assert
(
train
.
type
()
==
query
.
type
()
&&
train
.
cols
==
query
.
cols
);
CV_Assert
(
trainIdx
.
empty
()
||
(
trainIdx
.
rows
==
nQuery
&&
trainIdx
.
size
()
==
distance
.
size
()));
CV_Assert
(
trainIdx
.
empty
()
||
(
trainIdx
.
rows
==
nQuery
&&
trainIdx
.
size
()
==
distance
.
size
()));
ensureSizeIsEnough
(
1
,
nQuery
,
CV_32SC1
,
nMatches
);
ensureSizeIsEnough
(
1
,
nQuery
,
CV_32SC1
,
nMatches
);
if
(
trainIdx
.
empty
())
if
(
trainIdx
.
empty
())
{
{
ensureSizeIsEnough
(
nQuery
,
nTrain
/
2
,
CV_32SC1
,
trainIdx
);
ensureSizeIsEnough
(
nQuery
,
std
::
max
((
nTrain
/
100
),
10
)
,
CV_32SC1
,
trainIdx
);
ensureSizeIsEnough
(
nQuery
,
nTrain
/
2
,
CV_32FC1
,
distance
);
ensureSizeIsEnough
(
nQuery
,
std
::
max
((
nTrain
/
100
),
10
)
,
CV_32FC1
,
distance
);
}
}
radiusMatch_caller_t
func
=
radiusMatch_callers
[
distType
][
queryDescs
.
depth
()];
caller_t
func
=
callers
[
distType
][
query
.
depth
()];
CV_Assert
(
func
!=
0
);
CV_Assert
(
func
!=
0
);
func
(
query
Descs
,
trainDescs
,
maxDistance
,
mask
,
trainIdx
,
distance
,
nMatches
,
StreamAccessor
::
getStream
(
stream
));
func
(
query
,
train
,
maxDistance
,
mask
,
trainIdx
,
distance
,
nMatches
,
cc
,
StreamAccessor
::
getStream
(
stream
));
}
}
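radiusMatch is the one path with a hard device requirement (global atomics), so a capability check before calling it is worthwhile; a sketch using the same TargetArchs/DeviceInfo queries the assertion above relies on (matcher, query, train as in the earlier examples; the 0.5f radius is an arbitrary illustration value):

using cv::gpu::TargetArchs;
using cv::gpu::DeviceInfo;

if (TargetArchs::builtWith(cv::gpu::GLOBAL_ATOMICS) && DeviceInfo().supports(cv::gpu::GLOBAL_ATOMICS))
{
    std::vector< std::vector<cv::DMatch> > matches;
    matcher.radiusMatch(query, train, matches, /*maxDistance=*/0.5f);
}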
 void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& distance, const GpuMat& nMatches,
     vector< vector<DMatch> >& matches, bool compactResult)
 {
     if (trainIdx.empty() || distance.empty() || nMatches.empty())
         return;
@@ -673,14 +822,14 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trai
 }
 void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx, const Mat& distance, const Mat& nMatches,
     vector< vector<DMatch> >& matches, bool compactResult)
 {
     if (trainIdx.empty() || distance.empty() || nMatches.empty())
         return;

     CV_Assert(trainIdx.type() == CV_32SC1);
-    CV_Assert(nMatches.type() == CV_32SC1 && nMatches.isContinuous() && nMatches.cols >= trainIdx.rows);
     CV_Assert(distance.type() == CV_32FC1 && distance.size() == trainIdx.size());
+    CV_Assert(nMatches.type() == CV_32SC1 && nMatches.cols == trainIdx.rows);

     const int nQuery = trainIdx.rows;
@@ -688,6 +837,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx
     matches.reserve(nQuery);

     const int* nMatches_ptr = nMatches.ptr<int>();

     for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx)
     {
         const int* trainIdx_ptr = trainIdx.ptr<int>(queryIdx);
@@ -720,66 +870,75 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx
     }
 }
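Because the result buffers are pre-sized (roughly nTrain/100 columns, at least 10), a query can find more neighbours than its row can hold, so the convert step has to clamp by the buffer width. A sketch of the consuming loop under the asserted layouts above (CV_32SC1 indices, CV_32FC1 distances, one count per query); the clamping line is the essential part:

const int nQuery = trainIdx.rows;
const int* nMatches_ptr = nMatches.ptr<int>();

for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx)
{
    const int*   trainIdx_ptr = trainIdx.ptr<int>(queryIdx);
    const float* distance_ptr = distance.ptr<float>(queryIdx);

    // The kernel counts every match within maxDistance,
    // but only trainIdx.cols of them were actually stored.
    const int nMatched = std::min(nMatches_ptr[queryIdx], trainIdx.cols);

    for (int i = 0; i < nMatched; ++i)
    {
        cv::DMatch m(queryIdx, trainIdx_ptr[i], 0, distance_ptr[i]);
        // ... append m to the per-query match list ...
    }
}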
-void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, const GpuMat& trainDescs,
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& query, const GpuMat& train,
     vector< vector<DMatch> >& matches, float maxDistance, const GpuMat& mask, bool compactResult)
 {
     GpuMat trainIdx, distance, nMatches;
-    radiusMatchSingle(queryDescs, trainDescs, trainIdx, distance, nMatches, maxDistance, mask);
+    radiusMatchSingle(query, train, trainIdx, distance, nMatches, maxDistance, mask);
     radiusMatchDownload(trainIdx, distance, nMatches, matches, compactResult);
 }
-void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& queryDescs, const GpuMat& trainCollection,
-    GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches,
-    float maxDistance,
-    const GpuMat& maskCollection, Stream& stream)
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchCollection(const GpuMat& query,
+    GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, GpuMat& nMatches,
+    float maxDistance,
+    const vector<GpuMat>& masks, Stream& stream)
 {
-    if (queryDescs.empty() || trainCollection.empty())
+    if (query.empty() || empty())
         return;

     using namespace cv::gpu::bf_radius_match;

-    typedef void (*radiusMatch_caller_t)(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_<PtrStep>& maskCollection,
-        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches,
-        cudaStream_t stream);
+    typedef void (*caller_t)(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks,
+        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
+        int cc, cudaStream_t stream);

-    static const radiusMatch_caller_t radiusMatch_callers[3][8] =
+    static const caller_t callers[3][6] =
     {
-        { radiusMatchCollectionL1_gpu<unsigned char>, 0/*radiusMatchCollectionL1_gpu<signed char>*/, radiusMatchCollectionL1_gpu<unsigned short>, radiusMatchCollectionL1_gpu<short>, radiusMatchCollectionL1_gpu<int>, radiusMatchCollectionL1_gpu<float>, 0, 0 },
-        { 0/*radiusMatchCollectionL2_gpu<unsigned char>*/, 0/*radiusMatchCollectionL2_gpu<signed char>*/, 0/*radiusMatchCollectionL2_gpu<unsigned short>*/, 0/*radiusMatchCollectionL2_gpu<short>*/, 0/*radiusMatchCollectionL2_gpu<int>*/, radiusMatchCollectionL2_gpu<float>, 0, 0 },
-        { radiusMatchCollectionHamming_gpu<unsigned char>, 0/*radiusMatchCollectionHamming_gpu<signed char>*/, radiusMatchCollectionHamming_gpu<unsigned short>, 0/*radiusMatchCollectionHamming_gpu<short>*/, radiusMatchCollectionHamming_gpu<int>, 0, 0, 0 }
+        { matchL1_gpu<unsigned char>, 0/*matchL1_gpu<signed char>*/, matchL1_gpu<unsigned short>, matchL1_gpu<short>, matchL1_gpu<int>, matchL1_gpu<float> },
+        { 0/*matchL2_gpu<unsigned char>*/, 0/*matchL2_gpu<signed char>*/, 0/*matchL2_gpu<unsigned short>*/, 0/*matchL2_gpu<short>*/, 0/*matchL2_gpu<int>*/, matchL2_gpu<float> },
+        { matchHamming_gpu<unsigned char>, 0/*matchHamming_gpu<signed char>*/, matchHamming_gpu<unsigned short>, 0/*matchHamming_gpu<short>*/, matchHamming_gpu<int>, 0/*matchHamming_gpu<float>*/ }
     };

-    CV_Assert(TargetArchs::builtWith(SHARED_ATOMICS) && DeviceInfo().supports(SHARED_ATOMICS));
-
-    const int nQuery = queryDescs.rows;
-
-    CV_Assert(queryDescs.channels() == 1 && queryDescs.depth() < CV_64F);
+    DeviceInfo info;
+    int cc = info.majorVersion() * 10 + info.minorVersion();
+
+    CV_Assert(TargetArchs::builtWith(GLOBAL_ATOMICS) && info.supports(GLOBAL_ATOMICS));
+
+    const int nQuery = query.rows;
+
+    CV_Assert(query.channels() == 1 && query.depth() < CV_64F);
     CV_Assert(trainIdx.empty() || (trainIdx.rows == nQuery && trainIdx.size() == distance.size() && trainIdx.size() == imgIdx.size()));

     ensureSizeIsEnough(1, nQuery, CV_32SC1, nMatches);
     if (trainIdx.empty())
     {
-        ensureSizeIsEnough(nQuery, nQuery / 2, CV_32SC1, trainIdx);
-        ensureSizeIsEnough(nQuery, nQuery / 2, CV_32SC1, imgIdx);
-        ensureSizeIsEnough(nQuery, nQuery / 2, CV_32FC1, distance);
+        ensureSizeIsEnough(nQuery, std::max((nQuery / 100), 10), CV_32SC1, trainIdx);
+        ensureSizeIsEnough(nQuery, std::max((nQuery / 100), 10), CV_32SC1, imgIdx);
+        ensureSizeIsEnough(nQuery, std::max((nQuery / 100), 10), CV_32FC1, distance);
     }

-    radiusMatch_caller_t func = radiusMatch_callers[distType][queryDescs.depth()];
+    caller_t func = callers[distType][query.depth()];
     CV_Assert(func != 0);

-    func(queryDescs, trainCollection, maxDistance, maskCollection, trainIdx, imgIdx, distance, nMatches, StreamAccessor::getStream(stream));
+    vector<DevMem2D> trains_(trainDescCollection.begin(), trainDescCollection.end());
+    vector<DevMem2D> masks_(masks.begin(), masks.end());
+
+    func(query, &trains_[0], static_cast<int>(trains_.size()), maxDistance, masks_.size() == 0 ? 0 : &masks_[0],
+        trainIdx, imgIdx, distance, nMatches, cc, StreamAccessor::getStream(stream));
 }
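With the collection variant the train images come from the matcher's own collection rather than a packed GpuMat; a sketch of the corresponding high-level call (the device descriptor matrices train0 and train1 are hypothetical placeholders):

cv::gpu::BruteForceMatcher_GPU< cv::L2<float> > matcher;

std::vector<cv::gpu::GpuMat> trains;
trains.push_back(train0);
trains.push_back(train1);
matcher.add(trains);

std::vector< std::vector<cv::DMatch> > matches;
matcher.radiusMatch(query, matches, 0.5f);   // searches every added image; DMatch::imgIdx says which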
 void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& imgIdx, const GpuMat& distance, const GpuMat& nMatches,
     vector< vector<DMatch> >& matches, bool compactResult)
 {
     if (trainIdx.empty() || imgIdx.empty() || distance.empty() || nMatches.empty())
         return;
@@ -801,7 +960,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx
     CV_Assert(trainIdx.type() == CV_32SC1);
     CV_Assert(imgIdx.type() == CV_32SC1 && imgIdx.size() == trainIdx.size());
     CV_Assert(distance.type() == CV_32FC1 && distance.size() == trainIdx.size());
-    CV_Assert(nMatches.type() == CV_32SC1 && nMatches.isContinuous() && nMatches.cols >= trainIdx.rows);
+    CV_Assert(nMatches.type() == CV_32SC1 && nMatches.cols == trainIdx.rows);

     const int nQuery = trainIdx.rows;
@@ -809,6 +968,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx
     matches.reserve(nQuery);

     const int* nMatches_ptr = nMatches.ptr<int>();

     for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx)
     {
         const int* trainIdx_ptr = trainIdx.ptr<int>(queryIdx);
@@ -843,18 +1003,11 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchConvert(const Mat& trainIdx
     }
 }

-void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, vector< vector<DMatch> >& matches,
+void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& query, vector< vector<DMatch> >& matches,
     float maxDistance, const vector<GpuMat>& masks, bool compactResult)
 {
-    GpuMat trainCollection;
-    GpuMat maskCollection;
-
-    makeGpuCollection(trainCollection, maskCollection, masks);
-
     GpuMat trainIdx, imgIdx, distance, nMatches;
-    radiusMatchCollection(queryDescs, trainCollection, trainIdx, imgIdx, distance, nMatches, maxDistance, maskCollection);
+    radiusMatchCollection(query, trainIdx, imgIdx, distance, nMatches, maxDistance, masks);
     radiusMatchDownload(trainIdx, imgIdx, distance, nMatches, matches, compactResult);
 }
modules/gpu/src/cuda/bf_knnmatch.cu
View file @ d3c4e907
@@ -49,153 +49,677 @@ using namespace cv::gpu::device;
 namespace cv { namespace gpu { namespace bf_knnmatch
 {
-    template <typename VecDiff, typename Dist, typename T, typename Mask>
-    __device__ void distanceCalcLoop(const PtrStep_<T>& query, const DevMem2D_<T>& train, const Mask& m, int queryIdx,
-        typename Dist::result_type& distMin1, typename Dist::result_type& distMin2, int& bestTrainIdx1, int& bestTrainIdx2,
-        typename Dist::result_type* smem)
-    {
-        const VecDiff vecDiff(query.ptr(queryIdx), train.cols, (typename Dist::value_type*)smem, threadIdx.y * blockDim.x + threadIdx.x, threadIdx.x);
-
-        typename Dist::result_type* sdiffRow = smem + blockDim.x * threadIdx.y;
-
-        distMin1 = numeric_limits<typename Dist::result_type>::max();
-        distMin2 = numeric_limits<typename Dist::result_type>::max();
-
-        bestTrainIdx1 = -1;
-        bestTrainIdx2 = -1;
-
-        for (int trainIdx = threadIdx.y; trainIdx < train.rows; trainIdx += blockDim.y)
-        {
-            if (m(queryIdx, trainIdx))
-            {
-                Dist dist;
-
-                const T* trainRow = train.ptr(trainIdx);
-
-                vecDiff.calc(trainRow, train.cols, dist, sdiffRow, threadIdx.x);
-
-                const typename Dist::result_type val = dist;
-
-                if (val < distMin1)
-                {
-                    distMin1 = val;
-                    bestTrainIdx1 = trainIdx;
-                }
-                else if (val < distMin2)
-                {
-                    distMin2 = val;
-                    bestTrainIdx2 = trainIdx;
-                }
-            }
-        }
-    }
-
-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename VecDiff, typename Dist, typename T, typename Mask>
-    __global__ void knnMatch2(const PtrStep_<T> query, const DevMem2D_<T> train, const Mask m, int2* trainIdx, float2* distance)
-    {
-        typedef typename Dist::result_type result_type;
-        typedef typename Dist::value_type value_type;
-
-        __shared__ result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y];
-
-        const int queryIdx = blockIdx.x;
-
-        result_type distMin1;
-        result_type distMin2;
-        int bestTrainIdx1;
-        int bestTrainIdx2;
-
-        distanceCalcLoop<VecDiff, Dist>(query, train, m, queryIdx, distMin1, distMin2, bestTrainIdx1, bestTrainIdx2, smem);
-        __syncthreads();
-
-        volatile result_type* sdistMinRow = smem;
-        volatile int* sbestTrainIdxRow = (int*)(sdistMinRow + 2 * BLOCK_DIM_Y);
-
-        if (threadIdx.x == 0)
-        {
-            sdistMinRow[threadIdx.y] = distMin1;
-            sdistMinRow[threadIdx.y + BLOCK_DIM_Y] = distMin2;
-
-            sbestTrainIdxRow[threadIdx.y] = bestTrainIdx1;
-            sbestTrainIdxRow[threadIdx.y + BLOCK_DIM_Y] = bestTrainIdx2;
-        }
-        __syncthreads();
-
-        if (threadIdx.x == 0 && threadIdx.y == 0)
-        {
-            distMin1 = numeric_limits<result_type>::max();
-            distMin2 = numeric_limits<result_type>::max();
-
-            bestTrainIdx1 = -1;
-            bestTrainIdx2 = -1;
-
-            #pragma unroll
-            for (int i = 0; i < BLOCK_DIM_Y; ++i)
-            {
-                result_type val = sdistMinRow[i];
-
-                if (val < distMin1)
-                {
-                    distMin1 = val;
-                    bestTrainIdx1 = sbestTrainIdxRow[i];
-                }
-                else if (val < distMin2)
-                {
-                    distMin2 = val;
-                    bestTrainIdx2 = sbestTrainIdxRow[i];
-                }
-            }
-
-            #pragma unroll
-            for (int i = BLOCK_DIM_Y; i < 2 * BLOCK_DIM_Y; ++i)
-            {
-                result_type val = sdistMinRow[i];
-
-                if (val < distMin2)
-                {
-                    distMin2 = val;
-                    bestTrainIdx2 = sbestTrainIdxRow[i];
-                }
-            }
-
-            trainIdx[queryIdx] = make_int2(bestTrainIdx1, bestTrainIdx2);
-            distance[queryIdx] = make_float2(distMin1, distMin2);
-        }
-    }
-
-    ///////////////////////////////////////////////////////////////////////////////
-    // Knn 2 Match kernel caller
-
-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
-    void knnMatch2Simple_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
-        const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance,
-        cudaStream_t stream)
-    {
-        const dim3 grid(query.rows, 1, 1);
-        const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
-
-        knnMatch2<BLOCK_DIM_X, BLOCK_DIM_Y, VecDiffGlobal<BLOCK_DIM_X, T>, Dist, T>
-            <<<grid, threads, 0, stream>>>(query, train, mask, trainIdx, distance);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
-            cudaSafeCall( cudaDeviceSynchronize() );
-    }
-
-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T, typename Mask>
-    void knnMatch2Cached_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
-        const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance,
-        cudaStream_t stream)
-    {
-        StaticAssert<BLOCK_DIM_X * BLOCK_DIM_Y >= MAX_LEN>::check(); // block size must be greter than descriptors length
-        StaticAssert<MAX_LEN % BLOCK_DIM_X == 0>::check();           // max descriptors length must divide to blockDimX
-
-        const dim3 grid(query.rows, 1, 1);
-        const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
-
-        knnMatch2<BLOCK_DIM_X, BLOCK_DIM_Y, VecDiffCachedRegister<BLOCK_DIM_X, MAX_LEN, LEN_EQ_MAX_LEN, typename Dist::value_type>, Dist, T>
-            <<<grid, threads, 0, stream>>>(query, train, mask, trainIdx.data, distance.data);
-        cudaSafeCall( cudaGetLastError() );
-
-        if (stream == 0)
+    ///////////////////////////////////////////////////////////////////////////////
+    // Reduction
+
+    template <int BLOCK_SIZE>
+    __device__ void findBestMatch(float& bestDistance1, float& bestDistance2,
+                                  int& bestTrainIdx1, int& bestTrainIdx2,
+                                  float* s_distance, int* s_trainIdx)
+    {
+        float myBestDistance1 = numeric_limits<float>::max();
+        float myBestDistance2 = numeric_limits<float>::max();
+        int myBestTrainIdx1 = -1;
+        int myBestTrainIdx2 = -1;
+
+        s_distance += threadIdx.y * BLOCK_SIZE;
+        s_trainIdx += threadIdx.y * BLOCK_SIZE;
+
+        s_distance[threadIdx.x] = bestDistance1;
+        s_trainIdx[threadIdx.x] = bestTrainIdx1;
+
+        __syncthreads();
+
+        if (threadIdx.x == 0)
+        {
+            #pragma unroll
+            for (int i = 0; i < BLOCK_SIZE; ++i)
+            {
+                float val = s_distance[i];
+
+                if (val < myBestDistance1)
+                {
+                    myBestDistance2 = myBestDistance1;
+                    myBestTrainIdx2 = myBestTrainIdx1;
+
+                    myBestDistance1 = val;
+                    myBestTrainIdx1 = s_trainIdx[i];
+                }
+                else if (val < myBestDistance2)
+                {
+                    myBestDistance2 = val;
+                    myBestTrainIdx2 = s_trainIdx[i];
+                }
+            }
+        }
+
+        __syncthreads();
+
+        s_distance[threadIdx.x] = bestDistance2;
+        s_trainIdx[threadIdx.x] = bestTrainIdx2;
+
+        __syncthreads();
+
+        if (threadIdx.x == 0)
+        {
+            #pragma unroll
+            for (int i = 0; i < BLOCK_SIZE; ++i)
+            {
+                float val = s_distance[i];
+
+                if (val < myBestDistance2)
+                {
+                    myBestDistance2 = val;
+                    myBestTrainIdx2 = s_trainIdx[i];
+                }
+            }
+        }
+
+        bestDistance1 = myBestDistance1;
+        bestDistance2 = myBestDistance2;
+
+        bestTrainIdx1 = myBestTrainIdx1;
+        bestTrainIdx2 = myBestTrainIdx2;
+    }
+
+    template <int BLOCK_SIZE>
+    __device__ void findBestMatch(float& bestDistance1, float& bestDistance2,
+                                  int& bestTrainIdx1, int& bestTrainIdx2,
+                                  int& bestImgIdx1, int& bestImgIdx2,
+                                  float* s_distance, int* s_trainIdx, int* s_imgIdx)
+    {
+        float myBestDistance1 = numeric_limits<float>::max();
+        float myBestDistance2 = numeric_limits<float>::max();
+        int myBestTrainIdx1 = -1;
+        int myBestTrainIdx2 = -1;
+        int myBestImgIdx1 = -1;
+        int myBestImgIdx2 = -1;
+
+        s_distance += threadIdx.y * BLOCK_SIZE;
+        s_trainIdx += threadIdx.y * BLOCK_SIZE;
+        s_imgIdx   += threadIdx.y * BLOCK_SIZE;
+
+        s_distance[threadIdx.x] = bestDistance1;
+        s_trainIdx[threadIdx.x] = bestTrainIdx1;
+        s_imgIdx[threadIdx.x]   = bestImgIdx1;
+
+        __syncthreads();
+
+        if (threadIdx.x == 0)
+        {
+            #pragma unroll
+            for (int i = 0; i < BLOCK_SIZE; ++i)
+            {
+                float val = s_distance[i];
+
+                if (val < myBestDistance1)
+                {
+                    myBestDistance2 = myBestDistance1;
+                    myBestTrainIdx2 = myBestTrainIdx1;
+                    myBestImgIdx2   = myBestImgIdx1;
+
+                    myBestDistance1 = val;
+                    myBestTrainIdx1 = s_trainIdx[i];
+                    myBestImgIdx1   = s_imgIdx[i];
+                }
+                else if (val < myBestDistance2)
+                {
+                    myBestDistance2 = val;
+                    myBestTrainIdx2 = s_trainIdx[i];
+                    myBestImgIdx2   = s_imgIdx[i];
+                }
+            }
+        }
+
+        __syncthreads();
+
+        s_distance[threadIdx.x] = bestDistance2;
+        s_trainIdx[threadIdx.x] = bestTrainIdx2;
+        s_imgIdx[threadIdx.x]   = bestImgIdx2;
+
+        __syncthreads();
+
+        if (threadIdx.x == 0)
+        {
+            #pragma unroll
+            for (int i = 0; i < BLOCK_SIZE; ++i)
+            {
+                float val = s_distance[i];
+
+                if (val < myBestDistance2)
+                {
+                    myBestDistance2 = val;
+                    myBestTrainIdx2 = s_trainIdx[i];
+                    myBestImgIdx2   = s_imgIdx[i];
+                }
+            }
+        }
+
+        bestDistance1 = myBestDistance1;
+        bestDistance2 = myBestDistance2;
+
+        bestTrainIdx1 = myBestTrainIdx1;
+        bestTrainIdx2 = myBestTrainIdx2;
+
+        bestImgIdx1 = myBestImgIdx1;
+        bestImgIdx2 = myBestImgIdx2;
+    }
+
+    ///////////////////////////////////////////////////////////////////////////////
+    // Match Unrolled Cached
+
+    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U>
+    __device__ void loadQueryToSmem(int queryIdx, const DevMem2D_<T>& query, U* s_query)
+    {
+        #pragma unroll
+        for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
+        {
+            const int loadX = threadIdx.x + i * BLOCK_SIZE;
+            s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(min(queryIdx, query.rows - 1))[loadX] : 0;
+        }
+    }
+
+    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+    __device__ void loopUnrolledCached(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask,
+                                       typename Dist::value_type* s_query, typename Dist::value_type* s_train,
+                                       float& bestDistance1, float& bestDistance2,
+                                       int& bestTrainIdx1, int& bestTrainIdx2,
+                                       int& bestImgIdx1, int& bestImgIdx2)
+    {
+        for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
+        {
+            Dist dist;
+
+            #pragma unroll
+            for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
+            {
+                const int loadX = threadIdx.x + i * BLOCK_SIZE;
+
+                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = loadX < train.cols ? train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX] : 0;
+
+                __syncthreads();
+
+                #pragma unroll
+                for (int j = 0; j < BLOCK_SIZE; ++j)
+                    dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
+
+                __syncthreads();
+            }
+
+            typename Dist::result_type distVal = dist;
+
+            const int trainIdx = t * BLOCK_SIZE + threadIdx.x;
+
+            if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
+            {
+                if (distVal < bestDistance1)
+                {
+                    bestImgIdx2   = bestImgIdx1;
+                    bestDistance2 = bestDistance1;
+                    bestTrainIdx2 = bestTrainIdx1;
+
+                    bestImgIdx1   = imgIdx;
+                    bestDistance1 = distVal;
+                    bestTrainIdx1 = trainIdx;
+                }
+                else if (distVal < bestDistance2)
+                {
+                    bestImgIdx2   = imgIdx;
+                    bestDistance2 = distVal;
+                    bestTrainIdx2 = trainIdx;
+                }
+            }
+        }
+    }
+
+    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+    __global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
+    {
+        extern __shared__ int smem[];
+
+        const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
+
+        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
+        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);
+
+        loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);
+
+        float myBestDistance1 = numeric_limits<float>::max();
+        float myBestDistance2 = numeric_limits<float>::max();
+        int myBestTrainIdx1 = -1;
+        int myBestTrainIdx2 = -1;
+
+        loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);
+
+        __syncthreads();
+
+        float* s_distance = (float*)(smem);
+        int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+
+        findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);
+
+        if (queryIdx < query.rows && threadIdx.x == 0)
+        {
+            bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
+            bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
+        }
+    }
+
+    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+    void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
+                             const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance,
+                             cudaStream_t stream)
+    {
+        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
+        const dim3 grid(divUp(query.rows, BLOCK_SIZE));
+
+        const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+        matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+
+    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+    __global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
+    {
+        extern __shared__ int smem[];
+
+        const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
+
+        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
+        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);
+
+        loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);
+
+        float myBestDistance1 = numeric_limits<float>::max();
+        float myBestDistance2 = numeric_limits<float>::max();
+        int myBestTrainIdx1 = -1;
+        int myBestTrainIdx2 = -1;
+        int myBestImgIdx1 = -1;
+        int myBestImgIdx2 = -1;
+
+        Mask m = mask;
+
+        for (int imgIdx = 0; imgIdx < n; ++imgIdx)
+        {
+            const DevMem2D_<T> train = trains[imgIdx];
+            m.next();
+            loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
+        }
+
+        __syncthreads();
+
+        float* s_distance = (float*)(smem);
+        int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+        int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);
+
+        findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);
+
+        if (queryIdx < query.rows && threadIdx.x == 0)
+        {
+            bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
+            bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
+            bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
+        }
+    }
+
+    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+    void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
+                             const DevMem2D_<int2>& trainIdx, const DevMem2D_<int2>& imgIdx, const DevMem2D_<float2>& distance,
+                             cudaStream_t stream)
+    {
+        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
+        const dim3 grid(divUp(query.rows, BLOCK_SIZE));
+
+        const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+        matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+
+    ///////////////////////////////////////////////////////////////////////////////
+    // Match Unrolled
+
+    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+    __device__ void loopUnrolled(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask,
+                                 typename Dist::value_type* s_query, typename Dist::value_type* s_train,
+                                 float& bestDistance1, float& bestDistance2,
+                                 int& bestTrainIdx1, int& bestTrainIdx2,
+                                 int& bestImgIdx1, int& bestImgIdx2)
+    {
+        for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
+        {
+            Dist dist;
+
+            #pragma unroll
+            for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
+            {
+                const int loadX = threadIdx.x + i * BLOCK_SIZE;
+
+                if (loadX < query.cols)
+                {
+                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(min(queryIdx, query.rows - 1))[loadX];
+                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
+                }
+                else
+                {
+                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
+                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
+                }
+
+                __syncthreads();
+
+                #pragma unroll
+                for (int j = 0; j < BLOCK_SIZE; ++j)
+                    dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
+
+                __syncthreads();
+            }
+
+            typename Dist::result_type distVal = dist;
+
+            const int trainIdx = t * BLOCK_SIZE + threadIdx.x;
+
+            if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
+            {
+                if (distVal < bestDistance1)
+                {
+                    bestImgIdx2   = bestImgIdx1;
+                    bestDistance2 = bestDistance1;
+                    bestTrainIdx2 = bestTrainIdx1;
+
+                    bestImgIdx1   = imgIdx;
+                    bestDistance1 = distVal;
+                    bestTrainIdx1 = trainIdx;
+                }
+                else if (distVal < bestDistance2)
+                {
+                    bestImgIdx2   = imgIdx;
+                    bestDistance2 = distVal;
+                    bestTrainIdx2 = trainIdx;
+                }
+            }
+        }
+    }
+
+    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+    __global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
+    {
+        extern __shared__ int smem[];
+
+        const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
+
+        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
+        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+
+        float myBestDistance1 = numeric_limits<float>::max();
+        float myBestDistance2 = numeric_limits<float>::max();
+        int myBestTrainIdx1 = -1;
+        int myBestTrainIdx2 = -1;
+
+        loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);
+
+        __syncthreads();
+
+        float* s_distance = (float*)(smem);
+        int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+
+        findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);
+
+        if (queryIdx < query.rows && threadIdx.x == 0)
+        {
+            bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
+            bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
+        }
+    }
+
+    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+    void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
+                       const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance,
+                       cudaStream_t stream)
+    {
+        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
+        const dim3 grid(divUp(query.rows, BLOCK_SIZE));
+
+        const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+        matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+
+    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+    __global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
+    {
+        extern __shared__ int smem[];
+
+        const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
+
+        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
+        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+
+        float myBestDistance1 = numeric_limits<float>::max();
+        float myBestDistance2 = numeric_limits<float>::max();
+        int myBestTrainIdx1 = -1;
+        int myBestTrainIdx2 = -1;
+        int myBestImgIdx1 = -1;
+        int myBestImgIdx2 = -1;
+
+        Mask m = mask;
+
+        for (int imgIdx = 0; imgIdx < n; ++imgIdx)
+        {
+            const DevMem2D_<T> train = trains[imgIdx];
+            m.next();
+            loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
+        }
+
+        __syncthreads();
+
+        float* s_distance = (float*)(smem);
+        int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+        int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);
+
+        findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);
+
+        if (queryIdx < query.rows && threadIdx.x == 0)
+        {
+            bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
+            bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
+            bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
+        }
+    }
+
+    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+    void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
+                       const DevMem2D_<int2>& trainIdx, const DevMem2D_<int2>& imgIdx, const DevMem2D_<float2>& distance,
+                       cudaStream_t stream)
+    {
+        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
+        const dim3 grid(divUp(query.rows, BLOCK_SIZE));
+
+        const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+        matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+
+    ///////////////////////////////////////////////////////////////////////////////
+    // Match
+
+    template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+    __device__ void loop(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask,
+                         typename Dist::value_type* s_query, typename Dist::value_type* s_train,
+                         float& bestDistance1, float& bestDistance2,
+                         int& bestTrainIdx1, int& bestTrainIdx2,
+                         int& bestImgIdx1, int& bestImgIdx2)
+    {
+        for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
+        {
+            Dist dist;
+
+            for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
+            {
+                const int loadX = threadIdx.x + i * BLOCK_SIZE;
+
+                if (loadX < query.cols)
+                {
+                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(min(queryIdx, query.rows - 1))[loadX];
+                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
+                }
+                else
+                {
+                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
+                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
+                }
+
+                __syncthreads();
+
+                #pragma unroll
+                for (int j = 0; j < BLOCK_SIZE; ++j)
+                    dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);
+
+                __syncthreads();
+            }
+
+            typename Dist::result_type distVal = dist;
+
+            const int trainIdx = t * BLOCK_SIZE + threadIdx.x;
+
+            if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx))
+            {
+                if (distVal < bestDistance1)
+                {
+                    bestImgIdx2   = bestImgIdx1;
+                    bestDistance2 = bestDistance1;
+                    bestTrainIdx2 = bestTrainIdx1;
+
+                    bestImgIdx1   = imgIdx;
+                    bestDistance1 = distVal;
+                    bestTrainIdx1 = trainIdx;
+                }
+                else if (distVal < bestDistance2)
+                {
+                    bestImgIdx2   = imgIdx;
+                    bestDistance2 = distVal;
+                    bestTrainIdx2 = trainIdx;
+                }
+            }
+        }
+    }
+
+    template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+    __global__ void match(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
+    {
+        extern __shared__ int smem[];
+
+        const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
+
+        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
+        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+
+        float myBestDistance1 = numeric_limits<float>::max();
+        float myBestDistance2 = numeric_limits<float>::max();
+        int myBestTrainIdx1 = -1;
+        int myBestTrainIdx2 = -1;
+
+        loop<BLOCK_SIZE, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestTrainIdx1, myBestTrainIdx2);
+
+        __syncthreads();
+
+        float* s_distance = (float*)(smem);
+        int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+
+        findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, s_distance, s_trainIdx);
+
+        if (queryIdx < query.rows && threadIdx.x == 0)
+        {
+            bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
+            bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
+        }
+    }
+
+    template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+    void match(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
+               const DevMem2D_<int2>& trainIdx, const DevMem2D_<float2>& distance,
+               cudaStream_t stream)
+    {
+        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
+        const dim3 grid(divUp(query.rows, BLOCK_SIZE));
+
+        const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+        match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
+    }
+
+    template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+    __global__ void match(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
+    {
+        extern __shared__ int smem[];
+
+        const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;
+
+        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
+        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+
+        float myBestDistance1 = numeric_limits<float>::max();
+        float myBestDistance2 = numeric_limits<float>::max();
+        int myBestTrainIdx1 = -1;
+        int myBestTrainIdx2 = -1;
+        int myBestImgIdx1 = -1;
+        int myBestImgIdx2 = -1;
+
+        Mask m = mask;
+
+        for (int imgIdx = 0; imgIdx < n; ++imgIdx)
+        {
+            const DevMem2D_<T> train = trains[imgIdx];
+            m.next();
+            loop<BLOCK_SIZE, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2);
+        }
+
+        __syncthreads();
+
+        float* s_distance = (float*)(smem);
+        int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
+        int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);
+
+        findBestMatch<BLOCK_SIZE>(myBestDistance1, myBestDistance2, myBestTrainIdx1, myBestTrainIdx2, myBestImgIdx1, myBestImgIdx2, s_distance, s_trainIdx, s_imgIdx);
+
+        if (queryIdx < query.rows && threadIdx.x == 0)
+        {
+            bestTrainIdx[queryIdx] = make_int2(myBestTrainIdx1, myBestTrainIdx2);
+            bestImgIdx[queryIdx] = make_int2(myBestImgIdx1, myBestImgIdx2);
+            bestDistance[queryIdx] = make_float2(myBestDistance1, myBestDistance2);
+        }
+    }
+
+    template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+    void match(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
+               const DevMem2D_<int2>& trainIdx, const DevMem2D_<int2>& imgIdx, const DevMem2D_<float2>& distance,
+               cudaStream_t stream)
+    {
+        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
+        const dim3 grid(divUp(query.rows, BLOCK_SIZE));
+
+        const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
+
+        match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
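The smemSize expressions above decide whether the cached-query layout fits in shared memory; a worked example of the arithmetic for the configuration the dispatcher below picks for 64-wide descriptors (BLOCK_SIZE = 16, MAX_DESC_LEN = 64), as a small standalone sketch:

#include <cstdio>

int main()
{
    const int BLOCK_SIZE   = 16;
    const int MAX_DESC_LEN = 64;

    // matchUnrolledCached keeps one BLOCK_SIZE x MAX_DESC_LEN query tile resident,
    // plus one BLOCK_SIZE x BLOCK_SIZE train tile, all in 4-byte elements.
    const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE)
                             + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    std::printf("%zu bytes\n", smemSize);   // (16*64 + 16*16) * 4 = 5120 bytes
    return 0;
}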
...
@@ -203,142 +727,254 @@ namespace cv { namespace gpu { namespace bf_knnmatch
...
@@ -203,142 +727,254 @@ namespace cv { namespace gpu { namespace bf_knnmatch
    }

    ///////////////////////////////////////////////////////////////////////////////
    // knnMatch 2 dispatcher

    template <typename Dist, typename T, typename Mask>
    void match2Dispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
        const DevMem2D& trainIdx, const DevMem2D& distance,
        int cc, cudaStream_t stream)
    {
        if (query.cols <= 64)
        {
            matchUnrolledCached<16, 64, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance), stream);
        }
        else if (query.cols <= 128)
        {
            matchUnrolledCached<16, 128, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance), stream);
        }
        else if (query.cols <= 256)
        {
            matchUnrolled<16, 256, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance), stream);
        }
        else if (query.cols <= 512)
        {
            matchUnrolled<16, 512, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance), stream);
        }
        else if (query.cols <= 1024)
        {
            matchUnrolled<16, 1024, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance), stream);
        }
        else
        {
            match<16, Dist>(query, train, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<float2> >(distance), stream);
        }
    }

    template <typename Dist, typename T, typename Mask>
    void match2Dispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
        int cc, cudaStream_t stream)
    {
        if (query.cols <= 64)
        {
            matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> >(distance), stream);
        }
        else if (query.cols <= 128)
        {
            matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> >(distance), stream);
        }
        else if (query.cols <= 256)
        {
            matchUnrolled<16, 256, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> >(distance), stream);
        }
        else if (query.cols <= 512)
        {
            matchUnrolled<16, 512, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> >(distance), stream);
        }
        else if (query.cols <= 1024)
        {
            matchUnrolled<16, 1024, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> >(distance), stream);
        }
        else
        {
            match<16, Dist>(query, trains, n, mask, static_cast< DevMem2D_<int2> >(trainIdx), static_cast< DevMem2D_<int2> >(imgIdx), static_cast< DevMem2D_<float2> >(distance), stream);
        }
    }
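The dispatchers above bucket the descriptor width into the next supported power of two, so e.g. 100-dimensional descriptors run the 128-wide unrolled kernel. An illustrative host-side restatement of that choice (the helper name is hypothetical, not part of the commit):

    // Hypothetical helper restating the dispatch rule; 0 means "use the generic kernel".
    static int chooseUnrollWidth(int descCols)
    {
        const int widths[] = { 64, 128, 256, 512, 1024 };
        for (int i = 0; i < 5; ++i)
            if (descCols <= widths[i])
                return widths[i];
        return 0;
    }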
    ///////////////////////////////////////////////////////////////////////////////
    // Calc distance kernel

    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
    __global__ void calcDistanceUnrolled(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, PtrStepf allDist)
    {
        extern __shared__ int smem[];

        const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
        const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;

        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

        Dist dist;

        #pragma unroll
        for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
        {
            const int loadX = threadIdx.x + i * BLOCK_SIZE;

            if (loadX < query.cols)
            {
                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(min(queryIdx, query.rows - 1))[loadX];
                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
            }
            else
            {
                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
            }

            __syncthreads();

            #pragma unroll
            for (int j = 0; j < BLOCK_SIZE; ++j)
                dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

            __syncthreads();
        }

        if (queryIdx < query.rows && trainIdx < train.rows)
        {
            float distVal = numeric_limits<float>::max();

            if (mask(queryIdx, trainIdx))
                distVal = (typename Dist::result_type)dist;

            allDist.ptr(queryIdx)[trainIdx] = distVal;
        }
    }

    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
    void calcDistanceUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream)
    {
        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
        const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));

        const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

        calcDistanceUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
    __global__ void calcDistance(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, PtrStepf allDist)
    {
        extern __shared__ int smem[];

        const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
        const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;

        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

        Dist dist;

        for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
        {
            const int loadX = threadIdx.x + i * BLOCK_SIZE;

            if (loadX < query.cols)
            {
                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(min(queryIdx, query.rows - 1))[loadX];
                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
            }
            else
            {
                s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
            }

            __syncthreads();

            #pragma unroll
            for (int j = 0; j < BLOCK_SIZE; ++j)
                dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

            __syncthreads();
        }

        if (queryIdx < query.rows && trainIdx < train.rows)
        {
            float distVal = numeric_limits<float>::max();

            if (mask(queryIdx, trainIdx))
                distVal = (typename Dist::result_type)dist;

            allDist.ptr(queryIdx)[trainIdx] = distVal;
        }
    }

    template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
    void calcDistance(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask, const DevMem2Df& allDist, cudaStream_t stream)
    {
        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
        const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));

        const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

        calcDistance<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, allDist);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
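For reference, a minimal CPU analog of what these kernels compute, assuming a squared-L2 distance functor; it fills the same query.rows x train.rows matrix (masking, which sets rejected pairs to FLT_MAX, is omitted here):

    #include <vector>

    // CPU sketch of the allDist matrix: allDist[q][t] = dist(query row q, train row t).
    // Assumes squared L2; the GPU version accumulates the same sum via Dist::reduceIter.
    static void calcDistanceRef(const std::vector<std::vector<float> >& query,
                                const std::vector<std::vector<float> >& train,
                                std::vector<std::vector<float> >& allDist)
    {
        allDist.assign(query.size(), std::vector<float>(train.size(), 0.f));
        for (size_t q = 0; q < query.size(); ++q)
            for (size_t t = 0; t < train.size(); ++t)
                for (size_t j = 0; j < query[q].size(); ++j)
                {
                    const float d = query[q][j] - train[t][j];
                    allDist[q][t] += d * d;
                }
    }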
    ///////////////////////////////////////////////////////////////////////////////
    // Calc Distance dispatcher

    template <typename Dist, typename T, typename Mask>
    void calcDistanceDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
        const DevMem2Df& allDist,
        int cc, cudaStream_t stream)
    {
        if (query.cols <= 64)
        {
            calcDistanceUnrolled<16, 64, Dist>(query, train, mask, allDist, stream);
        }
        else if (query.cols <= 128)
        {
            calcDistanceUnrolled<16, 128, Dist>(query, train, mask, allDist, stream);
        }
        else if (query.cols <= 256)
        {
            calcDistanceUnrolled<16, 256, Dist>(query, train, mask, allDist, stream);
        }
        else if (query.cols <= 512)
        {
            calcDistanceUnrolled<16, 512, Dist>(query, train, mask, allDist, stream);
        }
        else if (query.cols <= 1024)
        {
            calcDistanceUnrolled<16, 1024, Dist>(query, train, mask, allDist, stream);
        }
        else
        {
            calcDistance<16, Dist>(query, train, mask, allDist, stream);
        }
    }
    ///////////////////////////////////////////////////////////////////////////////
    // find knn match kernel

    template <int BLOCK_SIZE>
    __global__ void findBestMatch(DevMem2Df allDist, int i, PtrStepi trainIdx, PtrStepf distance)
    {
        const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;
        __shared__ float s_dist[SMEM_SIZE];
        __shared__ int s_trainIdx[SMEM_SIZE];

        const int queryIdx = blockIdx.x;

        float* allDistRow = allDist.ptr(queryIdx);

        float dist = numeric_limits<float>::max();
        int bestIdx = -1;

        for (int i = threadIdx.x; i < allDist.cols; i += BLOCK_SIZE)
        {
            float reg = allDistRow[i];
            if (reg < dist)
            {
                dist = reg;
    ...
@@ -346,34 +982,32 @@ namespace cv { namespace gpu { namespace bf_knnmatch
            }
        }

        s_dist[threadIdx.x] = dist;
        s_trainIdx[threadIdx.x] = bestIdx;
        __syncthreads();

        reducePredVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<volatile float>());

        if (threadIdx.x == 0)
        {
            if (dist < numeric_limits<float>::max())
            {
                allDistRow[bestIdx] = numeric_limits<float>::max();
                trainIdx.ptr(queryIdx)[i] = bestIdx;
                distance.ptr(queryIdx)[i] = dist;
            }
        }
    }
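findBestMatch is launched once per neighbour (see findKnnMatch below): each pass takes the row minimum, records it as the i-th match, and masks it out with FLT_MAX so the next pass finds the runner-up. A CPU sketch of the same k-pass selection over one query's distance row:

    #include <limits>

    // CPU analog of k passes of findBestMatch over one query's distance row.
    static void findKnnRef(float* allDistRow, int cols, int k, int* trainIdx, float* distance)
    {
        for (int i = 0; i < k; ++i)
        {
            float best = std::numeric_limits<float>::max();
            int bestIdx = -1;
            for (int t = 0; t < cols; ++t)
                if (allDistRow[t] < best) { best = allDistRow[t]; bestIdx = t; }
            if (bestIdx < 0)
                break;
            allDistRow[bestIdx] = std::numeric_limits<float>::max(); // mask out for the next pass
            trainIdx[i] = bestIdx;
            distance[i] = best;
        }
    }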
    ///////////////////////////////////////////////////////////////////////////////
    // find knn match kernel caller

    template <int BLOCK_SIZE>
    void findKnnMatch(int k, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist, cudaStream_t stream)
    {
        const dim3 block(BLOCK_SIZE, 1, 1);
        const dim3 grid(trainIdx.rows, 1, 1);

        for (int i = 0; i < k; ++i)
        {
            findBestMatch<BLOCK_SIZE><<<grid, block, 0, stream>>>(allDist, i, trainIdx, distance);
            cudaSafeCall( cudaGetLastError() );
        }
    ...
@@ -381,84 +1015,130 @@ namespace cv { namespace gpu { namespace bf_knnmatch
        cudaSafeCall( cudaDeviceSynchronize() );
    }

    void findKnnMatchDispatcher(int k, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream)
    {
        findKnnMatch<256>(k, static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Df>(distance), allDist, stream);
    }

    ///////////////////////////////////////////////////////////////////////////////
    // knn match Dispatcher

    template <typename Dist, typename T, typename Mask>
    void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, int k, const Mask& mask,
        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist,
        int cc, cudaStream_t stream)
    {
        if (k == 2)
        {
            match2Dispatcher<Dist>(query, train, mask, trainIdx, distance, cc, stream);
        }
        else
        {
            calcDistanceDispatcher<Dist>(query, train, mask, allDist, cc, stream);
            findKnnMatchDispatcher(k, trainIdx, distance, allDist, cc, stream);
        }
    }
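The k == 2 branch matters for memory as much as speed: the general path materializes the full distance matrix, while the fused 2-NN kernels track both candidates in registers. A quick size check, with hypothetical input sizes:

    #include <cstddef>

    // allDist footprint for the general k-NN path: 10000 query x 50000 train
    // descriptors need 10000 * 50000 * 4 bytes, roughly 1.9 GB. The k == 2 fast
    // path allocates no such buffer, which is why it is dispatched separately.
    const std::size_t allDistBytes = std::size_t(10000) * 50000 * sizeof(float);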
    ///////////////////////////////////////////////////////////////////////////////
    // knn match caller

    template <typename T> void matchL1_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist,
        int cc, cudaStream_t stream)
    {
        if (mask.data)
            matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
        else
            matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
    }

    template void matchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
    //template void matchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
    template void matchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
    template void matchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
    template void matchL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
    template void matchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);

    template <typename T> void matchL2_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist,
        int cc, cudaStream_t stream)
    {
        if (mask.data)
            matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
        else
            matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
    }

    //template void matchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
    //template void matchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
    //template void matchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
    //template void matchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
    //template void matchL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
    template void matchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);

    template <typename T> void matchHamming_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist,
        int cc, cudaStream_t stream)
    {
        if (mask.data)
            matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, SingleMask(mask), trainIdx, distance, allDist, cc, stream);
        else
            matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), k, WithOutMask(), trainIdx, distance, allDist, cc, stream);
    }

    template void matchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
    //template void matchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
    template void matchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
    //template void matchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
    template void matchHamming_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int k, const DevMem2D& mask, const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2Df& allDist, int cc, cudaStream_t stream);
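For orientation, a hedged host-side sketch of how these instantiations are reached through the public matcher; it assumes the knnMatch overload declared in gpu.hpp next to matchSingle, so verify the exact signature against the header:

    #include <opencv2/gpu/gpu.hpp>
    #include <vector>

    // Hedged usage sketch (2.x-era API assumed; verify signatures against gpu.hpp).
    void knnMatchExample(const cv::gpu::GpuMat& queryDescs, const cv::gpu::GpuMat& trainDescs)
    {
        cv::gpu::BruteForceMatcher_GPU_base matcher(cv::gpu::BruteForceMatcher_GPU_base::L2Dist);
        std::vector< std::vector<cv::DMatch> > matches;
        matcher.knnMatch(queryDescs, trainDescs, matches, 2); // k == 2 takes the fused fast path
    }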
template <typename T> void match2L1_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks,
const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
int cc, cudaStream_t stream)
{
if (masks.data)
match2Dispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
else
match2Dispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
}
template void match2L1_gpu<uchar >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
//template void match2L1_gpu<schar >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void match2L1_gpu<ushort>(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void match2L1_gpu<short >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void match2L1_gpu<int >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void match2L1_gpu<float >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template <typename T> void match2L2_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks,
const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
int cc, cudaStream_t stream)
{
if (masks.data)
match2Dispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
else
match2Dispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
}
//template void match2L2_gpu<uchar >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
//template void match2L2_gpu<schar >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
//template void match2L2_gpu<ushort>(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
//template void match2L2_gpu<short >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
//template void match2L2_gpu<int   >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void match2L2_gpu<float >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template <typename T> void match2Hamming_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks,
const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
int cc, cudaStream_t stream)
{
if (masks.data)
match2Dispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data), trainIdx, imgIdx, distance, cc, stream);
else
match2Dispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(), trainIdx, imgIdx, distance, cc, stream);
}
template void match2Hamming_gpu<uchar >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
//template void match2Hamming_gpu<schar >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void match2Hamming_gpu<ushort>(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
//template void match2Hamming_gpu<short >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
template void match2Hamming_gpu<int >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, int cc, cudaStream_t stream);
}}}
modules/gpu/src/cuda/bf_match.cu
...
@@ -49,355 +49,715 @@ using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace bf_match
{
    ///////////////////////////////////////////////////////////////////////////////
    // Reduction

    template <int BLOCK_SIZE>
    __device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, float* s_distance, int* s_trainIdx)
    {
        s_distance += threadIdx.y * BLOCK_SIZE;
        s_trainIdx += threadIdx.y * BLOCK_SIZE;

        s_distance[threadIdx.x] = bestDistance;
        s_trainIdx[threadIdx.x] = bestTrainIdx;

        __syncthreads();

        reducePredVal<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less<volatile float>());
    }

    template <int BLOCK_SIZE>
    __device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, int& bestImgIdx, float* s_distance, int* s_trainIdx, int* s_imgIdx)
    {
        s_distance += threadIdx.y * BLOCK_SIZE;
        s_trainIdx += threadIdx.y * BLOCK_SIZE;
        s_imgIdx   += threadIdx.y * BLOCK_SIZE;

        s_distance[threadIdx.x] = bestDistance;
        s_trainIdx[threadIdx.x] = bestTrainIdx;
        s_imgIdx  [threadIdx.x] = bestImgIdx;

        __syncthreads();

        reducePredVal2<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, s_imgIdx, bestImgIdx, threadIdx.x, less<volatile float>());
    }
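reducePredVal and reducePredVal2 live in utility.hpp and the new utility_detail.hpp touched by this commit. A simplified, non-warp-optimized sketch of the underlying idea, a block-wide (value, index) min-reduction, assuming every thread enters with its candidate already stored in shared memory:

    // Simplified stand-in for reducePredVal (the real one is warp-optimized):
    // leaves the block's best (value, index) pair in sval[0] / sidx[0].
    template <int BLOCK_SIZE, typename Pred>
    __device__ void reducePredValSketch(float* sval, float& val, int* sidx, int& idx,
                                        int tid, Pred pred)
    {
        for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1)
        {
            if (tid < s && pred(sval[tid + s], sval[tid]))
            {
                sval[tid] = sval[tid + s];
                sidx[tid] = sidx[tid + s];
            }
            __syncthreads();
        }
        val = sval[0];
        idx = sidx[0];
    }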
    ///////////////////////////////////////////////////////////////////////////////
    // Match Unrolled Cached

    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U>
    __device__ void loadQueryToSmem(int queryIdx, const DevMem2D_<T>& query, U* s_query)
    {
        #pragma unroll
        for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
        {
            const int loadX = threadIdx.x + i * BLOCK_SIZE;
            s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(min(queryIdx, query.rows - 1))[loadX] : 0;
        }
    }

    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
    __device__ void loopUnrolledCached(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask,
        typename Dist::value_type* s_query, typename Dist::value_type* s_train,
        float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
    {
        for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
        {
            Dist dist;

            #pragma unroll
            for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
            {
                const int loadX = threadIdx.x + i * BLOCK_SIZE;

                s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = loadX < train.cols ? train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX] : 0;

                __syncthreads();

                #pragma unroll
                for (int j = 0; j < BLOCK_SIZE; ++j)
                    dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

                __syncthreads();
            }

            typename Dist::result_type distVal = dist;

            const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

            if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))
            {
                bestImgIdx = imgIdx;
                bestDistance = distVal;
                bestTrainIdx = trainIdx;
            }
        }
    }

    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
    __global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
    {
        extern __shared__ int smem[];

        const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);

        loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);

        float myBestDistance = numeric_limits<float>::max();
        int myBestTrainIdx = -1;

        loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);

        __syncthreads();

        float* s_distance = (float*)(smem);
        int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

        findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);

        if (queryIdx < query.rows && threadIdx.x == 0)
        {
            bestTrainIdx[queryIdx] = myBestTrainIdx;
            bestDistance[queryIdx] = myBestDistance;
        }
    }

    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
    void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
        const DevMem2Di& trainIdx, const DevMem2Df& distance,
        cudaStream_t stream)
    {
        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
        const dim3 grid(divUp(query.rows, BLOCK_SIZE));

        const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

        matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
    __global__ void matchUnrolledCached(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask,
        int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
    {
        extern __shared__ int smem[];

        const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);

        loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);

        float myBestDistance = numeric_limits<float>::max();
        int myBestTrainIdx = -1;
        int myBestImgIdx = -1;

        Mask m = mask;

        for (int imgIdx = 0; imgIdx < n; ++imgIdx)
        {
            const DevMem2D_<T> train = trains[imgIdx];
            m.next();
            loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
        }

        __syncthreads();

        float* s_distance = (float*)(smem);
        int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
        int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

        findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdx);

        if (queryIdx < query.rows && threadIdx.x == 0)
        {
            bestTrainIdx[queryIdx] = myBestTrainIdx;
            bestImgIdx[queryIdx] = myBestImgIdx;
            bestDistance[queryIdx] = myBestDistance;
        }
    }

    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
    void matchUnrolledCached(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
        cudaStream_t stream)
    {
        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
        const dim3 grid(divUp(query.rows, BLOCK_SIZE));

        const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

        matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
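The shared-memory budget of matchUnrolledCached is easy to check by hand. For the BLOCK_SIZE of 16 used by the dispatchers and the smallest unroll width:

    // Worked smemSize example for BLOCK_SIZE = 16, MAX_DESC_LEN = 64:
    // query cache 16 * 64 ints plus train tile 16 * 16 ints gives
    // (1024 + 256) * 4 = 5120 bytes, well inside the 16 KB of shared memory
    // available on compute-capability 1.x devices.
    enum { BS = 16, MDL = 64 };
    const size_t smemSizeCheck = (BS * (MDL >= BS ? MDL : BS) + BS * BS) * sizeof(int);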
    ///////////////////////////////////////////////////////////////////////////////
    // Match Unrolled

    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
    __device__ void loopUnrolled(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask,
        typename Dist::value_type* s_query, typename Dist::value_type* s_train,
        float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
    {
        for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
        {
            Dist dist;

            #pragma unroll
            for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
            {
                const int loadX = threadIdx.x + i * BLOCK_SIZE;

                if (loadX < query.cols)
                {
                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(min(queryIdx, query.rows - 1))[loadX];
                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
                }
                else
                {
                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
                }

                __syncthreads();

                #pragma unroll
                for (int j = 0; j < BLOCK_SIZE; ++j)
                    dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

                __syncthreads();
            }

            typename Dist::result_type distVal = dist;

            const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

            if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))
            {
                bestImgIdx = imgIdx;
                bestDistance = distVal;
                bestTrainIdx = trainIdx;
            }
        }
    }

    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
    __global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
    {
        extern __shared__ int smem[];

        const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

        float myBestDistance = numeric_limits<float>::max();
        int myBestTrainIdx = -1;

        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

        loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);

        __syncthreads();

        float* s_distance = (float*)(smem);
        int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

        findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);

        if (queryIdx < query.rows && threadIdx.x == 0)
        {
            bestTrainIdx[queryIdx] = myBestTrainIdx;
            bestDistance[queryIdx] = myBestDistance;
        }
    }

    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
    void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
        const DevMem2Di& trainIdx, const DevMem2Df& distance,
        cudaStream_t stream)
    {
        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
        const dim3 grid(divUp(query.rows, BLOCK_SIZE));

        const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

        matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
    __global__ void matchUnrolled(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask,
        int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
    {
        extern __shared__ int smem[];

        const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

        float myBestDistance = numeric_limits<float>::max();
        int myBestTrainIdx = -1;
        int myBestImgIdx = -1;

        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

        Mask m = mask;

        for (int imgIdx = 0; imgIdx < n; ++imgIdx)
        {
            const DevMem2D_<T> train = trains[imgIdx];
            m.next();
            loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
        }

        __syncthreads();

        float* s_distance = (float*)(smem);
        int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
        int* s_imgIdxIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

        findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdxIdx);

        if (queryIdx < query.rows && threadIdx.x == 0)
        {
            bestTrainIdx[queryIdx] = myBestTrainIdx;
            bestImgIdx[queryIdx] = myBestImgIdx;
            bestDistance[queryIdx] = myBestDistance;
        }
    }

    template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
    void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
        cudaStream_t stream)
    {
        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
        const dim3 grid(divUp(query.rows, BLOCK_SIZE));

        const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

        matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    ///////////////////////////////////////////////////////////////////////////////
    // Match
    template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
    __device__ void loop(int queryIdx, const DevMem2D_<T>& query, int imgIdx, const DevMem2D_<T>& train, const Mask& mask,
        typename Dist::value_type* s_query, typename Dist::value_type* s_train,
        float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
    {
        for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
        {
            Dist dist;

            for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
            {
                const int loadX = threadIdx.x + i * BLOCK_SIZE;

                if (loadX < query.cols)
                {
                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(min(queryIdx, query.rows - 1))[loadX];
                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
                }
                else
                {
                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
                }

                __syncthreads();

                #pragma unroll
                for (int j = 0; j < BLOCK_SIZE; ++j)
                    dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

                __syncthreads();
            }

            typename Dist::result_type distVal = dist;

            const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

            if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))
            {
                bestImgIdx = imgIdx;
                bestDistance = distVal;
                bestTrainIdx = trainIdx;
            }
        }
    }

    template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
    __global__ void match(const DevMem2D_<T> query, const DevMem2D_<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
    {
        extern __shared__ int smem[];

        const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

        float myBestDistance = numeric_limits<float>::max();
        int myBestTrainIdx = -1;

        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

        loop<BLOCK_SIZE, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);

        __syncthreads();

        float* s_distance = (float*)(smem);
        int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

        findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);

        if (queryIdx < query.rows && threadIdx.x == 0)
        {
            bestTrainIdx[queryIdx] = myBestTrainIdx;
            bestDistance[queryIdx] = myBestDistance;
        }
    }

    template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
    void match(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
        const DevMem2Di& trainIdx, const DevMem2Df& distance,
        cudaStream_t stream)
    {
        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
        const dim3 grid(divUp(query.rows, BLOCK_SIZE));

        const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

        match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
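Launch geometry for these callers, worked through once with a hypothetical input size: each 16 x 16 block covers 16 query rows, so the grid is one-dimensional in queries.

    // e.g. 10000 query descriptors -> divUp(10000, 16) = 625 blocks of 256 threads.
    const int queryRows = 10000;
    const int gridX = (queryRows + 16 - 1) / 16; // == 625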
    template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
    __global__ void match(const DevMem2D_<T> query, const DevMem2D_<T>* trains, int n, const Mask mask,
        int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
    {
        extern __shared__ int smem[];

        const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

        float myBestDistance = numeric_limits<float>::max();
        int myBestTrainIdx = -1;
        int myBestImgIdx = -1;

        typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
        typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

        Mask m = mask;

        for (int imgIdx = 0; imgIdx < n; ++imgIdx)
        {
            const DevMem2D_<T> train = trains[imgIdx];
            m.next();
            loop<BLOCK_SIZE, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
        }

        __syncthreads();

        float* s_distance = (float*)(smem);
        int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
        int* s_imgIdxIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

        findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdxIdx);

        if (queryIdx < query.rows && threadIdx.x == 0)
        {
            bestTrainIdx[queryIdx] = myBestTrainIdx;
            bestImgIdx[queryIdx] = myBestImgIdx;
            bestDistance[queryIdx] = myBestDistance;
        }
    }

    template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
    void match(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
        const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
        cudaStream_t stream)
    {
        const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
        const dim3 grid(divUp(query.rows, BLOCK_SIZE));

        // Three reduction arrays (distance, train index, image index) share this
        // buffer, so it must cover 3 * BLOCK_SIZE * BLOCK_SIZE ints.
        const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

        match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
///////////////////////////////////////////////////////////////////////////////
// Match dispatcher

template <typename Dist, typename T, typename Mask>
void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, const Mask& mask,
    const DevMem2Di& trainIdx, const DevMem2Df& distance,
    int cc, cudaStream_t stream)
{
    if (query.cols <= 64)
    {
        matchUnrolledCached<16, 64, Dist>(query, train, mask, trainIdx, distance, stream);
    }
    else if (query.cols <= 128)
    {
        matchUnrolledCached<16, 128, Dist>(query, train, mask, trainIdx, distance, stream);
    }
    else if (query.cols <= 256)
    {
        matchUnrolled<16, 256, Dist>(query, train, mask, trainIdx, distance, stream);
    }
    else if (query.cols <= 512)
    {
        matchUnrolled<16, 512, Dist>(query, train, mask, trainIdx, distance, stream);
    }
    else if (query.cols <= 1024)
    {
        matchUnrolled<16, 1024, Dist>(query, train, mask, trainIdx, distance, stream);
    }
    else
    {
        match<16, Dist>(query, train, mask, trainIdx, distance, stream);
    }
}

template <typename Dist, typename T, typename Mask>
void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, const Mask& mask,
    const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
    int cc, cudaStream_t stream)
{
    if (query.cols <= 64)
    {
        matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
    }
    else if (query.cols <= 128)
    {
        matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
    }
    else if (query.cols <= 256)
    {
        matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
    }
    else if (query.cols <= 512)
    {
        matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
    }
    else if (query.cols <= 1024)
    {
        matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
    }
    else
    {
        match<16, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
    }
}
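Note on the ladder above: each branch instantiates the kernel with the next supported bound on query.cols as a compile-time constant, so nvcc can fully unroll the per-descriptor loop (the Cached variants additionally stage the query descriptor once instead of reloading it per tile). A minimal stand-alone sketch of the same dispatch pattern; processUnrolled and dispatch are illustrative names, not part of the module:

// Illustrative kernel: MAX_LEN is a template parameter, so the compiler
// can unroll the accumulation loop completely for each instantiation.
template <int MAX_LEN>
__global__ void processUnrolled(const float* vecs, int len, float* sums)
{
    float acc = 0.f;
    #pragma unroll
    for (int i = 0; i < MAX_LEN; ++i)
        acc += (i < len) ? vecs[blockIdx.x * len + i] : 0.f;
    sums[blockIdx.x] = acc;
}

// Host-side ladder: round the runtime length up to the next compile-time
// bound, mirroring matchDispatcher's query.cols tests.
void dispatch(const float* vecs, int len, float* sums, int n)
{
    if      (len <=  64) processUnrolled< 64><<<n, 1>>>(vecs, len, sums);
    else if (len <= 128) processUnrolled<128><<<n, 1>>>(vecs, len, sums);
    else if (len <= 256) processUnrolled<256><<<n, 1>>>(vecs, len, sums);
    // ...longer vectors would fall back to a generic, non-unrolled kernel
}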
///////////////////////////////////////////////////////////////////////////////
// Match caller

template <typename T> void matchL1_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask,
    const DevMem2Di& trainIdx, const DevMem2Df& distance,
    int cc, cudaStream_t stream)
{
    if (mask.data)
    {
        matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), SingleMask(mask),
            trainIdx, distance,
            cc, stream);
    }
    else
    {
        matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), WithOutMask(),
            trainIdx, distance,
            cc, stream);
    }
}
template void matchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template <typename T> void matchL2_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask,
    const DevMem2Di& trainIdx, const DevMem2Df& distance,
    int cc, cudaStream_t stream)
{
    if (mask.data)
    {
        matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), SingleMask(mask),
            trainIdx, distance,
            cc, stream);
    }
    else
    {
        matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), WithOutMask(),
            trainIdx, distance,
            cc, stream);
    }
}
//template void matchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template <typename T> void matchHamming_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask,
    const DevMem2Di& trainIdx, const DevMem2Df& distance,
    int cc, cudaStream_t stream)
{
    if (mask.data)
    {
        matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), SingleMask(mask),
            trainIdx, distance,
            cc, stream);
    }
    else
    {
        matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), WithOutMask(),
            trainIdx, distance,
            cc, stream);
    }
}
template void matchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchHamming_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template <typename T> void matchL1_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks,
    const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
    int cc, cudaStream_t stream)
{
    if (masks.data)
    {
        matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
            trainIdx, imgIdx, distance,
            cc, stream);
    }
    else
    {
        matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(),
            trainIdx, imgIdx, distance,
            cc, stream);
    }
}
template void matchL1_gpu<uchar >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchL1_gpu<schar >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchL1_gpu<ushort>(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchL1_gpu<short >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchL1_gpu<int   >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchL1_gpu<float >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template <typename T> void matchL2_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks,
    const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
    int cc, cudaStream_t stream)
{
    if (masks.data)
    {
        matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
            trainIdx, imgIdx, distance,
            cc, stream);
    }
    else
    {
        matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(),
            trainIdx, imgIdx, distance,
            cc, stream);
    }
}
//template void matchL2_gpu<uchar >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchL2_gpu<schar >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchL2_gpu<short >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchL2_gpu<int   >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchL2_gpu<float >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template <typename T> void matchHamming_gpu(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks,
    const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance,
    int cc, cudaStream_t stream)
{
    if (masks.data)
    {
        matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
            trainIdx, imgIdx, distance,
            cc, stream);
    }
    else
    {
        matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains.ptr(), trains.cols, WithOutMask(),
            trainIdx, imgIdx, distance,
            cc, stream);
    }
}
template void matchHamming_gpu<uchar >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
//template void matchHamming_gpu<short >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
template void matchHamming_gpu<int   >(const DevMem2D& query, const DevMem2D& trains, const DevMem2D_<PtrStep>& masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, int cc, cudaStream_t stream);
}}}
modules/gpu/src/cuda/bf_radius_match.cu
@@ -49,466 +49,410 @@ using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace bf_radius_match
{
///////////////////////////////////////////////////////////////////////////////
// Match Unrolled

template <int BLOCK_SIZE, int MAX_DESC_LEN, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
__global__ void matchUnrolled(const DevMem2D_<T> query, int imgIdx, const DevMem2D_<T> train, float maxDistance, const Mask mask,
    PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{
    #if __CUDA_ARCH__ >= 110

    extern __shared__ int smem[];

    const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    Dist dist;

    #pragma unroll
    for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
    {
        const int loadX = threadIdx.x + i * BLOCK_SIZE;

        if (loadX < query.cols)
        {
            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(min(queryIdx, query.rows - 1))[loadX];
            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
        }
        else
        {
            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
        }

        __syncthreads();

        #pragma unroll
        for (int j = 0; j < BLOCK_SIZE; ++j)
            dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

        __syncthreads();
    }

    float distVal = (typename Dist::result_type)dist;

    if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)
    {
        unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1);
        if (ind < maxCount)
        {
            bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;
            if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx;
            bestDistance.ptr(queryIdx)[ind] = distVal;
        }
    }

    #endif
}

template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
    const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));

    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask,
        trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
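The output step is the interesting part of this kernel: a radius match yields a variable number of hits per query, so each thread whose distance passes the test reserves a slot in its query's row via atomicInc on the per-query counter, and hits beyond maxCount are silently dropped. A stripped-down sketch of that append idiom (collectBelowThreshold is illustrative, not part of the module):

// Each thread that passes the predicate reserves an output slot with
// atomicInc, then writes into it; overflow past maxCount is discarded.
__global__ void collectBelowThreshold(const float* vals, int n, float thresh,
                                      int* out, unsigned int* count, int maxCount)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n && vals[i] < thresh)
    {
        const unsigned int ind = atomicInc(count, (unsigned int) -1);
        if (ind < (unsigned int)maxCount)
            out[ind] = i;
    }
}

Because slot order depends on thread scheduling, the collected hits come back unordered; the same holds for the radius-match results above.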
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T>
void matchUnrolled(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2D* masks,
    const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
    cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);

    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    for (int i = 0; i < n; ++i)
    {
        const DevMem2D_<T> train = trains[i];

        const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));

        if (masks != 0 && masks[i].data)
        {
            matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]),
                trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
        }
        else
        {
            matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(),
                trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
        }
        cudaSafeCall( cudaGetLastError() );
    }

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////////////////////
// Match

template <int BLOCK_SIZE, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
__global__ void match(const DevMem2D_<T> query, int imgIdx, const DevMem2D_<T> train, float maxDistance, const Mask mask,
    PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{
    #if __CUDA_ARCH__ >= 110

    extern __shared__ int smem[];

    const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    const int trainIdx = blockIdx.x * BLOCK_SIZE + threadIdx.x;

    typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
    typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

    Dist dist;

    for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
    {
        const int loadX = threadIdx.x + i * BLOCK_SIZE;

        if (loadX < query.cols)
        {
            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = query.ptr(min(queryIdx, query.rows - 1))[loadX];
            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = train.ptr(min(blockIdx.x * BLOCK_SIZE + threadIdx.y, train.rows - 1))[loadX];
        }
        else
        {
            s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
            s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;
        }

        __syncthreads();

        #pragma unroll
        for (int j = 0; j < BLOCK_SIZE; ++j)
            dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

        __syncthreads();
    }

    float distVal = (typename Dist::result_type)dist;

    if (queryIdx < query.rows && trainIdx < train.rows && mask(queryIdx, trainIdx) && distVal < maxDistance)
    {
        unsigned int ind = atomicInc(nMatches + queryIdx, (unsigned int) -1);
        if (ind < maxCount)
        {
            bestTrainIdx.ptr(queryIdx)[ind] = trainIdx;
            if (SAVE_IMG_IDX) bestImgIdx.ptr(queryIdx)[ind] = imgIdx;
            bestDistance.ptr(queryIdx)[ind] = distVal;
        }
    }

    #endif
}
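Both kernels accumulate the distance with the same shared-memory tiling: a BLOCK_SIZE x BLOCK_SIZE tile of query rows and a transposed tile of train rows are staged in shared memory, and each thread reduces one (query, train) pair. A stripped-down sketch of the pattern for a plain dot product (tiledDot and its parameters are illustrative names, not from the module):

#define TILE 16

// Each thread computes the dot product of one query row against one
// train row, staging TILE-wide slices of both matrices in shared memory.
__global__ void tiledDot(const float* query, const float* train,
                         int rows, int cols, float* result /* rows x rows */)
{
    __shared__ float s_q[TILE][TILE];
    __shared__ float s_t[TILE][TILE];

    const int qIdx = blockIdx.y * TILE + threadIdx.y;
    const int tIdx = blockIdx.x * TILE + threadIdx.x;

    float acc = 0.f;
    for (int i = 0; i < (cols + TILE - 1) / TILE; ++i)
    {
        const int x    = i * TILE + threadIdx.x;
        const int tRow = blockIdx.x * TILE + threadIdx.y;

        // The train tile is stored transposed, exactly as in the kernels
        // above: both global reads stay coalesced along threadIdx.x.
        s_q[threadIdx.y][threadIdx.x] = (qIdx < rows && x < cols) ? query[qIdx * cols + x] : 0.f;
        s_t[threadIdx.x][threadIdx.y] = (tRow < rows && x < cols) ? train[tRow * cols + x] : 0.f;
        __syncthreads();

        for (int j = 0; j < TILE; ++j)
            acc += s_q[threadIdx.y][j] * s_t[j][threadIdx.x];
        __syncthreads();
    }

    if (qIdx < rows && tIdx < rows)
        result[qIdx * rows + tIdx] = acc;
}

The clamped row index (min(queryIdx, query.rows - 1)) in the real kernels serves the same purpose as the bounds checks here: out-of-range threads still take part in the loads and barriers, so no __syncthreads() is skipped.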
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
void match(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
    const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
    cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));

    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    match<BLOCK_SIZE, false, Dist><<<grid, block, smemSize, stream>>>(query, 0, train, maxDistance, mask,
        trainIdx, PtrStepi(), distance, nMatches.data, trainIdx.cols);
    cudaSafeCall( cudaGetLastError() );

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
template <int BLOCK_SIZE, typename Dist, typename T>
void match(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2D* masks,
    const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
    cudaStream_t stream)
{
    const dim3 block(BLOCK_SIZE, BLOCK_SIZE);

    const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

    for (int i = 0; i < n; ++i)
    {
        const DevMem2D_<T> train = trains[i];

        const dim3 grid(divUp(train.rows, BLOCK_SIZE), divUp(query.rows, BLOCK_SIZE));

        if (masks != 0 && masks[i].data)
        {
            match<BLOCK_SIZE, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, SingleMask(masks[i]),
                trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
        }
        else
        {
            match<BLOCK_SIZE, true, Dist><<<grid, block, smemSize, stream>>>(query, i, train, maxDistance, WithOutMask(),
                trainIdx, imgIdx, distance, nMatches.data, trainIdx.cols);
        }
        cudaSafeCall( cudaGetLastError() );
    }

    if (stream == 0)
        cudaSafeCall( cudaDeviceSynchronize() );
}
///////////////////////////////////////////////////////////////////////////////
// Match dispatcher

template <typename Dist, typename T, typename Mask>
void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
    const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
    int cc, cudaStream_t stream)
{
    if (query.cols <= 64)
    {
        matchUnrolled<16, 64, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
    }
    else if (query.cols <= 128)
    {
        matchUnrolled<16, 128, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
    }
    else if (query.cols <= 256)
    {
        matchUnrolled<16, 256, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
    }
    else if (query.cols <= 512)
    {
        matchUnrolled<16, 512, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
    }
    else if (query.cols <= 1024)
    {
        matchUnrolled<16, 1024, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
    }
    else
    {
        match<16, Dist>(query, train, maxDistance, mask, trainIdx, distance, nMatches, stream);
    }
}

template <typename Dist, typename T>
void matchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>* trains, int n, float maxDistance, const DevMem2D* masks,
    const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
    int cc, cudaStream_t stream)
{
    if (query.cols <= 64)
    {
        matchUnrolled<16, 64, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
    }
    else if (query.cols <= 128)
    {
        matchUnrolled<16, 128, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
    }
    else if (query.cols <= 256)
    {
        matchUnrolled<16, 256, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
    }
    else if (query.cols <= 512)
    {
        matchUnrolled<16, 512, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
    }
    else if (query.cols <= 1024)
    {
        matchUnrolled<16, 1024, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
    }
    else
    {
        match<16, Dist>(query, trains, n, maxDistance, masks, trainIdx, imgIdx, distance, nMatches, stream);
    }
}
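For orientation: the buffers these dispatchers fill have one row per query descriptor with trainIdx.cols slots per row, and nMatches holds the per-query counters that the kernels only ever atomicInc, so they presumably must be zeroed before launch. A hypothetical host-side setup, assuming the cv::gpu allocation API (a sketch, not the module's actual host code):

// Result buffers for a radius match over nQuery descriptors,
// keeping at most maxCount hits per query.
cv::gpu::GpuMat trainIdx(nQuery, maxCount, CV_32SC1);   // hit indices, one row per query
cv::gpu::GpuMat distance(nQuery, maxCount, CV_32FC1);   // corresponding distances
cv::gpu::GpuMat nMatches(1, nQuery, CV_32SC1);          // per-query hit counters
nMatches.setTo(cv::Scalar::all(0));                     // counters must start at zero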
///////////////////////////////////////////////////////////////////////////////
// Radius Match caller

template <typename T> void matchL1_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
    const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
    int cc, cudaStream_t stream)
{
    if (mask.data)
    {
        matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
            trainIdx, distance, nMatches,
            cc, stream);
    }
    else
    {
        matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
            trainIdx, distance, nMatches,
            cc, stream);
    }
}
template void matchL1_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL1_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template <typename T> void matchL2_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
    const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
    int cc, cudaStream_t stream)
{
    if (mask.data)
    {
        matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
            trainIdx, distance, nMatches,
            cc, stream);
    }
    else
    {
        matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
            trainIdx, distance, nMatches,
            cc, stream);
    }
}
//template void matchL2_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL2_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template <typename T> void matchHamming_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
    const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
    int cc, cudaStream_t stream)
{
    if (mask.data)
    {
        matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
            trainIdx, distance, nMatches,
            cc, stream);
    }
    else
    {
        matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
            trainIdx, distance, nMatches,
            cc, stream);
    }
}
template void matchHamming_gpu<uchar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<schar >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<ushort>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchHamming_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchHamming_gpu<int   >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template <typename T> void matchL1_gpu(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks,
    const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches,
    int cc, cudaStream_t stream)
{
    matchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks,
        trainIdx, imgIdx, distance, nMatches,
        cc, stream);
}
template void matchL1_gpu<uchar >(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
//template void matchL1_gpu<schar >(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<ushort>(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<short >(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<int   >(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template void matchL1_gpu<float >(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc, cudaStream_t stream);
template <typename T> void
radiusMatchCollectionL2_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_<PtrStep>& maskCollection
,
template <typename T> void
matchL2_gpu(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks
,
const DevMem2D
& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D
& nMatches,
const DevMem2D
i& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>
& nMatches,
cudaStream_t stream)
int cc,
cudaStream_t stream)
{
{
TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, query.cols);
matchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches,
if (maskCollection.data)
cc, stream);
{
radiusMatchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), train, maxDistance, MaskCollection(maskCollection.data),
trainIdx, imgIdx, distance, nMatches,
stream);
}
else
{
radiusMatchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), train, maxDistance, WithOutMask(),
trainIdx, imgIdx, distance, nMatches,
stream);
}
}
}
//template void
radiusMatchCollectionL2_gpu<uchar >(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches
, cudaStream_t stream);
//template void
matchL2_gpu<uchar >(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc
, cudaStream_t stream);
//template void
radiusMatchCollectionL2_gpu<schar >(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches
, cudaStream_t stream);
//template void
matchL2_gpu<schar >(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc
, cudaStream_t stream);
//template void
radiusMatchCollectionL2_gpu<ushort>(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches
, cudaStream_t stream);
//template void
matchL2_gpu<ushort>(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc
, cudaStream_t stream);
//template void
radiusMatchCollectionL2_gpu<short >(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches
, cudaStream_t stream);
//template void
matchL2_gpu<short >(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc
, cudaStream_t stream);
//template void
radiusMatchCollectionL2_gpu<int >(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches
, cudaStream_t stream);
//template void
matchL2_gpu<int >(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc
, cudaStream_t stream);
template void
radiusMatchCollectionL2_gpu<float >(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches
, cudaStream_t stream);
template void
matchL2_gpu<float >(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc
, cudaStream_t stream);
template <typename T> void
radiusMatchCollectionHamming_gpu(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_<PtrStep>& maskCollection
,
template <typename T> void
matchHamming_gpu(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks
,
const DevMem2D
& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D
& nMatches,
const DevMem2D
i& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>
& nMatches,
cudaStream_t stream)
int cc,
cudaStream_t stream)
{
{
TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, query.cols);
matchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), (const DevMem2D_<T>*)trains, n, maxDistance, masks,
trainIdx, imgIdx, distance, nMatches,
if (maskCollection.data)
cc, stream);
{
radiusMatchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), train, maxDistance, MaskCollection(maskCollection.data),
trainIdx, imgIdx, distance, nMatches,
stream);
}
else
{
radiusMatchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), train, maxDistance, WithOutMask(),
trainIdx, imgIdx, distance, nMatches,
stream);
}
}
}
template void
radiusMatchCollectionHamming_gpu<uchar >(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches
, cudaStream_t stream);
template void
matchHamming_gpu<uchar >(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc
, cudaStream_t stream);
//template void
radiusMatchCollectionHamming_gpu<schar >(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches
, cudaStream_t stream);
//template void
matchHamming_gpu<schar >(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc
, cudaStream_t stream);
template void
radiusMatchCollectionHamming_gpu<ushort>(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches
, cudaStream_t stream);
template void
matchHamming_gpu<ushort>(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc
, cudaStream_t stream);
//template void
radiusMatchCollectionHamming_gpu<short >(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches
, cudaStream_t stream);
//template void
matchHamming_gpu<short >(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc
, cudaStream_t stream);
template void
radiusMatchCollectionHamming_gpu<int >(const DevMem2D& query, const DevMem2D& trainCollection, float maxDistance, const DevMem2D_<PtrStep>& maskCollection, const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance, const DevMem2D& nMatches
, cudaStream_t stream);
template void
matchHamming_gpu<int >(const DevMem2D& query, const DevMem2D* trains, int n, float maxDistance, const DevMem2D* masks, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance, const DevMem2D_<unsigned int>& nMatches, int cc
, cudaStream_t stream);
}}}
}}}
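The interface change above is the heart of this file's rewrite: the old radiusMatchCollection*_gpu entry points took a packed trainCollection plus a PtrStep mask collection and branched between MaskCollection and WithOutMask dispatchers, while the new match*_gpu entry points take a plain array of train matrices, typed result buffers (DevMem2Di/DevMem2Df), and an extra cc argument so the dispatcher can choose a kernel configuration per compute capability. A minimal host-side sketch of how such a cc value can be derived; the major*10+minor encoding is an assumption for illustration, and the real caller lives in brute_force_matcher.cpp (not shown in this section):

    // Illustrative only: deriving a compute-capability code to pass as 'cc'.
    #include <opencv2/gpu/gpu.hpp>

    int computeCapabilityCode()
    {
        cv::gpu::DeviceInfo info;                               // current CUDA device
        return info.majorVersion() * 10 + info.minorVersion();  // e.g. 20 for a 2.0 (Fermi) device
    }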
modules/gpu/src/opencv2/gpu/device/detail/utility_detail.hpp
View file @
d3c4e907
...
@@ -47,6 +47,9 @@ namespace cv { namespace gpu { namespace device
{
    namespace detail
    {
+        ///////////////////////////////////////////////////////////////////////////////
+        // Reduction
+
        template <int n> struct WarpReductor
        {
            template <typename T, typename Op> static __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
...
@@ -209,6 +212,8 @@ namespace cv { namespace gpu { namespace device
            }
        };

+        ///////////////////////////////////////////////////////////////////////////////
+        // PredValWarpReductor
        template <int n> struct PredValWarpReductor;
        template <> struct PredValWarpReductor<64>
...
@@ -501,6 +506,335 @@ namespace cv { namespace gpu { namespace device
                }
            }
        };
+        ///////////////////////////////////////////////////////////////////////////////
+        // PredVal2WarpReductor
+
+        template <int n> struct PredVal2WarpReductor;
+        template <> struct PredVal2WarpReductor<64>
+        {
+            template <typename T, typename V1, typename V2, typename Pred>
+            static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
+            {
+                if (tid < 32)
+                {
+                    myData = sdata[tid];
+                    myVal1 = sval1[tid];
+                    myVal2 = sval2[tid];
+
+                    T reg = sdata[tid + 32];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval1[tid] = myVal1 = sval1[tid + 32];
+                        sval2[tid] = myVal2 = sval2[tid + 32];
+                    }
+
+                    reg = sdata[tid + 16];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval1[tid] = myVal1 = sval1[tid + 16];
+                        sval2[tid] = myVal2 = sval2[tid + 16];
+                    }
+
+                    reg = sdata[tid + 8];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval1[tid] = myVal1 = sval1[tid + 8];
+                        sval2[tid] = myVal2 = sval2[tid + 8];
+                    }
+
+                    reg = sdata[tid + 4];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval1[tid] = myVal1 = sval1[tid + 4];
+                        sval2[tid] = myVal2 = sval2[tid + 4];
+                    }
+
+                    reg = sdata[tid + 2];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval1[tid] = myVal1 = sval1[tid + 2];
+                        sval2[tid] = myVal2 = sval2[tid + 2];
+                    }
+
+                    reg = sdata[tid + 1];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval1[tid] = myVal1 = sval1[tid + 1];
+                        sval2[tid] = myVal2 = sval2[tid + 1];
+                    }
+                }
+            }
+        };
+        template <> struct PredVal2WarpReductor<32>
+        {
+            template <typename T, typename V1, typename V2, typename Pred>
+            static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
+            {
+                if (tid < 16)
+                {
+                    myData = sdata[tid];
+                    myVal1 = sval1[tid];
+                    myVal2 = sval2[tid];
+
+                    T reg = sdata[tid + 16];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval1[tid] = myVal1 = sval1[tid + 16];
+                        sval2[tid] = myVal2 = sval2[tid + 16];
+                    }
+
+                    reg = sdata[tid + 8];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval1[tid] = myVal1 = sval1[tid + 8];
+                        sval2[tid] = myVal2 = sval2[tid + 8];
+                    }
+
+                    reg = sdata[tid + 4];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval1[tid] = myVal1 = sval1[tid + 4];
+                        sval2[tid] = myVal2 = sval2[tid + 4];
+                    }
+
+                    reg = sdata[tid + 2];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval1[tid] = myVal1 = sval1[tid + 2];
+                        sval2[tid] = myVal2 = sval2[tid + 2];
+                    }
+
+                    reg = sdata[tid + 1];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval1[tid] = myVal1 = sval1[tid + 1];
+                        sval2[tid] = myVal2 = sval2[tid + 1];
+                    }
+                }
+            }
+        };
+        template <> struct PredVal2WarpReductor<16>
+        {
+            template <typename T, typename V1, typename V2, typename Pred>
+            static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
+            {
+                if (tid < 8)
+                {
+                    myData = sdata[tid];
+                    myVal1 = sval1[tid];
+                    myVal2 = sval2[tid];
+
+                    T reg = sdata[tid + 8];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval1[tid] = myVal1 = sval1[tid + 8];
+                        sval2[tid] = myVal2 = sval2[tid + 8];
+                    }
+
+                    reg = sdata[tid + 4];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval1[tid] = myVal1 = sval1[tid + 4];
+                        sval2[tid] = myVal2 = sval2[tid + 4];
+                    }
+
+                    reg = sdata[tid + 2];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval1[tid] = myVal1 = sval1[tid + 2];
+                        sval2[tid] = myVal2 = sval2[tid + 2];
+                    }
+
+                    reg = sdata[tid + 1];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval1[tid] = myVal1 = sval1[tid + 1];
+                        sval2[tid] = myVal2 = sval2[tid + 1];
+                    }
+                }
+            }
+        };
+        template <> struct PredVal2WarpReductor<8>
+        {
+            template <typename T, typename V1, typename V2, typename Pred>
+            static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
+            {
+                if (tid < 4)
+                {
+                    myData = sdata[tid];
+                    myVal1 = sval1[tid];
+                    myVal2 = sval2[tid];
+
+                    T reg = sdata[tid + 4];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval1[tid] = myVal1 = sval1[tid + 4];
+                        sval2[tid] = myVal2 = sval2[tid + 4];
+                    }
+
+                    reg = sdata[tid + 2];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval1[tid] = myVal1 = sval1[tid + 2];
+                        sval2[tid] = myVal2 = sval2[tid + 2];
+                    }
+
+                    reg = sdata[tid + 1];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval1[tid] = myVal1 = sval1[tid + 1];
+                        sval2[tid] = myVal2 = sval2[tid + 1];
+                    }
+                }
+            }
+        };
+
+        template <bool warp> struct PredVal2ReductionDispatcher;
+        template <> struct PredVal2ReductionDispatcher<true>
+        {
+            template <int n, typename T, typename V1, typename V2, typename Pred>
+            static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
+            {
+                PredVal2WarpReductor<n>::reduce(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);
+            }
+        };
+        template <> struct PredVal2ReductionDispatcher<false>
+        {
+            template <int n, typename T, typename V1, typename V2, typename Pred>
+            static __device__ void reduce(T& myData, V1& myVal1, V2& myVal2, volatile T* sdata, V1* sval1, V2* sval2, int tid, const Pred& pred)
+            {
+                myData = sdata[tid];
+                myVal1 = sval1[tid];
+                myVal2 = sval2[tid];
+
+                if (n >= 512 && tid < 256)
+                {
+                    T reg = sdata[tid + 256];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval1[tid] = myVal1 = sval1[tid + 256];
+                        sval2[tid] = myVal2 = sval2[tid + 256];
+                    }
+                    __syncthreads();
+                }
+                if (n >= 256 && tid < 128)
+                {
+                    T reg = sdata[tid + 128];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval1[tid] = myVal1 = sval1[tid + 128];
+                        sval2[tid] = myVal2 = sval2[tid + 128];
+                    }
+                    __syncthreads();
+                }
+                if (n >= 128 && tid < 64)
+                {
+                    T reg = sdata[tid + 64];
+                    if (pred(reg, myData))
+                    {
+                        sdata[tid] = myData = reg;
+                        sval1[tid] = myVal1 = sval1[tid + 64];
+                        sval2[tid] = myVal2 = sval2[tid + 64];
+                    }
+                    __syncthreads();
+                }
+
+                if (tid < 32)
+                {
+                    if (n >= 64)
+                    {
+                        T reg = sdata[tid + 32];
+                        if (pred(reg, myData))
+                        {
+                            sdata[tid] = myData = reg;
+                            sval1[tid] = myVal1 = sval1[tid + 32];
+                            sval2[tid] = myVal2 = sval2[tid + 32];
+                        }
+                    }
+                    if (n >= 32)
+                    {
+                        T reg = sdata[tid + 16];
+                        if (pred(reg, myData))
+                        {
+                            sdata[tid] = myData = reg;
+                            sval1[tid] = myVal1 = sval1[tid + 16];
+                            sval2[tid] = myVal2 = sval2[tid + 16];
+                        }
+                    }
+                    if (n >= 16)
+                    {
+                        T reg = sdata[tid + 8];
+                        if (pred(reg, myData))
+                        {
+                            sdata[tid] = myData = reg;
+                            sval1[tid] = myVal1 = sval1[tid + 8];
+                            sval2[tid] = myVal2 = sval2[tid + 8];
+                        }
+                    }
+                    if (n >= 8)
+                    {
+                        T reg = sdata[tid + 4];
+                        if (pred(reg, myData))
+                        {
+                            sdata[tid] = myData = reg;
+                            sval1[tid] = myVal1 = sval1[tid + 4];
+                            sval2[tid] = myVal2 = sval2[tid + 4];
+                        }
+                    }
+                    if (n >= 4)
+                    {
+                        T reg = sdata[tid + 2];
+                        if (pred(reg, myData))
+                        {
+                            sdata[tid] = myData = reg;
+                            sval1[tid] = myVal1 = sval1[tid + 2];
+                            sval2[tid] = myVal2 = sval2[tid + 2];
+                        }
+                    }
+                    if (n >= 2)
+                    {
+                        T reg = sdata[tid + 1];
+                        if (pred(reg, myData))
+                        {
+                            sdata[tid] = myData = reg;
+                            sval1[tid] = myVal1 = sval1[tid + 1];
+                            sval2[tid] = myVal2 = sval2[tid + 1];
+                        }
+                    }
+                }
+            }
+        };
    }
}}}
...
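The block added above generalizes the existing predicated reductors to carry two payload values with the key: whenever pred prefers a neighbour's key, the neighbour's sval1 and sval2 entries travel with it, and __syncthreads() separates the strides that cross warp boundaries (the sub-warp tail relies on warp-synchronous execution over volatile shared memory, as the existing reductors do). Semantically it is an argmin/argmax that drags two attached values along. A short sequential model of what one full reduction computes, purely illustrative:

    // Sequential model of the predicated two-value reduction (illustration only).
    #include <cstddef>

    template <typename T, typename V1, typename V2, typename Pred>
    void reducePredVal2Model(const T* key, const V1* val1, const V2* val2, std::size_t n,
                             T& bestKey, V1& bestVal1, V2& bestVal2, Pred pred)
    {
        bestKey = key[0]; bestVal1 = val1[0]; bestVal2 = val2[0];
        for (std::size_t i = 1; i < n; ++i)
        {
            if (pred(key[i], bestKey))    // e.g. pred(a, b) == (a < b) keeps the smallest key
            {
                bestKey  = key[i];
                bestVal1 = val1[i];       // both payloads follow the winning key,
                bestVal2 = val2[i];       // mirroring how sval1/sval2 move with sdata
            }
        }
    }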
modules/gpu/src/opencv2/gpu/device/utility.hpp
View file @
d3c4e907
...
@@ -121,7 +121,6 @@ namespace cv { namespace gpu { namespace device
    ///////////////////////////////////////////////////////////////////////////////
    // Reduction
-    // reduction

    template <int n, typename T, typename Op> __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
    {
        StaticAssert<n >= 8 && n <= 512>::check();
...
@@ -134,6 +133,13 @@ namespace cv { namespace gpu { namespace device
        StaticAssert<n >= 8 && n <= 512>::check();
        detail::PredValReductionDispatcher<n <= 64>::reduce<n>(myData, myVal, sdata, sval, tid, pred);
    }

+    template <int n, typename T, typename V1, typename V2, typename Pred>
+    __device__ __forceinline__ void reducePredVal2(volatile T* sdata, T& myData, V1* sval1, V1& myVal1, V2* sval2, V2& myVal2, int tid, const Pred& pred)
+    {
+        StaticAssert<n >= 8 && n <= 512>::check();
+        detail::PredVal2ReductionDispatcher<n <= 64>::reduce<n>(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);
+    }
+
    ///////////////////////////////////////////////////////////////////////////////
    // Solve linear system
...
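The new public wrapper reducePredVal2 mirrors the existing reduce and reducePredVal helpers: a StaticAssert pins the block size to [8, 512], and the dispatcher selects the warp-only path when n <= 64. A hedged sketch of a kernel using it to keep a train index and an image index attached to the best distance; the kernel, functor, and buffer names are invented for illustration, and only reducePredVal2 itself comes from the header above:

    #include "opencv2/gpu/device/utility.hpp"   // internal header, path as in this tree

    struct LessFloat
    {
        __device__ bool operator()(float a, float b) const { return a < b; }
    };

    // One 64-thread block picks its best (distance, trainIdx, imgIdx) triple.
    __global__ void bestMatchSketch(const float* dist, const int* trainIdx, const int* imgIdx)
    {
        const int tid = threadIdx.x;

        __shared__ float sdist[64];
        __shared__ int   strain[64];
        __shared__ int   simg[64];

        float myDist  = dist[tid];
        int   myTrain = trainIdx[tid];
        int   myImg   = imgIdx[tid];

        sdist[tid] = myDist; strain[tid] = myTrain; simg[tid] = myImg;
        __syncthreads();

        cv::gpu::device::reducePredVal2<64>(sdist, myDist, strain, myTrain, simg, myImg, tid, LessFloat());

        // After the reduction, the winning triple sits in sdist[0] / strain[0] / simg[0]
        // (and in lane 0's myDist / myTrain / myImg).
    }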
modules/stitching/src/matchers.cpp
View file @
d3c4e907
...
@@ -198,7 +198,7 @@ void GpuMatcher::match(const ImageFeatures &features1, const ImageFeatures &feat
    // Find 1->2 matches
    pair_matches.clear();
-    matcher.knnMatch(descriptors1_, descriptors2_, train_idx_, distance_, all_dist_, 2);
+    matcher.knnMatchSingle(descriptors1_, descriptors2_, train_idx_, distance_, all_dist_, 2);
    matcher.knnMatchDownload(train_idx_, distance_, pair_matches);
    for (size_t i = 0; i < pair_matches.size(); ++i)
    {
...
@@ -215,7 +215,7 @@ void GpuMatcher::match(const ImageFeatures &features1, const ImageFeatures &feat
    // Find 2->1 matches
    pair_matches.clear();
-    matcher.knnMatch(descriptors2_, descriptors1_, train_idx_, distance_, all_dist_, 2);
+    matcher.knnMatchSingle(descriptors2_, descriptors1_, train_idx_, distance_, all_dist_, 2);
    matcher.knnMatchDownload(train_idx_, distance_, pair_matches);
    for (size_t i = 0; i < pair_matches.size(); ++i)
    {
...
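The stitching matcher now calls knnMatchSingle plus knnMatchDownload instead of the all-in-one knnMatch, keeping the raw result matrices (train_idx_, distance_) on the GPU between calls and converting to DMatch only once. A minimal sketch of the same two-step pattern in isolation, assuming d_query and d_train already hold uploaded descriptors:

    // Two-step GPU kNN matching: device-side compute, then a single download.
    cv::gpu::BruteForceMatcher_GPU_base matcher(cv::gpu::BruteForceMatcher_GPU_base::L2Dist);
    cv::gpu::GpuMat d_query, d_train;            // uploaded descriptors (assumed filled)
    cv::gpu::GpuMat trainIdx, distance, allDist; // reused across calls, so no reallocation

    matcher.knnMatchSingle(d_query, d_train, trainIdx, distance, allDist, 2);

    std::vector< std::vector<cv::DMatch> > matches;
    matcher.knnMatchDownload(trainIdx, distance, matches);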
samples/gpu/performance/tests.cpp
View file @
d3c4e907
...
@@ -413,38 +413,55 @@ TEST(BruteForceMatcher)
    // Output
    vector< vector<DMatch> > matches(2);
-    vector< vector<DMatch> > d_matches(2);
+    gpu::GpuMat d_trainIdx, d_distance, d_allDist, d_nMatches;

    SUBTEST << "match";

+    matcher.match(query, train, matches[0]);
+
    CPU_ON;
    matcher.match(query, train, matches[0]);
    CPU_OFF;

+    d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
+
    GPU_ON;
-    d_matcher.match(d_query, d_train, d_matches[0]);
+    d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
    GPU_OFF;

-    SUBTEST << "knnMatch";
-    int knn = 2;
+    SUBTEST << "knnMatch, 2";
+
+    matcher.knnMatch(query, train, matches, 2);

    CPU_ON;
-    matcher.knnMatch(query, train, matches, knn);
+    matcher.knnMatch(query, train, matches, 2);
    CPU_OFF;

+    d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 2);
+
    GPU_ON;
-    d_matcher.knnMatch(d_query, d_train, d_matches, knn);
+    d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 2);
    GPU_OFF;

+    SUBTEST << "knnMatch, 3";
+
+    matcher.knnMatch(query, train, matches, 3);
+
+    CPU_ON;
+    matcher.knnMatch(query, train, matches, 3);
+    CPU_OFF;
+
+    d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 3);
+
+    GPU_ON;
+    d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 3);
+    GPU_OFF;

    SUBTEST << "radiusMatch";

    float max_distance = 2.0f;

+    matcher.radiusMatch(query, train, matches, max_distance);
+
    CPU_ON;
    matcher.radiusMatch(query, train, matches, max_distance);
    CPU_OFF;

+    d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, max_distance);
+
    GPU_ON;
-    d_matcher.radiusMatch(d_query, d_train, d_matches, max_distance);
+    d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, max_distance);
    GPU_OFF;
}
...
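The reworked benchmark times the device-side entry points (matchSingle, knnMatchSingle, radiusMatchSingle) and issues one warm-up call before each timed region, so one-off costs such as buffer allocation and module load do not pollute the measurement, and result download is kept out of the kernel timing. A sketch of what a GPU_ON/GPU_OFF-style region typically amounts to with CUDA events; the macro internals here are assumed, not taken from the harness:

    // Hypothetical expansion of a GPU timing region using CUDA events.
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, 0);          // GPU_ON
    // ... timed GPU work, e.g. d_matcher.matchSingle(...) ...
    cudaEventRecord(stop, 0);           // GPU_OFF
    cudaEventSynchronize(stop);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);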