Commit b119833a
authored Sep 26, 2011 by Vladislav Vinogradov
implemented optimized version of gpu::bf_radius_match
parent 961dc4e3
Showing 4 changed files with 160 additions and 44 deletions
modules/gpu/src/brute_force_matcher.cpp    +12 -6
modules/gpu/src/cuda/bf_knnmatch.cu        +1 -1
modules/gpu/src/cuda/bf_match.cu           +1 -1
modules/gpu/src/cuda/bf_radius_match.cu    +146 -36
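For orientation, the optimized path in this commit is reached through the public radiusMatch of the GPU brute-force matcher. A minimal usage sketch, assuming the 2011-era cv::gpu API (the descriptor matrices are hypothetical inputs):

#include <vector>
#include <opencv2/core/core.hpp>
#include <opencv2/features2d/features2d.hpp>
#include <opencv2/gpu/gpu.hpp>

// Minimal sketch, assuming the 2011-era cv::gpu API: upload CV_32F descriptors and
// collect every train descriptor within maxDistance of each query descriptor.
// The GPU radiusMatch asserts atomics support on the device (see the CV_Assert below).
void radiusMatchOnGpu(const cv::Mat& queryDescriptors,   // CV_32F, one row per keypoint
                      const cv::Mat& trainDescriptors,   // CV_32F, one row per keypoint
                      float maxDistance,
                      std::vector< std::vector<cv::DMatch> >& matches)
{
    cv::gpu::GpuMat queryGpu(queryDescriptors);   // host -> device upload
    cv::gpu::GpuMat trainGpu(trainDescriptors);

    // Brute-force matcher templated on the distance functor (L2 here).
    cv::gpu::BruteForceMatcher_GPU< cv::L2<float> > matcher;

    // matches[i] receives all train descriptors within maxDistance of query i.
    matcher.radiusMatch(queryGpu, trainGpu, matches, maxDistance);
}

Each matches[i] holds the train descriptors whose distance to query descriptor i is below maxDistance; the diff below changes how those per-query result lists are filled on the device.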
modules/gpu/src/brute_force_matcher.cpp
...
...
@@ -76,7 +76,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, std::vector
#else /* !defined (HAVE_CUDA) */

-namespace cv { namespace gpu { namespace bfmatcher
+namespace cv { namespace gpu { namespace bf_match
{
    template <typename T> void matchSingleL1_gpu(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& distance,
...
...
@@ -97,7 +97,10 @@ namespace cv { namespace gpu { namespace bfmatcher
    template <typename T> void matchCollectionHamming_gpu(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection,
        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
        int cc, cudaStream_t stream);
+}}}
+namespace cv { namespace gpu { namespace bf_knnmatch
+{
    template <typename T> void knnMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
        int cc, cudaStream_t stream);
...
...
@@ -107,7 +110,10 @@ namespace cv { namespace gpu { namespace bfmatcher
    template <typename T> void knnMatchHamming_gpu(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
        int cc, cudaStream_t stream);
+}}}
+namespace cv { namespace gpu { namespace bf_radius_match
+{
    template <typename T> void radiusMatchL1_gpu(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance,
        cudaStream_t stream);
...
...
@@ -170,7 +176,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs,
    if (queryDescs.empty() || trainDescs.empty())
        return;

-    using namespace cv::gpu::bfmatcher;
+    using namespace cv::gpu::bf_match;

    typedef void (*match_caller_t)(const DevMem2D& query, const DevMem2D& train, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& distance,
...
...
@@ -309,7 +315,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDes
    if (queryDescs.empty() || trainCollection.empty())
        return;

-    using namespace cv::gpu::bfmatcher;
+    using namespace cv::gpu::bf_match;

    typedef void (*match_caller_t)(const DevMem2D& query, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection,
        const DevMem2D& trainIdx, const DevMem2D& imgIdx, const DevMem2D& distance,
...
...
@@ -418,7 +424,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, con
    if (queryDescs.empty() || trainDescs.empty())
        return;

-    using namespace cv::gpu::bfmatcher;
+    using namespace cv::gpu::bf_knnmatch;

    typedef void (*match_caller_t)(const DevMem2D& query, const DevMem2D& train, int k, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& allDist,
...
...
@@ -596,7 +602,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs,
    if (queryDescs.empty() || trainDescs.empty())
        return;

-    using namespace cv::gpu::bfmatcher;
+    using namespace cv::gpu::bf_radius_match;

    typedef void (*radiusMatch_caller_t)(const DevMem2D& query, const DevMem2D& train, float maxDistance, const DevMem2D& mask,
        const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance,
...
...
@@ -618,7 +624,7 @@ void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs,
        }
    };

-    CV_Assert(TargetArchs::builtWith(GLOBAL_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS));
+    CV_Assert(TargetArchs::builtWith(SHARED_ATOMICS) && DeviceInfo().supports(GLOBAL_ATOMICS));

    const int nQuery = queryDescs.rows;
    const int nTrain = trainDescs.rows;
...
...
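The match_caller_t / radiusMatch_caller_t typedefs above let the host wrapper pick one template instantiation per descriptor depth through a small function-pointer table. A schematic, self-contained sketch of that dispatch pattern, with hypothetical simplified signatures (the real tables also cover the L1/L2/Hamming distances and masked variants):

#include <cstdio>

// Hypothetical sketch of the caller-table dispatch used by the host wrappers:
// one function-pointer type, one template instantiation per element type,
// and a table indexed by the OpenCV depth code.
typedef void (*radiusMatch_caller_t)(const void* query, const void* train, float maxDistance);

template <typename T>
void radiusMatchL2_gpu(const void* query, const void* train, float maxDistance)
{
    std::printf("radius match, %zu-byte elements, maxDistance = %.1f\n", sizeof(T), maxDistance);
}

void radiusMatch(const void* query, const void* train, float maxDistance, int depth)
{
    static const radiusMatch_caller_t callers[] =
    {
        radiusMatchL2_gpu<unsigned char>,   // CV_8U
        0,                                  // CV_8S  (unsupported)
        radiusMatchL2_gpu<unsigned short>,  // CV_16U
        0,                                  // CV_16S (unsupported)
        0,                                  // CV_32S (unsupported)
        radiusMatchL2_gpu<float>            // CV_32F
    };

    radiusMatch_caller_t func = callers[depth];
    if (func != 0)
        func(query, train, maxDistance);
}

int main()
{
    radiusMatch(0, 0, 100.0f, 0);   // depth CV_8U  -> unsigned char instantiation
    radiusMatch(0, 0, 100.0f, 5);   // depth CV_32F -> float instantiation
    return 0;
}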
modules/gpu/src/cuda/bf_knnmatch.cu
...
...
@@ -47,7 +47,7 @@
using namespace cv::gpu;
using namespace cv::gpu::device;
-namespace cv { namespace gpu { namespace bfmatcher
+namespace cv { namespace gpu { namespace bf_knnmatch
{
template <typename VecDiff, typename Dist, typename T, typename Mask>
__device__ void distanceCalcLoop(const PtrStep_<T>& query, const DevMem2D_<T>& train, const Mask& m, int queryIdx,
...
...
modules/gpu/src/cuda/bf_match.cu
...
...
@@ -47,7 +47,7 @@
using namespace cv::gpu;
using namespace cv::gpu::device;
-namespace cv { namespace gpu { namespace bfmatcher
+namespace cv { namespace gpu { namespace bf_match
{
template <int BLOCK_DIM_Y, typename T>
__device__ void findBestMatch(T& myDist, int2& myIdx, T* smin, int2* sIdx)
...
...
modules/gpu/src/cuda/bf_radius_match.cu
...
...
@@ -47,63 +47,127 @@
using namespace cv::gpu;
using namespace cv::gpu::device;

-namespace cv { namespace gpu { namespace bfmatcher
+namespace cv { namespace gpu { namespace bf_radius_match
{
-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
-    __global__ void radiusMatch(const PtrStep_<T> query, const DevMem2D_<T> train, float maxDistance, const Mask mask,
-        DevMem2Di trainIdx_, unsigned int* nMatches, PtrStepf distance)
+    __device__ __forceinline__ void store(const int* sidx, const float* sdist, const unsigned int scount, int* trainIdx, float* distance, int& sglob_ind, const int tid)
    {
-        #if __CUDA_ARCH__ >= 110
+        if (tid < scount)
+        {
+            trainIdx[sglob_ind + tid] = sidx[tid];
+            distance[sglob_ind + tid] = sdist[tid];
+        }
-        __shared__ typename Dist::result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y];
+        if (tid == 0)
+            sglob_ind += scount;
    }

-        typename Dist::result_type* sdiff_row = smem + BLOCK_DIM_X * threadIdx.y;
+    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_STACK, typename VecDiff, typename Dist, typename T, typename Mask>
+    __global__ void radiusMatch(const PtrStep_<T> query, const DevMem2D_<T> train, const float maxDistance, const Mask mask,
+        DevMem2Di trainIdx_, PtrStepf distance, unsigned int* nMatches)
    {
+        #if __CUDA_ARCH__ >= 120
-        const int queryIdx = blockIdx.x;
-        const T* queryDescs = query.ptr(queryIdx);
+        typedef typename Dist::result_type result_type;
+        typedef typename Dist::value_type value_type;
+        __shared__ result_type smem[BLOCK_DIM_X * BLOCK_DIM_Y];
+        __shared__ int sidx[BLOCK_STACK];
+        __shared__ float sdist[BLOCK_STACK];
+        __shared__ unsigned int scount;
+        __shared__ int sglob_ind;
-        const int trainIdx = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;
+        const int queryIdx = blockIdx.x;
+        const int tid = threadIdx.y * BLOCK_DIM_X + threadIdx.x;
-        if (trainIdx < train.rows)
+        if (tid == 0)
        {
-            const T* trainDescs = train.ptr(trainIdx);
+            scount = 0;
+            sglob_ind = 0;
+        }
+        __syncthreads();
+        int* trainIdx_row = trainIdx_.ptr(queryIdx);
+        float* distance_row = distance.ptr(queryIdx);
+        const VecDiff vecDiff(query.ptr(queryIdx), train.cols, (typename Dist::value_type*)smem, tid, threadIdx.x);
+        typename Dist::result_type* sdiffRow = smem + BLOCK_DIM_X * threadIdx.y;
+        for (int trainIdx = threadIdx.y; trainIdx < train.rows; trainIdx += BLOCK_DIM_Y)
+        {
            if (mask(queryIdx, trainIdx))
            {
                Dist dist;
-                calcVecDiffGlobal<BLOCK_DIM_X>(queryDescs, trainDescs, train.cols, dist, sdiff_row, threadIdx.x);
+                const T* trainRow = train.ptr(trainIdx);
-                if (threadIdx.x == 0)
-                {
-                    if (dist < maxDistance)
-                    {
-                        unsigned int i = atomicInc(nMatches + queryIdx, (unsigned int) -1);
-                        if (i < trainIdx_.cols)
+                vecDiff.calc(trainRow, train.cols, dist, sdiffRow, threadIdx.x);
+                const typename Dist::result_type val = dist;
+                if (threadIdx.x == 0 && val < maxDistance)
                {
-                    distance.ptr(queryIdx)[i] = dist;
-                    trainIdx_.ptr(queryIdx)[i] = trainIdx;
-                }
+                    unsigned int i = atomicInc(&scount, (unsigned int) -1);
+                    sidx[i] = trainIdx;
+                    sdist[i] = val;
                }
            }
+            __syncthreads();
+            if (scount > BLOCK_STACK - BLOCK_DIM_Y)
+            {
+                store(sidx, sdist, scount, trainIdx_row, distance_row, sglob_ind, tid);
+                if (tid == 0)
+                    scount = 0;
+            }
+            __syncthreads();
        }
+        store(sidx, sdist, scount, trainIdx_row, distance_row, sglob_ind, tid);
+        if (tid == 0)
+            nMatches[queryIdx] = sglob_ind;
        #endif
    }

    ///////////////////////////////////////////////////////////////////////////////
    // Radius Match kernel caller

-    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
-    void radiusMatch_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
-        const DevMem2Di& trainIdx, const DevMem2D_<unsigned int>& nMatches, const DevMem2Df& distance,
+    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_STACK, typename Dist, typename T, typename Mask>
+    void radiusMatchSimple_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
+        const DevMem2Di& trainIdx, const DevMem2Df& distance, unsigned int* nMatches,
        cudaStream_t stream)
    {
+        StaticAssert<BLOCK_STACK >= BLOCK_DIM_Y>::check();
+        StaticAssert<BLOCK_STACK <= BLOCK_DIM_X * BLOCK_DIM_Y>::check();
+        const dim3 grid(query.rows, 1, 1);
        const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
-        const dim3 grid(query.rows, divUp(train.rows, BLOCK_DIM_Y), 1);
-        radiusMatch<BLOCK_DIM_X, BLOCK_DIM_Y, Dist, T><<<grid, threads, 0, stream>>>(query, train, maxDistance, mask, trainIdx, nMatches.data, distance);
+        radiusMatch<BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_STACK, VecDiffGlobal<BLOCK_DIM_X, T>, Dist, T>
+            <<<grid, threads, 0, stream>>>(query, train, maxDistance, mask, trainIdx, distance, nMatches);
        cudaSafeCall( cudaGetLastError() );
        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

+    template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int BLOCK_STACK, int MAX_LEN, bool LEN_EQ_MAX_LEN, typename Dist, typename T, typename Mask>
+    void radiusMatchCached_caller(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
+        const DevMem2Di& trainIdx, const DevMem2Df& distance, unsigned int* nMatches,
+        cudaStream_t stream)
+    {
+        StaticAssert<BLOCK_STACK >= BLOCK_DIM_Y>::check();
+        StaticAssert<BLOCK_STACK <= BLOCK_DIM_X * BLOCK_DIM_Y>::check();
+        StaticAssert<BLOCK_DIM_X * BLOCK_DIM_Y >= MAX_LEN>::check();
+        StaticAssert<MAX_LEN % BLOCK_DIM_X == 0>::check();
+        const dim3 grid(query.rows, 1, 1);
+        const dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
+        radiusMatch<BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_STACK, VecDiffCachedRegister<BLOCK_DIM_X, MAX_LEN, LEN_EQ_MAX_LEN, typename Dist::value_type>, Dist, T>
+            <<<grid, threads, 0, stream>>>(query, train, maxDistance, mask, trainIdx, distance, nMatches);
+        cudaSafeCall( cudaGetLastError() );
+        if (stream == 0)
...
...
@@ -115,13 +179,59 @@ namespace cv { namespace gpu { namespace bfmatcher
template <typename Dist, typename T, typename Mask>
void radiusMatchDispatcher(const DevMem2D_<T>& query, const DevMem2D_<T>& train, float maxDistance, const Mask& mask,
-        const DevMem2D& trainIdx, const DevMem2D& nMatches, const DevMem2D& distance,
+        const DevMem2D& trainIdx, const DevMem2D& distance, const DevMem2D& nMatches,
        cudaStream_t stream)
    {
-        radiusMatch_caller<16, 16, Dist>(query, train, maxDistance, mask,
-            static_cast<DevMem2Di>(trainIdx), static_cast< const DevMem2D_<unsigned int> >(nMatches), static_cast<DevMem2Df>(distance),
if (query.cols < 64)
{
radiusMatchCached_caller<16, 16, 64, 64, false, Dist>(
query, train, maxDistance, mask,
static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Df>(distance), (unsigned int*)nMatches.data,
stream);
}
else if (query.cols == 64)
{
radiusMatchCached_caller<16, 16, 64, 64, true, Dist>(
query, train, maxDistance, mask,
static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Df>(distance), (unsigned int*)nMatches.data,
stream);
}
else if (query.cols < 128)
{
radiusMatchCached_caller<16, 16, 64, 128, false, Dist>(
query, train, maxDistance, mask,
static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Df>(distance), (unsigned int*)nMatches.data,
stream);
}
else if (query.cols == 128)
{
radiusMatchCached_caller<16, 16, 64, 128, true, Dist>(
query, train, maxDistance, mask,
static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Df>(distance), (unsigned int*)nMatches.data,
stream);
}
else if (query.cols < 256)
{
radiusMatchCached_caller<16, 16, 64, 256, false, Dist>(
query, train, maxDistance, mask,
static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Df>(distance), (unsigned int*)nMatches.data,
stream);
}
else if (query.cols == 256)
{
radiusMatchCached_caller<16, 16, 64, 256, true, Dist>(
query, train, maxDistance, mask,
static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Df>(distance), (unsigned int*)nMatches.data,
stream);
}
else
{
radiusMatchSimple_caller<16, 16, 64, Dist>(
query, train, maxDistance, mask,
static_cast<DevMem2Di>(trainIdx), static_cast<DevMem2Df>(distance), (unsigned int*)nMatches.data,
stream);
}
}
///////////////////////////////////////////////////////////////////////////////
// Radius Match caller
...
...
@@ -133,13 +243,13 @@ namespace cv { namespace gpu { namespace bfmatcher
        if (mask.data)
        {
            radiusMatchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
-                trainIdx, nMatches, distance,
+                trainIdx, distance, nMatches,
                stream);
        }
        else
        {
            radiusMatchDispatcher< L1Dist<T> >(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
-                trainIdx, nMatches, distance,
+                trainIdx, distance, nMatches,
                stream);
        }
    }
...
...
@@ -158,13 +268,13 @@ namespace cv { namespace gpu { namespace bfmatcher
        if (mask.data)
        {
            radiusMatchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
-                trainIdx, nMatches, distance,
+                trainIdx, distance, nMatches,
                stream);
        }
        else
        {
            radiusMatchDispatcher<L2Dist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
-                trainIdx, nMatches, distance,
+                trainIdx, distance, nMatches,
                stream);
        }
    }
...
...
@@ -183,13 +293,13 @@ namespace cv { namespace gpu { namespace bfmatcher
        if (mask.data)
        {
            radiusMatchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, SingleMask(mask),
-                trainIdx, nMatches, distance,
+                trainIdx, distance, nMatches,
                stream);
        }
        else
        {
            radiusMatchDispatcher<HammingDist>(static_cast< DevMem2D_<T> >(query), static_cast< DevMem2D_<T> >(train), maxDistance, WithOutMask(),
-                trainIdx, nMatches, distance,
+                trainIdx, distance, nMatches,
                stream);
        }
    }
...
...
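The heart of the optimization in bf_radius_match.cu is the per-block shared-memory stack of candidate matches: threads push (trainIdx, distance) pairs with an atomicInc on a shared counter, and the stack is flushed to global memory before the next pass could overflow it, so each query's results land compactly without a global atomic per match. A standalone CUDA sketch of that accumulation pattern (hypothetical kernel, independent of the OpenCV types; needs shared-memory atomics, i.e. sm_12 and newer, matching the kernel's __CUDA_ARCH__ >= 120 guard):

#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical, simplified sketch (not the OpenCV code): one block scans an array of
// distances, pushes indices of values below maxDistance onto a shared-memory stack
// with atomicInc, and flushes the stack to global memory before it can overflow.
const int BLOCK_SIZE = 128;   // threads per block
const int STACK_SIZE = 256;   // shared stack capacity, >= BLOCK_SIZE pushes per pass

__device__ void flushStack(const int* sidx, const float* sdist, unsigned int scount,
                           int* outIdx, float* outDist, int& globalCount, int tid)
{
    // Copy the stacked matches to global memory, strided over the block's threads.
    for (int j = tid; j < (int)scount; j += BLOCK_SIZE)
    {
        outIdx[globalCount + j]  = sidx[j];
        outDist[globalCount + j] = sdist[j];
    }
    __syncthreads();               // everyone has read globalCount before it advances
    if (tid == 0)
        globalCount += (int)scount;
    __syncthreads();
}

__global__ void collectWithinRadius(const float* dist, int n, float maxDistance,
                                    int* outIdx, float* outDist, unsigned int* outCount)
{
    __shared__ int   sidx[STACK_SIZE];
    __shared__ float sdist[STACK_SIZE];
    __shared__ unsigned int scount;   // stack top
    __shared__ int globalCount;       // matches already written to global memory

    const int tid = threadIdx.x;
    if (tid == 0) { scount = 0; globalCount = 0; }
    __syncthreads();

    for (int base = 0; base < n; base += BLOCK_SIZE)
    {
        const int i = base + tid;
        if (i < n && dist[i] < maxDistance)
        {
            unsigned int slot = atomicInc(&scount, (unsigned int)-1);   // push
            sidx[slot]  = i;
            sdist[slot] = dist[i];
        }
        __syncthreads();

        // Flush while there is still room for one more full pass of pushes.
        if (scount > (unsigned int)(STACK_SIZE - BLOCK_SIZE))
        {
            flushStack(sidx, sdist, scount, outIdx, outDist, globalCount, tid);
            if (tid == 0)
                scount = 0;
        }
        __syncthreads();
    }

    flushStack(sidx, sdist, scount, outIdx, outDist, globalCount, tid);  // final flush
    if (tid == 0)
        *outCount = (unsigned int)globalCount;
}

int main()
{
    const int n = 1000;
    float h_dist[n];
    for (int i = 0; i < n; ++i)
        h_dist[i] = (float)(i % 10);      // toy distances 0..9

    float* d_dist;  int* d_idx;  float* d_out;  unsigned int* d_count;
    cudaMalloc(&d_dist, n * sizeof(float));
    cudaMalloc(&d_idx, n * sizeof(int));
    cudaMalloc(&d_out, n * sizeof(float));
    cudaMalloc(&d_count, sizeof(unsigned int));
    cudaMemcpy(d_dist, h_dist, n * sizeof(float), cudaMemcpyHostToDevice);

    collectWithinRadius<<<1, BLOCK_SIZE>>>(d_dist, n, 3.0f, d_idx, d_out, d_count);

    unsigned int count = 0;
    cudaMemcpy(&count, d_count, sizeof(unsigned int), cudaMemcpyDeviceToHost);
    std::printf("matches within radius: %u (expected 300)\n", count);

    cudaFree(d_dist); cudaFree(d_idx); cudaFree(d_out); cudaFree(d_count);
    return 0;
}

Compared with the previous kernel, which issued one global atomicInc per accepted match, batching through shared memory reduces global traffic to one strided write per flush.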