Commit 8891acb6 authored by Vladislav Vinogradov's avatar Vladislav Vinogradov

added BruteForceMatcher_GPU

parent 77027f60
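Not part of the commit itself: a minimal usage sketch of the new matcher, assuming CV_32F descriptors (one descriptor per row, e.g. from SURF) and the opencv2/gpu/gpu.hpp umbrella header; the function and variable names below are illustrative only.

#include "opencv2/gpu/gpu.hpp"
#include "opencv2/features2d/features2d.hpp"
#include <vector>

void matchOnGpu(const cv::Mat& queryDescsCPU, const cv::Mat& trainDescsCPU)
{
    // Upload descriptors (one descriptor per row, CV_32F).
    cv::gpu::GpuMat queryDescs(queryDescsCPU), trainDescs(trainDescsCPU);

    // L2 brute-force matcher; use BruteForceMatcher_GPU< cv::L1<float> > for L1 distance.
    cv::gpu::BruteForceMatcher_GPU< cv::L2<float> > matcher;

    // One best match per query descriptor.
    std::vector<cv::DMatch> matches;
    matcher.match(queryDescs, trainDescs, matches);

    // k best matches per query descriptor (here k = 2, e.g. for a ratio test).
    std::vector< std::vector<cv::DMatch> > knnMatches;
    matcher.knnMatch(queryDescs, trainDescs, knnMatches, 2);

    // All matches with distance below a threshold.
    std::vector< std::vector<cv::DMatch> > radiusMatches;
    matcher.radiusMatch(queryDescs, trainDescs, radiusMatches, 0.25f);
}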
set(name "gpu")
set(DEPS "opencv_core" "opencv_imgproc" "opencv_objdetect")
set(DEPS "opencv_core" "opencv_imgproc" "opencv_objdetect" "opencv_features2d" "opencv_flann")
set(OPENCV_LINKER_LIBS ${OPENCV_LINKER_LIBS} opencv_gpu)
@@ -48,6 +48,7 @@
#include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/objdetect/objdetect.hpp"
#include "opencv2/gpu/devmem2d.hpp"
#include "opencv2/features2d/features2d.hpp"
namespace cv
{
@@ -1118,7 +1119,152 @@ namespace cv
// Gradients computation results
GpuMat grad, qangle;
};
};
////////////////////////////////// BruteForceMatcher //////////////////////////////////
class CV_EXPORTS BruteForceMatcher_GPU_base
{
public:
enum DistType {L1Dist = 0, L2Dist};
explicit BruteForceMatcher_GPU_base(DistType distType = L2Dist);
// Add descriptors to train descriptor collection.
void add(const std::vector<GpuMat>& descCollection);
// Get train descriptors collection.
const std::vector<GpuMat>& getTrainDescriptors() const;
// Clear train descriptors collection.
void clear();
// Return true if there are no train descriptors in the collection.
bool empty() const;
// Return true if the matcher supports mask in match methods.
bool isMaskSupported() const;
// Find one best match for each query descriptor.
// trainIdx.at<int>(0, queryIdx) will contain the best train index for queryIdx.
// distance.at<float>(0, queryIdx) will contain the distance.
void matchSingle(const GpuMat& queryDescs, const GpuMat& trainDescs,
GpuMat& trainIdx, GpuMat& distance,
const GpuMat& mask = GpuMat());
// Download trainIdx and distance to a CPU vector of DMatch.
static void matchDownload(const GpuMat& trainIdx, const GpuMat& distance, std::vector<DMatch>& matches);
// Find one best match for each query descriptor.
void match(const GpuMat& queryDescs, const GpuMat& trainDescs, std::vector<DMatch>& matches,
const GpuMat& mask = GpuMat());
// Make a GPU collection of train descriptors and masks in a format suitable for the matchCollection function.
void makeGpuCollection(GpuMat& trainCollection, GpuMat& maskCollection,
const std::vector<GpuMat>& masks = std::vector<GpuMat>());
// Find one best match from train collection for each query descriptor.
// trainIdx.at<int>(0, queryIdx) will contain the best train index for queryIdx.
// imgIdx.at<int>(0, queryIdx) will contain the best image index for queryIdx.
// distance.at<float>(0, queryIdx) will contain the distance.
void matchCollection(const GpuMat& queryDescs, const GpuMat& trainCollection,
GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance,
const GpuMat& maskCollection);
// Download trainIdx, imgIdx and distance to a CPU vector of DMatch.
static void matchDownload(const GpuMat& trainIdx, GpuMat& imgIdx, const GpuMat& distance,
std::vector<DMatch>& matches);
// Find one best match from train collection for each query descriptor.
void match(const GpuMat& queryDescs, std::vector<DMatch>& matches,
const std::vector<GpuMat>& masks = std::vector<GpuMat>());
// Find k best matches for each query descriptor (in increasing order of distances).
// trainIdx.at<int>(queryIdx, i) will contain the index of the i-th best train (i < k).
// distance.at<float>(queryIdx, i) will contain the distance.
// allDist is a buffer to store all distances between query descriptors and train descriptors;
// it has size (nQuery, nTrain) and type CV_32F.
// allDist.at<float>(queryIdx, trainIdx) will contain FLT_MAX if trainIdx is one of the k best,
// otherwise it will contain the distance between the queryIdx and trainIdx descriptors.
void knnMatch(const GpuMat& queryDescs, const GpuMat& trainDescs,
GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k, const GpuMat& mask = GpuMat());
// Download trainIdx and distance to a CPU vector of DMatch.
// compactResult is used when the mask is not empty. If compactResult is false, the matches
// vector will have the same size as the number of query descriptors. If compactResult is true,
// the matches vector will not contain matches for fully masked-out query descriptors.
static void knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
// Find k best matches for each query descriptor (in increasing order of distances).
// compactResult is used when the mask is not empty. If compactResult is false, the matches
// vector will have the same size as the number of query descriptors. If compactResult is true,
// the matches vector will not contain matches for fully masked-out query descriptors.
void knnMatch(const GpuMat& queryDescs, const GpuMat& trainDescs,
std::vector< std::vector<DMatch> >& matches, int k, const GpuMat& mask = GpuMat(),
bool compactResult = false);
// Find k best matches for each query descriptor (in increasing order of distances).
// compactResult is used when the mask is not empty. If compactResult is false, the matches
// vector will have the same size as the number of query descriptors. If compactResult is true,
// the matches vector will not contain matches for fully masked-out query descriptors.
void knnMatch(const GpuMat& queryDescs, std::vector< std::vector<DMatch> >& matches, int knn,
const std::vector<GpuMat>& masks = std::vector<GpuMat>(), bool compactResult = false );
// Find best matches for each query descriptor which have a distance less than maxDistance.
// nMatches.at<unsigned int>(0, queryIdx) will contain the match count for queryIdx.
// Beware: nMatches can be greater than trainIdx.cols, which means the matcher did not find all
// matches because it did not have enough memory.
// trainIdx.at<int>(queryIdx, i) will contain the i-th train index (i < min(nMatches.at<unsigned int>(0, queryIdx), trainIdx.cols)).
// distance.at<float>(queryIdx, i) will contain the i-th distance (i < min(nMatches.at<unsigned int>(0, queryIdx), trainIdx.cols)).
// If trainIdx is empty, trainIdx and distance will be created with size nQuery x nTrain;
// otherwise the user can pass preallocated trainIdx and distance of size nQuery x nMaxMatches.
// Matches are not sorted.
void radiusMatch(const GpuMat& queryDescs, const GpuMat& trainDescs,
GpuMat& trainIdx, GpuMat& nMatches, GpuMat& distance, float maxDistance,
const GpuMat& mask = GpuMat());
// Download trainIdx, nMatches and distance to a CPU vector of DMatch.
// Matches will be sorted in increasing order of distances.
// compactResult is used when the mask is not empty. If compactResult is false, the matches
// vector will have the same size as the number of query descriptors. If compactResult is true,
// the matches vector will not contain matches for fully masked-out query descriptors.
static void radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& nMatches, const GpuMat& distance,
std::vector< std::vector<DMatch> >& matches, bool compactResult = false);
// Find best matches for each query descriptor which have a distance less than maxDistance
// (in increasing order of distances).
void radiusMatch(const GpuMat& queryDescs, const GpuMat& trainDescs,
std::vector< std::vector<DMatch> >& matches, float maxDistance,
const GpuMat& mask = GpuMat(), bool compactResult = false);
// Find best matches from train collection for each query descriptor which have distance less than
// maxDistance (in increasing order of distances).
void radiusMatch(const GpuMat& queryDescs, std::vector< std::vector<DMatch> >& matches, float maxDistance,
const std::vector<GpuMat>& masks = std::vector<GpuMat>(), bool compactResult = false);
private:
DistType distType;
std::vector<GpuMat> trainDescCollection;
};
template <class Distance>
class CV_EXPORTS BruteForceMatcher_GPU;
template <typename T>
class CV_EXPORTS BruteForceMatcher_GPU< L1<T> > : public BruteForceMatcher_GPU_base
{
public:
explicit BruteForceMatcher_GPU(L1<T> d = L1<T>()) : BruteForceMatcher_GPU_base(L1Dist) {}
};
template <typename T>
class CV_EXPORTS BruteForceMatcher_GPU< L2<T> > : public BruteForceMatcher_GPU_base
{
public:
explicit BruteForceMatcher_GPU(L2<T> d = L2<T>()) : BruteForceMatcher_GPU_base(L2Dist) {}
};
}
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
using namespace cv;
using namespace cv::gpu;
using namespace std;
#if !defined (HAVE_CUDA)
cv::gpu::BruteForceMatcher_GPU_base::BruteForceMatcher_GPU_base(DistType) { throw_nogpu(); }
void cv::gpu::BruteForceMatcher_GPU_base::add(const vector<GpuMat>&) { throw_nogpu(); }
const vector<GpuMat>& cv::gpu::BruteForceMatcher_GPU_base::getTrainDescriptors() const { throw_nogpu(); return trainDescCollection; }
void cv::gpu::BruteForceMatcher_GPU_base::clear() { throw_nogpu(); }
bool cv::gpu::BruteForceMatcher_GPU_base::empty() const { throw_nogpu(); return true; }
bool cv::gpu::BruteForceMatcher_GPU_base::isMaskSupported() const { throw_nogpu(); return true; }
void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, const GpuMat&) { throw_nogpu(); }
void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat&, const GpuMat&, vector<DMatch>&) { throw_nogpu(); }
void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat&, const GpuMat&, vector<DMatch>&, const GpuMat&) { throw_nogpu(); }
void cv::gpu::BruteForceMatcher_GPU_base::makeGpuCollection(GpuMat&, GpuMat&, const vector<GpuMat>&) { throw_nogpu(); }
void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, const GpuMat&) { throw_nogpu(); }
void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat&, GpuMat&, const GpuMat&, std::vector<DMatch>&) { throw_nogpu(); }
void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat&, std::vector<DMatch>&, const std::vector<GpuMat>&) { throw_nogpu(); }
void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, int, const GpuMat&) { throw_nogpu(); }
void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, bool) { throw_nogpu(); }
void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, int, const GpuMat&, bool) { throw_nogpu(); }
void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat&, std::vector< std::vector<DMatch> >&, int, const std::vector<GpuMat>&, bool) { throw_nogpu(); }
void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, const GpuMat&, GpuMat&, GpuMat&, GpuMat&, float, const GpuMat&) { throw_nogpu(); }
void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat&, const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, bool) { throw_nogpu(); }
void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, const GpuMat&, std::vector< std::vector<DMatch> >&, float, const GpuMat&, bool) { throw_nogpu(); }
void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat&, std::vector< std::vector<DMatch> >&, float, const std::vector<GpuMat>&, bool) { throw_nogpu(); }
#else /* !defined (HAVE_CUDA) */
namespace cv { namespace gpu { namespace bfmatcher
{
template <typename T>
void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template <typename T>
void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template <typename T>
void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx,
const DevMem2Df& distance);
template <typename T>
void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx,
const DevMem2Df& distance);
template <typename T>
void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template <typename T>
void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template <typename T>
void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template <typename T>
void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
}}}
cv::gpu::BruteForceMatcher_GPU_base::BruteForceMatcher_GPU_base(DistType distType_) : distType(distType_)
{
}
////////////////////////////////////////////////////////////////////
// Train collection
void cv::gpu::BruteForceMatcher_GPU_base::add(const vector<GpuMat>& descCollection)
{
trainDescCollection.insert(trainDescCollection.end(), descCollection.begin(), descCollection.end());
}
const vector<GpuMat>& cv::gpu::BruteForceMatcher_GPU_base::getTrainDescriptors() const
{
return trainDescCollection;
}
void cv::gpu::BruteForceMatcher_GPU_base::clear()
{
trainDescCollection.clear();
}
bool cv::gpu::BruteForceMatcher_GPU_base::empty() const
{
return trainDescCollection.empty();
}
bool cv::gpu::BruteForceMatcher_GPU_base::isMaskSupported() const
{
return true;
}
////////////////////////////////////////////////////////////////////
// Match
void cv::gpu::BruteForceMatcher_GPU_base::matchSingle(const GpuMat& queryDescs, const GpuMat& trainDescs,
GpuMat& trainIdx, GpuMat& distance, const GpuMat& mask)
{
using namespace cv::gpu::bfmatcher;
typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
static const match_caller_t match_callers[2][8] =
{
{
matchSingleL1_gpu<unsigned char>, matchSingleL1_gpu<char>, matchSingleL1_gpu<unsigned short>,
matchSingleL1_gpu<short>, matchSingleL1_gpu<int>, matchSingleL1_gpu<float>, 0, 0
},
{
matchSingleL2_gpu<unsigned char>, matchSingleL2_gpu<char>, matchSingleL2_gpu<unsigned short>,
matchSingleL2_gpu<short>, matchSingleL2_gpu<int>, matchSingleL2_gpu<float>, 0, 0
}
};
CV_Assert(queryDescs.channels() == 1);
CV_Assert(trainDescs.cols == queryDescs.cols && trainDescs.type() == queryDescs.type());
const int nQuery = queryDescs.rows;
trainIdx.create(1, nQuery, CV_32S);
distance.create(1, nQuery, CV_32F);
match_caller_t func = match_callers[distType][queryDescs.depth()];
CV_Assert(func != 0);
// For a single train set there is no need to save imgIdx, so we just write imgIdx into the trainIdx buffer.
// trainIdx is stored after imgIdx, so its value is not lost.
func(queryDescs, trainDescs, mask, trainIdx, trainIdx, distance);
}
void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, const GpuMat& distance,
vector<DMatch>& matches)
{
const int nQuery = trainIdx.cols;
Mat trainIdxCPU = trainIdx;
Mat distanceCPU = distance;
matches.clear();
matches.reserve(nQuery);
const int* trainIdx_ptr = trainIdxCPU.ptr<int>();
const float* distance_ptr = distanceCPU.ptr<float>();
for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++distance_ptr)
{
int trainIdx = *trainIdx_ptr;
if (trainIdx == -1)
continue;
float distance = *distance_ptr;
DMatch m(queryIdx, trainIdx, 0, distance);
matches.push_back(m);
}
}
void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat& queryDescs, const GpuMat& trainDescs,
vector<DMatch>& matches, const GpuMat& mask)
{
GpuMat trainIdx, distance;
matchSingle(queryDescs, trainDescs, trainIdx, distance, mask);
matchDownload(trainIdx, distance, matches);
}
void cv::gpu::BruteForceMatcher_GPU_base::makeGpuCollection(GpuMat& trainCollection, GpuMat& maskCollection,
const vector<GpuMat>& masks)
{
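// Pack the DevMem2D headers of all train descriptor matrices (and, if given, the PtrStep headers
// of the masks) into 1 x N byte matrices and upload them, so a kernel can address the whole
// collection through a single device pointer.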
if (masks.empty())
{
Mat trainCollectionCPU(1, trainDescCollection.size(), CV_8UC(sizeof(DevMem2D)));
for (size_t i = 0; i < trainDescCollection.size(); ++i)
{
const GpuMat& trainDescs = trainDescCollection[i];
trainCollectionCPU.ptr<DevMem2D>(0)[i] = trainDescs;
}
trainCollection.upload(trainCollectionCPU);
}
else
{
CV_Assert(masks.size() == trainDescCollection.size());
Mat trainCollectionCPU(1, trainDescCollection.size(), CV_8UC(sizeof(DevMem2D)));
Mat maskCollectionCPU(1, trainDescCollection.size(), CV_8UC(sizeof(PtrStep)));
for (size_t i = 0; i < trainDescCollection.size(); ++i)
{
const GpuMat& trainDescs = trainDescCollection[i];
const GpuMat& mask = masks[i];
CV_Assert(mask.empty() || (mask.type() == CV_8UC1));
trainCollectionCPU.ptr<DevMem2D>(0)[i] = trainDescs;
maskCollectionCPU.ptr<PtrStep>(0)[i] = static_cast<PtrStep>(mask);
}
trainCollection.upload(trainCollectionCPU);
maskCollection.upload(maskCollectionCPU);
}
}
void cv::gpu::BruteForceMatcher_GPU_base::matchCollection(const GpuMat& queryDescs, const GpuMat& trainCollection,
GpuMat& trainIdx, GpuMat& imgIdx, GpuMat& distance, const GpuMat& maskCollection)
{
using namespace cv::gpu::bfmatcher;
typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx,
const DevMem2Df& distance);
static const match_caller_t match_callers[2][8] =
{
{
matchCollectionL1_gpu<unsigned char>, matchCollectionL1_gpu<char>,
matchCollectionL1_gpu<unsigned short>, matchCollectionL1_gpu<short>,
matchCollectionL1_gpu<int>, matchCollectionL1_gpu<float>, 0, 0
},
{
matchCollectionL2_gpu<unsigned char>, matchCollectionL2_gpu<char>,
matchCollectionL2_gpu<unsigned short>, matchCollectionL2_gpu<short>,
matchCollectionL2_gpu<int>, matchCollectionL2_gpu<float>, 0, 0
}
};
CV_Assert(queryDescs.channels() == 1);
const int nQuery = queryDescs.rows;
trainIdx.create(1, nQuery, CV_32S);
imgIdx.create(1, nQuery, CV_32S);
distance.create(1, nQuery, CV_32F);
match_caller_t func = match_callers[distType][queryDescs.depth()];
CV_Assert(func != 0);
func(queryDescs, trainCollection, maskCollection, trainIdx, imgIdx, distance);
}
void cv::gpu::BruteForceMatcher_GPU_base::matchDownload(const GpuMat& trainIdx, GpuMat& imgIdx,
const GpuMat& distance, vector<DMatch>& matches)
{
const int nQuery = trainIdx.cols;
Mat trainIdxCPU = trainIdx;
Mat imgIdxCPU = imgIdx;
Mat distanceCPU = distance;
matches.clear();
matches.reserve(nQuery);
const int* trainIdx_ptr = trainIdxCPU.ptr<int>();
const int* imgIdx_ptr = imgIdxCPU.ptr<int>();
const float* distance_ptr = distanceCPU.ptr<float>();
for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx, ++trainIdx_ptr, ++imgIdx_ptr, ++distance_ptr)
{
int trainIdx = *trainIdx_ptr;
if (trainIdx == -1)
continue;
int imgIdx = *imgIdx_ptr;
float distance = *distance_ptr;
DMatch m(queryIdx, trainIdx, imgIdx, distance);
matches.push_back(m);
}
}
void cv::gpu::BruteForceMatcher_GPU_base::match(const GpuMat& queryDescs, vector<DMatch>& matches,
const vector<GpuMat>& masks)
{
GpuMat trainCollection;
GpuMat maskCollection;
makeGpuCollection(trainCollection, maskCollection, masks);
GpuMat trainIdx, imgIdx, distance;
matchCollection(queryDescs, trainCollection, trainIdx, imgIdx, distance, maskCollection);
matchDownload(trainIdx, imgIdx, distance, matches);
}
////////////////////////////////////////////////////////////////////
// KnnMatch
void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, const GpuMat& trainDescs,
GpuMat& trainIdx, GpuMat& distance, GpuMat& allDist, int k, const GpuMat& mask)
{
using namespace cv::gpu::bfmatcher;
typedef void (*match_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
static const match_caller_t match_callers[2][8] =
{
{
knnMatchL1_gpu<unsigned char>, knnMatchL1_gpu<char>, knnMatchL1_gpu<unsigned short>,
knnMatchL1_gpu<short>, knnMatchL1_gpu<int>, knnMatchL1_gpu<float>, 0, 0
},
{
knnMatchL2_gpu<unsigned char>, knnMatchL2_gpu<char>, knnMatchL2_gpu<unsigned short>,
knnMatchL2_gpu<short>, knnMatchL2_gpu<int>, knnMatchL2_gpu<float>, 0, 0
}
};
CV_Assert(queryDescs.channels() == 1);
const int nQuery = queryDescs.rows;
const int nTrain = trainDescs.rows;
trainIdx.create(nQuery, k, CV_32S);
trainIdx.setTo(Scalar::all(-1));
distance.create(nQuery, k, CV_32F);
allDist.create(nQuery, nTrain, CV_32F);
match_caller_t func = match_callers[distType][queryDescs.depth()];
CV_Assert(func != 0);
func(queryDescs, trainDescs, k, mask, trainIdx, distance, allDist);
}
void cv::gpu::BruteForceMatcher_GPU_base::knnMatchDownload(const GpuMat& trainIdx, const GpuMat& distance,
vector< vector<DMatch> >& matches, bool compactResult)
{
const int nQuery = distance.rows;
const int k = trainIdx.cols;
Mat trainIdxCPU = trainIdx;
Mat distanceCPU = distance;
matches.clear();
matches.reserve(nQuery);
for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx)
{
matches.push_back(vector<DMatch>());
vector<DMatch>& curMatches = matches.back();
curMatches.reserve(k);
int* trainIdx_ptr = trainIdxCPU.ptr<int>(queryIdx);
float* distance_ptr = distanceCPU.ptr<float>(queryIdx);
for (int i = 0; i < k; ++i, ++trainIdx_ptr, ++distance_ptr)
{
int trainIdx = *trainIdx_ptr;
if (trainIdx != -1)
{
float distance = *distance_ptr;
DMatch m(queryIdx, trainIdx, 0, distance);
curMatches.push_back(m);
}
}
if (compactResult && curMatches.empty())
matches.pop_back();
}
}
void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs, const GpuMat& trainDescs,
vector< vector<DMatch> >& matches, int k, const GpuMat& mask, bool compactResult)
{
GpuMat trainIdx, distance, allDist;
knnMatch(queryDescs, trainDescs, trainIdx, distance, allDist, k, mask);
knnMatchDownload(trainIdx, distance, matches, compactResult);
}
namespace
{
class ImgIdxSetter
{
public:
ImgIdxSetter(int imgIdx_) : imgIdx(imgIdx_) {}
void operator()(DMatch& m) const {m.imgIdx = imgIdx;}
private:
int imgIdx;
};
}
void cv::gpu::BruteForceMatcher_GPU_base::knnMatch(const GpuMat& queryDescs,
vector< vector<DMatch> >& matches, int knn, const vector<GpuMat>& masks, bool compactResult)
{
vector< vector<DMatch> > curMatches;
vector<DMatch> temp;
temp.reserve(2 * knn);
matches.resize(queryDescs.rows);
for_each(matches.begin(), matches.end(), bind2nd(mem_fun_ref(&vector<DMatch>::reserve), knn));
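// For each image in the collection: run a single-image knnMatch, tag the results with imgIdx,
// then merge the sorted per-image matches into the global sorted list and keep only the knn best.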
for (size_t imgIdx = 0; imgIdx < trainDescCollection.size(); ++imgIdx)
{
knnMatch(queryDescs, trainDescCollection[imgIdx], curMatches, knn,
masks.empty() ? GpuMat() : masks[imgIdx]);
for (int queryIdx = 0; queryIdx < queryDescs.rows; ++queryIdx)
{
vector<DMatch>& localMatch = curMatches[queryIdx];
vector<DMatch>& globalMatch = matches[queryIdx];
for_each(localMatch.begin(), localMatch.end(), ImgIdxSetter(imgIdx));
temp.clear();
merge(globalMatch.begin(), globalMatch.end(), localMatch.begin(), localMatch.end(), back_inserter(temp));
globalMatch.clear();
const size_t count = std::min((size_t)knn, temp.size());
copy(temp.begin(), temp.begin() + count, back_inserter(globalMatch));
}
}
if (compactResult)
{
vector< vector<DMatch> >::iterator new_end = remove_if(matches.begin(), matches.end(),
mem_fun_ref(&vector<DMatch>::empty));
matches.erase(new_end, matches.end());
}
}
////////////////////////////////////////////////////////////////////
// RadiusMatch
void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, const GpuMat& trainDescs,
GpuMat& trainIdx, GpuMat& nMatches, GpuMat& distance, float maxDistance, const GpuMat& mask)
{
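// trainIdx and distance are allocated as nQuery x nTrain only if the caller did not supply
// preallocated buffers; nMatches counts the hits per query and may exceed trainIdx.cols when a
// smaller user-provided buffer fills up.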
using namespace cv::gpu::bfmatcher;
typedef void (*radiusMatch_caller_t)(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
static const radiusMatch_caller_t radiusMatch_callers[2][8] =
{
{
radiusMatchL1_gpu<unsigned char>, radiusMatchL1_gpu<char>, radiusMatchL1_gpu<unsigned short>,
radiusMatchL1_gpu<short>, radiusMatchL1_gpu<int>, radiusMatchL1_gpu<float>, 0, 0
},
{
radiusMatchL2_gpu<unsigned char>, radiusMatchL2_gpu<char>, radiusMatchL2_gpu<unsigned short>,
radiusMatchL2_gpu<short>, radiusMatchL2_gpu<int>, radiusMatchL2_gpu<float>, 0, 0
}
};
const int nQuery = queryDescs.rows;
const int nTrain = trainDescs.rows;
CV_Assert(queryDescs.channels() == 1);
CV_Assert(trainDescs.type() == queryDescs.type() && trainDescs.cols == queryDescs.cols);
CV_Assert(trainIdx.empty() || trainIdx.rows == nQuery);
nMatches.create(1, nQuery, CV_32SC1);
nMatches.setTo(Scalar::all(0));
if (trainIdx.empty())
{
trainIdx.create(nQuery, nTrain, CV_32SC1);
distance.create(nQuery, nTrain, CV_32FC1);
}
radiusMatch_caller_t func = radiusMatch_callers[distType][queryDescs.depth()];
CV_Assert(func != 0);
func(queryDescs, trainDescs, maxDistance, mask, trainIdx, nMatches.ptr<unsigned int>(), distance);
}
void cv::gpu::BruteForceMatcher_GPU_base::radiusMatchDownload(const GpuMat& trainIdx, const GpuMat& nMatches,
const GpuMat& distance, std::vector< std::vector<DMatch> >& matches, bool compactResult)
{
const int nQuery = trainIdx.rows;
Mat trainIdxCPU = trainIdx;
Mat nMatchesCPU = nMatches;
Mat distanceCPU = distance;
matches.clear();
matches.reserve(nQuery);
const unsigned int* nMatches_ptr = nMatchesCPU.ptr<unsigned int>();
for (int queryIdx = 0; queryIdx < nQuery; ++queryIdx)
{
const int* trainIdx_ptr = trainIdxCPU.ptr<int>(queryIdx);
const float* distance_ptr = distanceCPU.ptr<float>(queryIdx);
const int nMatches = std::min(static_cast<int>(nMatches_ptr[queryIdx]), trainIdx.cols);
if (nMatches == 0)
{
if (!compactResult)
matches.push_back(vector<DMatch>());
continue;
}
matches.push_back(vector<DMatch>());
vector<DMatch>& curMatches = matches.back();
curMatches.reserve(nMatches);
for (int i = 0; i < nMatches; ++i, ++trainIdx_ptr, ++distance_ptr)
{
int trainIdx = *trainIdx_ptr;
float distance = *distance_ptr;
DMatch m(queryIdx, trainIdx, 0, distance);
curMatches.push_back(m);
}
sort(curMatches.begin(), curMatches.end());
}
}
void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, const GpuMat& trainDescs,
vector< vector<DMatch> >& matches, float maxDistance, const GpuMat& mask, bool compactResult)
{
GpuMat trainIdx, nMatches, distance;
radiusMatch(queryDescs, trainDescs, trainIdx, nMatches, distance, maxDistance, mask);
radiusMatchDownload(trainIdx, nMatches, distance, matches, compactResult);
}
void cv::gpu::BruteForceMatcher_GPU_base::radiusMatch(const GpuMat& queryDescs, vector< vector<DMatch> >& matches,
float maxDistance, const vector<GpuMat>& masks, bool compactResult)
{
matches.resize(queryDescs.rows);
vector< vector<DMatch> > curMatches;
for (size_t imgIdx = 0; imgIdx < trainDescCollection.size(); ++imgIdx)
{
radiusMatch(queryDescs, trainDescCollection[imgIdx], curMatches, maxDistance,
masks.empty() ? GpuMat() : masks[imgIdx]);
for (int queryIdx = 0; queryIdx < queryDescs.rows; ++queryIdx)
{
vector<DMatch>& localMatch = curMatches[queryIdx];
vector<DMatch>& globalMatch = matches[queryIdx];
for_each(localMatch.begin(), localMatch.end(), ImgIdxSetter(imgIdx));
const size_t oldSize = globalMatch.size();
copy(localMatch.begin(), localMatch.end(), back_inserter(globalMatch));
inplace_merge(globalMatch.begin(), globalMatch.begin() + oldSize, globalMatch.end());
}
}
if (compactResult)
{
vector< vector<DMatch> >::iterator new_end = remove_if(matches.begin(), matches.end(),
mem_fun_ref(&vector<DMatch>::empty));
matches.erase(new_end, matches.end());
}
}
#endif /* !defined (HAVE_CUDA) */
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "cuda_shared.hpp"
#include "limits_gpu.hpp"
using namespace cv::gpu;
using namespace cv::gpu::device;
namespace cv { namespace gpu { namespace bfmatcher
{
///////////////////////////////////////////////////////////////////////////////////
////////////////////////////////// General funcs //////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
template <bool expr> struct StaticAssert;
template <> struct StaticAssert<true> {static __host__ __device__ void check(){}};
///////////////////////////////////////////////////////////////////////////////
// Mask strategy
class SingleMask
{
public:
explicit SingleMask(const PtrStep& mask_) : mask(mask_) {}
__device__ bool operator()(int queryIdx, int trainIdx) const
{
return mask.ptr(queryIdx)[trainIdx] != 0;
}
private:
PtrStep mask;
};
class MaskCollection
{
public:
explicit MaskCollection(PtrStep* maskCollection_) : maskCollection(maskCollection_) {}
__device__ void nextMask()
{
curMask = *maskCollection++;
}
__device__ bool operator()(int queryIdx, int trainIdx) const
{
return curMask.data == 0 || curMask.ptr(queryIdx)[trainIdx] != 0;
}
private:
PtrStep* maskCollection;
PtrStep curMask;
};
class WithOutMask
{
public:
__device__ void nextMask()
{
}
__device__ bool operator()(int queryIdx, int trainIdx) const
{
return true;
}
};
///////////////////////////////////////////////////////////////////////////////
// Reduce Sum
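// Tree reduction of per-thread partial sums within one block row (sdiff of BLOCK_DIM_X floats).
// The final steps run without __syncthreads() and rely on implicit warp-synchronous execution.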
template <int BLOCK_DIM_X>
__device__ void reduceSum(float* sdiff, float mySum, int tid)
{
sdiff[tid] = mySum;
__syncthreads();
if (BLOCK_DIM_X == 512)
{
if (tid < 256)
{
sdiff[tid] = mySum += sdiff[tid + 256]; __syncthreads();
sdiff[tid] = mySum += sdiff[tid + 128]; __syncthreads();
sdiff[tid] = mySum += sdiff[tid + 64]; __syncthreads();
}
volatile float* smem = sdiff;
smem[tid] = mySum += smem[tid + 32];
smem[tid] = mySum += smem[tid + 16];
smem[tid] = mySum += smem[tid + 8];
smem[tid] = mySum += smem[tid + 4];
smem[tid] = mySum += smem[tid + 2];
smem[tid] = mySum += smem[tid + 1];
}
if (BLOCK_DIM_X == 256)
{
if (tid < 128)
{
sdiff[tid] = mySum += sdiff[tid + 128]; __syncthreads();
sdiff[tid] = mySum += sdiff[tid + 64]; __syncthreads();
}
volatile float* smem = sdiff;
smem[tid] = mySum += smem[tid + 32];
smem[tid] = mySum += smem[tid + 16];
smem[tid] = mySum += smem[tid + 8];
smem[tid] = mySum += smem[tid + 4];
smem[tid] = mySum += smem[tid + 2];
smem[tid] = mySum += smem[tid + 1];
}
if (BLOCK_DIM_X == 128)
{
if (tid < 64)
{
sdiff[tid] = mySum += sdiff[tid + 64]; __syncthreads();
}
volatile float* smem = sdiff;
smem[tid] = mySum += smem[tid + 32];
smem[tid] = mySum += smem[tid + 16];
smem[tid] = mySum += smem[tid + 8];
smem[tid] = mySum += smem[tid + 4];
smem[tid] = mySum += smem[tid + 2];
smem[tid] = mySum += smem[tid + 1];
}
volatile float* smem = sdiff;
if (BLOCK_DIM_X == 64)
{
if (tid < 32)
{
smem[tid] = mySum += smem[tid + 32];
smem[tid] = mySum += smem[tid + 16];
smem[tid] = mySum += smem[tid + 8];
smem[tid] = mySum += smem[tid + 4];
smem[tid] = mySum += smem[tid + 2];
smem[tid] = mySum += smem[tid + 1];
}
}
if (BLOCK_DIM_X == 32)
{
if (tid < 16)
{
smem[tid] = mySum += smem[tid + 16];
smem[tid] = mySum += smem[tid + 8];
smem[tid] = mySum += smem[tid + 4];
smem[tid] = mySum += smem[tid + 2];
smem[tid] = mySum += smem[tid + 1];
}
}
if (BLOCK_DIM_X == 16)
{
if (tid < 8)
{
smem[tid] = mySum += smem[tid + 8];
smem[tid] = mySum += smem[tid + 4];
smem[tid] = mySum += smem[tid + 2];
smem[tid] = mySum += smem[tid + 1];
}
}
if (BLOCK_DIM_X == 8)
{
if (tid < 4)
{
smem[tid] = mySum += smem[tid + 4];
smem[tid] = mySum += smem[tid + 2];
smem[tid] = mySum += smem[tid + 1];
}
}
if (BLOCK_DIM_X == 4)
{
if (tid < 2)
{
smem[tid] = mySum += smem[tid + 2];
smem[tid] = mySum += smem[tid + 1];
}
}
if (BLOCK_DIM_X == 2)
{
if (tid < 1)
{
smem[tid] = mySum += smem[tid + 1];
}
}
}
///////////////////////////////////////////////////////////////////////////////
// loadDescsVals
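// Stages a query descriptor in shared memory, then each thread copies its strided slice into the
// per-thread queryVals array, so later distance computations read the query values from registers.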
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_DESCRIPTORS_LEN, typename T>
__device__ void loadDescsVals(const T* descs, int desc_len, float* smem, float* queryVals)
{
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
if (tid < desc_len)
{
smem[tid] = (float)descs[tid];
}
__syncthreads();
#pragma unroll
for (int i = threadIdx.x; i < MAX_DESCRIPTORS_LEN; i += BLOCK_DIM_X)
{
*queryVals = smem[i];
++queryVals;
}
}
///////////////////////////////////////////////////////////////////////////////
// Distance
template <int BLOCK_DIM_X>
class L1Dist
{
public:
__device__ L1Dist() : mySum(0) {}
__device__ void reduceIter(float val1, float val2)
{
mySum += fabs(val1 - val2);
}
__device__ void reduceAll(float* sdiff, int tid)
{
reduceSum<BLOCK_DIM_X>(sdiff, mySum, tid);
}
static __device__ float finalResult(float res)
{
return res;
}
private:
float mySum;
};
template <int BLOCK_DIM_X>
class L2Dist
{
public:
__device__ L2Dist() : mySum(0) {}
__device__ void reduceIter(float val1, float val2)
{
float reg = val1 - val2;
mySum += reg * reg;
}
__device__ void reduceAll(float* sdiff, int tid)
{
reduceSum<BLOCK_DIM_X>(sdiff, mySum, tid);
}
static __device__ float finalResult(float res)
{
return sqrtf(res);
}
private:
float mySum;
};
///////////////////////////////////////////////////////////////////////////////
// reduceDescDiff
template <int BLOCK_DIM_X, typename Dist, typename T>
__device__ void reduceDescDiff(const T* queryDescs, const T* trainDescs, int desc_len, float* sdiff)
{
const int tid = threadIdx.x;
Dist dist;
for (int i = tid; i < desc_len; i += BLOCK_DIM_X)
dist.reduceIter(queryDescs[i], trainDescs[i]);
dist.reduceAll(sdiff, tid);
}
///////////////////////////////////////////////////////////////////////////////
// reduceDescDiff_smem
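// Distance accumulation against the register-cached query values. UnrollDescDiff unrolls
// MAX_DESCRIPTORS_LEN / BLOCK_DIM_X iterations at compile time; the bounds-checked variant
// (calcCheck) is used when desc_len may be smaller than MAX_DESCRIPTORS_LEN, calcWithoutCheck
// when they are equal.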
template <int N> struct UnrollDescDiff
{
template <typename Dist, typename T>
static __device__ void calcCheck(Dist& dist, const float* queryVals, const T* trainDescs,
int ind, int desc_len)
{
if (ind < desc_len)
dist.reduceIter(*queryVals, trainDescs[ind]);
++queryVals;
UnrollDescDiff<N - 1>::calcCheck(dist, queryVals, trainDescs, ind + blockDim.x, desc_len);
}
template <typename Dist, typename T>
static __device__ void calcWithoutCheck(Dist& dist, const float* queryVals, const T* trainDescs)
{
dist.reduceIter(*queryVals, *trainDescs);
++queryVals;
trainDescs += blockDim.x;
UnrollDescDiff<N - 1>::calcWithoutCheck(dist, queryVals, trainDescs);
}
};
template <> struct UnrollDescDiff<0>
{
template <typename Dist, typename T>
static __device__ void calcCheck(Dist& dist, const float* queryVals, const T* trainDescs,
int ind, int desc_len)
{
}
template <typename Dist, typename T>
static __device__ void calcWithoutCheck(Dist& dist, const float* queryVals, const T* trainDescs)
{
}
};
template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN, bool WITH_OUT_CHECK> struct DescDiffCalculator;
template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN>
struct DescDiffCalculator<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, false>
{
template <typename Dist, typename T>
static __device__ void calc(Dist& dist, const float* queryVals, const T* trainDescs, int desc_len)
{
UnrollDescDiff<MAX_DESCRIPTORS_LEN / BLOCK_DIM_X>::calcCheck(dist, queryVals, trainDescs,
threadIdx.x, desc_len);
}
};
template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN>
struct DescDiffCalculator<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, true>
{
template <typename Dist, typename T>
static __device__ void calc(Dist& dist, const float* queryVals, const T* trainDescs, int desc_len)
{
UnrollDescDiff<MAX_DESCRIPTORS_LEN / BLOCK_DIM_X>::calcWithoutCheck(dist, queryVals,
trainDescs + threadIdx.x);
}
};
template <int BLOCK_DIM_X, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN, typename Dist, typename T>
__device__ void reduceDescDiff_smem(const float* queryVals, const T* trainDescs, int desc_len, float* sdiff)
{
const int tid = threadIdx.x;
Dist dist;
DescDiffCalculator<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN>::calc(dist, queryVals,
trainDescs, desc_len);
dist.reduceAll(sdiff, tid);
}
///////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////// Match //////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// warpReduceMin
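// Warp-level min-reduction over the per-row minima, carrying the corresponding train and image
// indices along with the distance.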
template <int BLOCK_DIM_Y>
__device__ void warpReduceMin(int tid, volatile float* sdata, volatile int* strainIdx, volatile int* simgIdx)
{
float minSum = sdata[tid];
if (BLOCK_DIM_Y >= 64)
{
float reg = sdata[tid + 32];
if (reg < minSum)
{
sdata[tid] = minSum = reg;
strainIdx[tid] = strainIdx[tid + 32];
simgIdx[tid] = simgIdx[tid + 32];
}
}
if (BLOCK_DIM_Y >= 32)
{
float reg = sdata[tid + 16];
if (reg < minSum)
{
sdata[tid] = minSum = reg;
strainIdx[tid] = strainIdx[tid + 16];
simgIdx[tid] = simgIdx[tid + 16];
}
}
if (BLOCK_DIM_Y >= 16)
{
float reg = sdata[tid + 8];
if (reg < minSum)
{
sdata[tid] = minSum = reg;
strainIdx[tid] = strainIdx[tid + 8];
simgIdx[tid] = simgIdx[tid + 8];
}
}
if (BLOCK_DIM_Y >= 8)
{
float reg = sdata[tid + 4];
if (reg < minSum)
{
sdata[tid] = minSum = reg;
strainIdx[tid] = strainIdx[tid + 4];
simgIdx[tid] = simgIdx[tid + 4];
}
}
if (BLOCK_DIM_Y >= 4)
{
float reg = sdata[tid + 2];
if (reg < minSum)
{
sdata[tid] = minSum = reg;
strainIdx[tid] = strainIdx[tid + 2];
simgIdx[tid] = simgIdx[tid + 2];
}
}
if (BLOCK_DIM_Y >= 2)
{
float reg = sdata[tid + 1];
if (reg < minSum)
{
sdata[tid] = minSum = reg;
strainIdx[tid] = strainIdx[tid + 1];
simgIdx[tid] = simgIdx[tid + 1];
}
}
}
///////////////////////////////////////////////////////////////////////////////
// findBestMatch
template <int BLOCK_DIM_Y, typename Dist>
__device__ void findBestMatch(int queryIdx, float myMin, int myBestTrainIdx, int myBestImgIdx,
float* smin, int* strainIdx, int* simgIdx, int* trainIdx, int* imgIdx, float* distance)
{
if (threadIdx.x == 0)
{
smin[threadIdx.y] = myMin;
strainIdx[threadIdx.y] = myBestTrainIdx;
simgIdx[threadIdx.y] = myBestImgIdx;
}
__syncthreads();
const int tid = threadIdx.y * blockDim.x + threadIdx.x;
if (tid < 32)
warpReduceMin<BLOCK_DIM_Y>(tid, smin, strainIdx, simgIdx);
if (threadIdx.x == 0 && threadIdx.y == 0)
{
float minSum = smin[0];
int bestTrainIdx = strainIdx[0];
int bestImgIdx = simgIdx[0];
imgIdx[queryIdx] = bestImgIdx;
trainIdx[queryIdx] = bestTrainIdx;
distance[queryIdx] = Dist::finalResult(minSum);
}
}
///////////////////////////////////////////////////////////////////////////////
// ReduceDescCalculator
template <int BLOCK_DIM_X, typename Dist, typename T>
class ReduceDescCalculatorSimple
{
public:
__device__ void prepare(const T* queryDescs_, int, float*)
{
queryDescs = queryDescs_;
}
__device__ void calc(const T* trainDescs, int desc_len, float* sdiff_row) const
{
reduceDescDiff<BLOCK_DIM_X, Dist>(queryDescs, trainDescs, desc_len, sdiff_row);
}
private:
const T* queryDescs;
};
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN,
typename Dist, typename T>
class ReduceDescCalculatorSmem
{
public:
__device__ void prepare(const T* queryDescs, int desc_len, float* smem)
{
loadDescsVals<BLOCK_DIM_X, BLOCK_DIM_Y, MAX_DESCRIPTORS_LEN>(queryDescs, desc_len, smem, queryVals);
}
__device__ void calc(const T* trainDescs, int desc_len, float* sdiff_row) const
{
reduceDescDiff_smem<BLOCK_DIM_X, MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN, Dist>(queryVals, trainDescs,
desc_len, sdiff_row);
}
private:
float queryVals[MAX_DESCRIPTORS_LEN / BLOCK_DIM_X];
};
///////////////////////////////////////////////////////////////////////////////
// matchDescs loop
template <typename ReduceDescCalculator, typename T, typename Mask>
__device__ void matchDescs(int queryIdx, const int imgIdx, const DevMem2D_<T>& trainDescs_,
const Mask& m, const ReduceDescCalculator& reduceDescCalc,
float* sdiff_row, float& myMin, int& myBestTrainIdx, int& myBestImgIdx)
{
const T* trainDescs = trainDescs_.ptr(threadIdx.y);
const int trainDescsStep = blockDim.y * trainDescs_.step / sizeof(T);
for (int trainIdx = threadIdx.y; trainIdx < trainDescs_.rows;
trainIdx += blockDim.y, trainDescs += trainDescsStep)
{
if (m(queryIdx, trainIdx))
{
reduceDescCalc.calc(trainDescs, trainDescs_.cols, sdiff_row);
if (threadIdx.x == 0)
{
float reg = sdiff_row[0];
if (reg < myMin)
{
myMin = reg;
myBestTrainIdx = trainIdx;
myBestImgIdx = imgIdx;
}
}
}
}
}
///////////////////////////////////////////////////////////////////////////////
// Train collection loop strategy
template <typename T>
class SingleTrain
{
public:
explicit SingleTrain(const DevMem2D_<T>& trainDescs_) : trainDescs(trainDescs_)
{
}
template <typename ReduceDescCalculator, typename Mask>
__device__ void loop(int queryIdx, Mask& m, const ReduceDescCalculator& reduceDescCalc,
float* sdiff_row, float& myMin, int& myBestTrainIdx, int& myBestImgIdx) const
{
matchDescs(queryIdx, 0, trainDescs, m, reduceDescCalc,
sdiff_row, myMin, myBestTrainIdx, myBestImgIdx);
}
__device__ int desc_len() const
{
return trainDescs.cols;
}
private:
DevMem2D_<T> trainDescs;
};
template <typename T>
class TrainCollection
{
public:
TrainCollection(const DevMem2D_<T>* trainCollection_, int nImg_, int desclen_) :
trainCollection(trainCollection_), nImg(nImg_), desclen(desclen_)
{
}
template <typename ReduceDescCalculator, typename Mask>
__device__ void loop(int queryIdx, Mask& m, const ReduceDescCalculator& reduceDescCalc,
float* sdiff_row, float& myMin, int& myBestTrainIdx, int& myBestImgIdx) const
{
for (int imgIdx = 0; imgIdx < nImg; ++imgIdx)
{
DevMem2D_<T> trainDescs = trainCollection[imgIdx];
m.nextMask();
matchDescs(queryIdx, imgIdx, trainDescs, m, reduceDescCalc,
sdiff_row, myMin, myBestTrainIdx, myBestImgIdx);
}
}
__device__ int desc_len() const
{
return desclen;
}
private:
const DevMem2D_<T>* trainCollection;
int nImg;
int desclen;
};
///////////////////////////////////////////////////////////////////////////////
// Match kernel
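// One thread block per query descriptor (blockIdx.x == queryIdx). Threads sharing a threadIdx.y
// cooperate on one query/train distance (reduced over BLOCK_DIM_X), the rows of the block iterate
// over train descriptors, and findBestMatch reduces the per-row minima to the single best match.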
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename ReduceDescCalculator, typename Dist, typename T,
typename Train, typename Mask>
__global__ void match(PtrStep_<T> queryDescs_, Train train, Mask mask, int* trainIdx, int* imgIdx, float* distance)
{
__shared__ float sdiff[BLOCK_DIM_X * BLOCK_DIM_Y];
__shared__ float smin[64];
__shared__ int strainIdx[64];
__shared__ int simgIdx[64];
const int queryIdx = blockIdx.x;
int myBestTrainIdx = -1;
int myBestImgIdx = -1;
float myMin = numeric_limits_gpu<float>::max();
{
float* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y;
Mask m = mask;
ReduceDescCalculator reduceDescCalc;
reduceDescCalc.prepare(queryDescs_.ptr(queryIdx), train.desc_len(), sdiff);
train.loop(queryIdx, m, reduceDescCalc, sdiff_row, myMin, myBestTrainIdx, myBestImgIdx);
}
findBestMatch<BLOCK_DIM_Y, Dist>(queryIdx, myMin, myBestTrainIdx, myBestImgIdx,
smin, strainIdx, simgIdx, trainIdx, imgIdx, distance);
}
///////////////////////////////////////////////////////////////////////////////
// Match kernel callers
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, template <int> class Dist, typename T,
typename Train, typename Mask>
void match_caller(const DevMem2D_<T>& queryDescs, const Train& train,
const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)
{
StaticAssert<BLOCK_DIM_Y <= 64>::check(); // BLOCK_DIM_Y values must be reducible within a single warp
dim3 grid(queryDescs.rows, 1, 1);
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
match<BLOCK_DIM_X, BLOCK_DIM_Y, ReduceDescCalculatorSimple<BLOCK_DIM_X, Dist<BLOCK_DIM_X>, T>,
Dist<BLOCK_DIM_X>, T><<<grid, threads>>>(queryDescs, train, mask, trainIdx.data,
imgIdx.data, distance.data);
cudaSafeCall( cudaThreadSynchronize() );
}
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, int MAX_DESCRIPTORS_LEN, bool DESC_LEN_EQ_MAX_LEN,
template <int> class Dist, typename T, typename Train, typename Mask>
void match_smem_caller(const DevMem2D_<T>& queryDescs, const Train& train,
const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)
{
StaticAssert<BLOCK_DIM_Y <= 64>::check(); // BLOCK_DIM_Y values must be reducible within a single warp
StaticAssert<BLOCK_DIM_X * BLOCK_DIM_Y >= MAX_DESCRIPTORS_LEN>::check(); // block size must be at least the descriptor length
StaticAssert<MAX_DESCRIPTORS_LEN % BLOCK_DIM_X == 0>::check(); // max descriptor length must be divisible by BLOCK_DIM_X
dim3 grid(queryDescs.rows, 1, 1);
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
match<BLOCK_DIM_X, BLOCK_DIM_Y, ReduceDescCalculatorSmem<BLOCK_DIM_X, BLOCK_DIM_Y,
MAX_DESCRIPTORS_LEN, DESC_LEN_EQ_MAX_LEN, Dist<BLOCK_DIM_X>, T>,
Dist<BLOCK_DIM_X>, T><<<grid, threads>>>(queryDescs, train, mask, trainIdx.data,
imgIdx.data, distance.data);
cudaSafeCall( cudaThreadSynchronize() );
}
///////////////////////////////////////////////////////////////////////////////
// Match kernel chooser
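// Picks a kernel specialization by descriptor length: shared-memory/register-cached variants for
// lengths up to 256 (the boolean flag marks desc_len == MAX_DESCRIPTORS_LEN, which skips bounds
// checks), and the generic kernel otherwise.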
template <template <int> class Dist, typename T, typename Train, typename Mask>
void match_chooser(const DevMem2D_<T>& queryDescs, const Train& train,
const Mask& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)
{
if (queryDescs.cols < 64)
match_smem_caller<16, 16, 64, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
else if (queryDescs.cols == 64)
match_smem_caller<16, 16, 64, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
else if (queryDescs.cols < 128)
match_smem_caller<16, 16, 128, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
else if (queryDescs.cols == 128)
match_smem_caller<16, 16, 128, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
else if (queryDescs.cols < 256)
match_smem_caller<16, 16, 256, false, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
else if (queryDescs.cols == 256)
match_smem_caller<16, 16, 256, true, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
else
match_caller<16, 16, Dist>(queryDescs, train, mask, trainIdx, imgIdx, distance);
cudaSafeCall( cudaThreadSynchronize() );
}
template <typename T>
void matchSingleL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)
{
SingleTrain<T> train((DevMem2D_<T>)trainDescs);
if (mask.data)
{
SingleMask m(mask);
match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance);
}
else
{
match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance);
}
}
template void matchSingleL1_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchSingleL1_gpu<char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchSingleL1_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchSingleL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchSingleL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchSingleL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template <typename T>
void matchSingleL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs,
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)
{
SingleTrain<T> train((DevMem2D_<T>)trainDescs);
if (mask.data)
{
SingleMask m(mask);
match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, m, trainIdx, imgIdx, distance);
}
else
{
match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance);
}
}
template void matchSingleL2_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchSingleL2_gpu<char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchSingleL2_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchSingleL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchSingleL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchSingleL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template <typename T>
void matchCollectionL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)
{
TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, queryDescs.cols);
if (maskCollection.data)
{
MaskCollection mask(maskCollection.data);
match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance);
}
else
{
match_chooser<L1Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance);
}
}
template void matchCollectionL1_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchCollectionL1_gpu<char >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchCollectionL1_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchCollectionL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchCollectionL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchCollectionL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template <typename T>
void matchCollectionL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainCollection,
const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance)
{
TrainCollection<T> train((DevMem2D_<T>*)trainCollection.ptr(), trainCollection.cols, queryDescs.cols);
if (maskCollection.data)
{
MaskCollection mask(maskCollection.data);
match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, mask, trainIdx, imgIdx, distance);
}
else
{
match_chooser<L2Dist>((DevMem2D_<T>)queryDescs, train, WithOutMask(), trainIdx, imgIdx, distance);
}
}
template void matchCollectionL2_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchCollectionL2_gpu<char >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchCollectionL2_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchCollectionL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchCollectionL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
template void matchCollectionL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainCollection, const DevMem2D_<PtrStep>& maskCollection, const DevMem2Di& trainIdx, const DevMem2Di& imgIdx, const DevMem2Df& distance);
///////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////// Knn Match ////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Calc distance kernel
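// Computes the distance between every (query, train) descriptor pair and stores it
// in the allDist matrix. Each block handles one query descriptor (blockIdx.x) and
// BLOCK_DIM_Y train descriptors (blockIdx.y * BLOCK_DIM_Y + threadIdx.y); the
// BLOCK_DIM_X threads of a row cooperatively reduce the per-dimension terms in
// shared memory. Masked-out pairs keep FLT_MAX so they are never selected later.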
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
__global__ void calcDistance(PtrStep_<T> queryDescs_, DevMem2D_<T> trainDescs_, Mask mask, PtrStepf distance)
{
__shared__ float sdiff[BLOCK_DIM_X * BLOCK_DIM_Y];
float* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y;
const int queryIdx = blockIdx.x;
const T* queryDescs = queryDescs_.ptr(queryIdx);
const int trainIdx = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;
if (trainIdx < trainDescs_.rows)
{
const T* trainDescs = trainDescs_.ptr(trainIdx);
float dist = numeric_limits_gpu<float>::max();
if (mask(queryIdx, trainIdx))
{
reduceDescDiff<BLOCK_DIM_X, Dist>(queryDescs, trainDescs, trainDescs_.cols, sdiff_row);
if (threadIdx.x == 0)
{
dist = Dist::finalResult(sdiff_row[0]);
}
}
if (threadIdx.x == 0)
distance.ptr(queryIdx)[trainIdx] = dist;
}
}
///////////////////////////////////////////////////////////////////////////////
// Calc distance kernel caller
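// Launch configuration: grid.x iterates over query descriptors, grid.y covers the
// train descriptors in chunks of BLOCK_DIM_Y rows, so each BLOCK_DIM_X x BLOCK_DIM_Y
// block (16x16 at the call sites below) matches one query against BLOCK_DIM_Y trains.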
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, template <int> class Dist, typename T, typename Mask>
void calcDistance_caller(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs,
const Mask& mask, const DevMem2Df& distance)
{
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
dim3 grid(queryDescs.rows, divUp(trainDescs.rows, BLOCK_DIM_Y), 1);
calcDistance<BLOCK_DIM_X, BLOCK_DIM_Y, Dist<BLOCK_DIM_X>, T><<<grid, threads>>>(
queryDescs, trainDescs, mask, distance);
cudaSafeCall( cudaThreadSynchronize() );
}
///////////////////////////////////////////////////////////////////////////////
// reduceMinIdx
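// Block-wide parallel reduction that finds the minimum value in a row of allDist
// together with the index of that minimum. Each thread first scans a strided portion
// of the row, then the per-thread results are reduced in shared memory; the last 64
// partial results are folded inside a single warp (warpReduceMinIdx), which relies on
// volatile shared memory instead of __syncthreads().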
template <int BLOCK_SIZE>
__device__ void warpReduceMinIdx(volatile float* sdist, volatile int* strainIdx, float& myMin, int tid)
{
if (tid < 32)
{
if (BLOCK_SIZE >= 64)
{
float reg = sdist[tid + 32];
if (reg < myMin)
{
sdist[tid] = myMin = reg;
strainIdx[tid] = strainIdx[tid + 32];
}
}
if (BLOCK_SIZE >= 32)
{
float reg = sdist[tid + 16];
if (reg < myMin)
{
sdist[tid] = myMin = reg;
strainIdx[tid] = strainIdx[tid + 16];
}
}
if (BLOCK_SIZE >= 16)
{
float reg = sdist[tid + 8];
if (reg < myMin)
{
sdist[tid] = myMin = reg;
strainIdx[tid] = strainIdx[tid + 8];
}
}
if (BLOCK_SIZE >= 8)
{
float reg = sdist[tid + 4];
if (reg < myMin)
{
sdist[tid] = myMin = reg;
strainIdx[tid] = strainIdx[tid + 4];
}
}
if (BLOCK_SIZE >= 4)
{
float reg = sdist[tid + 2];
if (reg < myMin)
{
sdist[tid] = myMin = reg;
strainIdx[tid] = strainIdx[tid + 2];
}
}
if (BLOCK_SIZE >= 2)
{
float reg = sdist[tid + 1];
if (reg < myMin)
{
sdist[tid] = myMin = reg;
strainIdx[tid] = strainIdx[tid + 1];
}
}
}
}
template <int BLOCK_SIZE>
__device__ void reduceMinIdx(const float* dist, int n, float* sdist, int* strainIdx)
{
const int tid = threadIdx.x;
float myMin = numeric_limits_gpu<float>::max();
int myMinIdx = -1;
for (int i = tid; i < n; i += BLOCK_SIZE)
{
float reg = dist[i];
if (reg < myMin)
{
myMin = reg;
myMinIdx = i;
}
}
sdist[tid] = myMin;
strainIdx[tid] = myMinIdx;
__syncthreads();
// __syncthreads() must be reached by every thread of the block, so the barrier is
// kept outside the tid-guarded update of each reduction step.
if (BLOCK_SIZE >= 512)
{
if (tid < 256)
{
float reg = sdist[tid + 256];
if (reg < myMin)
{
sdist[tid] = myMin = reg;
strainIdx[tid] = strainIdx[tid + 256];
}
}
__syncthreads();
}
if (BLOCK_SIZE >= 256)
{
if (tid < 128)
{
float reg = sdist[tid + 128];
if (reg < myMin)
{
sdist[tid] = myMin = reg;
strainIdx[tid] = strainIdx[tid + 128];
}
}
__syncthreads();
}
if (BLOCK_SIZE >= 128)
{
if (tid < 64)
{
float reg = sdist[tid + 64];
if (reg < myMin)
{
sdist[tid] = myMin = reg;
strainIdx[tid] = strainIdx[tid + 64];
}
}
__syncthreads();
}
warpReduceMinIdx<BLOCK_SIZE>(sdist, strainIdx, myMin, tid);
}
///////////////////////////////////////////////////////////////////////////////
// find knn match kernel
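// One block per query descriptor. Each launch extracts the current best match from
// the query's row of allDist, writes it to column i of trainIdx/distance, and
// overwrites that entry in allDist with FLT_MAX so the next launch (i + 1) finds the
// next-best match. Rows whose minimum is already FLT_MAX (fewer than k reachable
// train descriptors) leave column i untouched.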
template <int BLOCK_SIZE>
__global__ void findBestMatch(DevMem2Df allDist_, int i, PtrStepi trainIdx_, PtrStepf distance_)
{
const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;
__shared__ float sdist[SMEM_SIZE];
__shared__ int strainIdx[SMEM_SIZE];
const int queryIdx = blockIdx.x;
float* allDist = allDist_.ptr(queryIdx);
int* trainIdx = trainIdx_.ptr(queryIdx);
float* distance = distance_.ptr(queryIdx);
reduceMinIdx<BLOCK_SIZE>(allDist, allDist_.cols, sdist, strainIdx);
if (threadIdx.x == 0)
{
float dist = sdist[0];
if (dist < numeric_limits_gpu<float>::max())
{
int bestIdx = strainIdx[0];
allDist[bestIdx] = numeric_limits_gpu<float>::max();
trainIdx[i] = bestIdx;
distance[i] = dist;
}
}
}
///////////////////////////////////////////////////////////////////////////////
// find knn match kernel caller
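// Selects the k best matches by launching findBestMatch knn times; every launch peels
// off one more match per query from the allDist buffer.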
template <int BLOCK_SIZE>
void findKnnMatch_caller(int knn, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist)
{
dim3 threads(BLOCK_SIZE, 1, 1);
dim3 grid(trainIdx.rows, 1, 1);
for (int i = 0; i < knn; ++i)
findBestMatch<BLOCK_SIZE><<<grid, threads>>>(allDist, i, trainIdx, distance);
cudaSafeCall( cudaThreadSynchronize() );
}
///////////////////////////////////////////////////////////////////////////////
// knn match caller
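// Host entry point: fills the nQuery x nTrain distance buffer (allDist) with or
// without a mask, then extracts the k nearest train descriptors per query.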
template <typename T>
void knnMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist)
{
if (mask.data)
{
calcDistance_caller<16, 16, L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
SingleMask(mask), allDist);
}
else
{
calcDistance_caller<16, 16, L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
WithOutMask(), allDist);
}
findKnnMatch_caller<256>(knn, trainIdx, distance, allDist);
}
template void knnMatchL1_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL1_gpu<char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL1_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template <typename T>
void knnMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn,
const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist)
{
if (mask.data)
{
calcDistance_caller<16, 16, L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
SingleMask(mask), allDist);
}
else
{
calcDistance_caller<16, 16, L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
WithOutMask(), allDist);
}
findKnnMatch_caller<256>(knn, trainIdx, distance, allDist);
}
template void knnMatchL2_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL2_gpu<char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL2_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
template void knnMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, int knn, const DevMem2D& mask, const DevMem2Di& trainIdx, const DevMem2Df& distance, const DevMem2Df& allDist);
///////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////// Radius Match //////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Radius Match kernel
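// Same block layout as calcDistance, but only pairs whose distance is below
// maxDistance are recorded. atomicInc on nMatches[queryIdx] both counts the hits and
// assigns a slot in the output row; hits beyond the preallocated trainIdx_ row width
// are still counted but dropped.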
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, typename Dist, typename T, typename Mask>
__global__ void radiusMatch(PtrStep_<T> queryDescs_, DevMem2D_<T> trainDescs_,
float maxDistance, Mask mask, DevMem2Di trainIdx_, unsigned int* nMatches, PtrStepf distance)
{
__shared__ float sdiff[BLOCK_DIM_X * BLOCK_DIM_Y];
float* sdiff_row = sdiff + BLOCK_DIM_X * threadIdx.y;
const int queryIdx = blockIdx.x;
const T* queryDescs = queryDescs_.ptr(queryIdx);
const int trainIdx = blockIdx.y * BLOCK_DIM_Y + threadIdx.y;
if (trainIdx < trainDescs_.rows)
{
const T* trainDescs = trainDescs_.ptr(trainIdx);
if (mask(queryIdx, trainIdx))
{
reduceDescDiff<BLOCK_DIM_X, Dist>(queryDescs, trainDescs, trainDescs_.cols, sdiff_row);
if (threadIdx.x == 0)
{
float dist = Dist::finalResult(sdiff_row[0]);
if (dist < maxDistance)
{
unsigned int i = atomicInc(nMatches + queryIdx, (unsigned int) -1);
if (i < trainIdx_.cols)
{
distance.ptr(queryIdx)[i] = dist;
trainIdx_.ptr(queryIdx)[i] = trainIdx;
}
}
}
}
}
}
///////////////////////////////////////////////////////////////////////////////
// Radius Match kernel caller
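// Same launch configuration as calcDistance_caller: grid.x per query descriptor,
// grid.y covering the train set in chunks of BLOCK_DIM_Y rows.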
template <int BLOCK_DIM_X, int BLOCK_DIM_Y, template <int> class Dist, typename T, typename Mask>
void radiusMatch_caller(const DevMem2D_<T>& queryDescs, const DevMem2D_<T>& trainDescs,
float maxDistance, const Mask& mask, const DevMem2Di& trainIdx, unsigned int* nMatches,
const DevMem2Df& distance)
{
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
dim3 grid(queryDescs.rows, divUp(trainDescs.rows, BLOCK_DIM_Y), 1);
radiusMatch<BLOCK_DIM_X, BLOCK_DIM_Y, Dist<BLOCK_DIM_X>, T><<<grid, threads>>>(
queryDescs, trainDescs, maxDistance, mask, trainIdx, nMatches, distance);
cudaSafeCall( cudaThreadSynchronize() );
}
///////////////////////////////////////////////////////////////////////////////
// Radius Match kernel chooser
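// Dispatches on the presence of a mask: SingleMask wraps the supplied per-pair mask,
// WithOutMask accepts every (query, train) pair.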
template <typename T>
void radiusMatchL1_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance)
{
if (mask.data)
{
radiusMatch_caller<16, 16, L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
maxDistance, SingleMask(mask), trainIdx, nMatches, distance);
}
else
{
radiusMatch_caller<16, 16, L1Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
maxDistance, WithOutMask(), trainIdx, nMatches, distance);
}
}
template void radiusMatchL1_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL1_gpu<char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL1_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL1_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL1_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL1_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template <typename T>
void radiusMatchL2_gpu(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance,
const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance)
{
if (mask.data)
{
radiusMatch_caller<16, 16, L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
maxDistance, SingleMask(mask), trainIdx, nMatches, distance);
}
else
{
radiusMatch_caller<16, 16, L2Dist>((DevMem2D_<T>)queryDescs, (DevMem2D_<T>)trainDescs,
maxDistance, WithOutMask(), trainIdx, nMatches, distance);
}
}
template void radiusMatchL2_gpu<unsigned char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL2_gpu<char >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL2_gpu<unsigned short>(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL2_gpu<short >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL2_gpu<int >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
template void radiusMatchL2_gpu<float >(const DevMem2D& queryDescs, const DevMem2D& trainDescs, float maxDistance, const DevMem2D& mask, const DevMem2Di& trainIdx, unsigned int* nMatches, const DevMem2Df& distance);
}}}
/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// Intel License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000, Intel Corporation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of Intel Corporation may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "gputest.hpp"
using namespace cv;
using namespace cv::gpu;
using namespace std;
class CV_GpuBruteForceMatcherTest : public CvTest
{
public:
CV_GpuBruteForceMatcherTest() : CvTest( "GPU-BruteForceMatcher", "BruteForceMatcher" ) {}
protected:
void run(int)
{
try
{
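// Test plan: build a random query descriptor set and several random train sets
// (some with random masks), run the CPU BruteForceMatcher and the GPU
// BruteForceMatcher_GPU side by side for match, knnMatch and radiusMatch, and
// require the returned index triples to be identical.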
BruteForceMatcher< L2<float> > matcherCPU;
BruteForceMatcher_GPU< L2<float> > matcherGPU;
vector<DMatch> matchesCPU, matchesGPU;
vector< vector<DMatch> > knnMatchesCPU, knnMatchesGPU;
vector< vector<DMatch> > radiusMatchesCPU, radiusMatchesGPU;
RNG rng(*ts->get_rng());
const int desc_len = rng.uniform(40, 300);
Mat queryCPU(rng.uniform(100, 300), desc_len, CV_32F);
rng.fill(queryCPU, cv::RNG::UNIFORM, cv::Scalar::all(0.0), cv::Scalar::all(1.0));
GpuMat queryGPU(queryCPU);
const int nTrains = rng.uniform(1, 5);
vector<Mat> trainsCPU(nTrains);
vector<GpuMat> trainsGPU(nTrains);
vector<Mat> masksCPU(nTrains);
vector<GpuMat> masksGPU(nTrains);
for (int i = 0; i < nTrains; ++i)
{
Mat train(rng.uniform(100, 300), desc_len, CV_32F);
rng.fill(train, cv::RNG::UNIFORM, cv::Scalar::all(0.0), cv::Scalar::all(1.0));
trainsCPU[i] = train;
trainsGPU[i].upload(train);
bool with_mask = rng.uniform(0, 10) < 5;
if (with_mask)
{
Mat mask(queryCPU.rows, train.rows, CV_8U, Scalar::all(1));
rng.fill(mask, cv::RNG::UNIFORM, cv::Scalar::all(0), cv::Scalar::all(200));
masksCPU[i] = mask;
masksGPU[i].upload(mask);
}
}
matcherCPU.add(trainsCPU);
matcherGPU.add(trainsGPU);
matcherCPU.match(queryCPU, matchesCPU, masksCPU);
matcherGPU.match(queryGPU, matchesGPU, masksGPU);
if (!compareMatches(matchesCPU, matchesGPU))
{
ts->set_failed_test_info(CvTS::FAIL_MISMATCH);
return;
}
const int knn = rng.uniform(3, 10);
matcherCPU.knnMatch(queryCPU, knnMatchesCPU, knn, masksCPU);
matcherGPU.knnMatch(queryGPU, knnMatchesGPU, knn, masksGPU);
if (!compareMatches(knnMatchesCPU, knnMatchesGPU))
{
ts->set_failed_test_info(CvTS::FAIL_MISMATCH);
return;
}
const float maxDistance = rng.uniform(0.01f, 0.3f);
matcherCPU.radiusMatch(queryCPU, radiusMatchesCPU, maxDistance, masksCPU);
matcherGPU.radiusMatch(queryGPU, radiusMatchesGPU, maxDistance, masksGPU);
if (!compareMatches(radiusMatchesCPU, radiusMatchesGPU))
{
ts->set_failed_test_info(CvTS::FAIL_MISMATCH);
return;
}
}
catch (const cv::Exception& e)
{
if (!check_and_treat_gpu_exception(e, ts))
throw;
return;
}
ts->set_failed_test_info(CvTS::OK);
}
private:
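// Flattens kNN/radius results into a single DMatch list so both result shapes can be
// compared with the same routine.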
static void convertMatches(const vector< vector<DMatch> >& knnMatches, vector<DMatch>& matches)
{
matches.clear();
for (size_t i = 0; i < knnMatches.size(); ++i)
copy(knnMatches[i].begin(), knnMatches[i].end(), back_inserter(matches));
}
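// Two match lists are considered equal if they have the same length and the
// (queryIdx, trainIdx, imgIdx) triples coincide element by element; distances are
// not compared.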
static bool compareMatches(const vector<DMatch>& matches1, const vector<DMatch>& matches2)
{
if (matches1.size() != matches2.size())
return false;
struct DMatchEqual : public binary_function<DMatch, DMatch, bool>
{
bool operator()(const DMatch& m1, const DMatch& m2)
{
return m1.imgIdx == m2.imgIdx && m1.queryIdx == m2.queryIdx && m1.trainIdx == m2.trainIdx;
}
};
return equal(matches1.begin(), matches1.end(), matches2.begin(), DMatchEqual());
}
static bool compareMatches(const vector< vector<DMatch> >& matches1, const vector< vector<DMatch> >& matches2)
{
vector<DMatch> m1, m2;
convertMatches(matches1, m1);
convertMatches(matches2, m2);
return compareMatches(m1, m2);
}
} brute_force_matcher_test;
\ No newline at end of file
......@@ -50,7 +50,8 @@
#include <opencv2/gpu/gpu.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/features2d/features2d.hpp>
#include "cxts.h"
/****************************************************************************************/
......