Commit d3c4e907 authored by Vladislav Vinogradov

new optimized implementation of BruteForceMatcher_GPU (~2-3x faster)

parent 89be84a3
This diff is collapsed.
#include "perf_precomp.hpp"
PERF_TEST_P(DevInfo_DescSize, BruteForceMatcher_match, testing::Combine(testing::ValuesIn(devices()),
testing::Values(64, 128)))
testing::Values(64, 128, 256)))
{
DeviceInfo devInfo = std::tr1::get<0>(GetParam());
int desc_size = std::tr1::get<1>(GetParam());
......@@ -19,7 +19,7 @@ PERF_TEST_P(DevInfo_DescSize, BruteForceMatcher_match, testing::Combine(testing:
BruteForceMatcher_GPU< L2<float> > matcher;
declare.time(0.5).iterations(100);
declare.time(3.0);
SIMPLE_TEST_CYCLE()
{
......@@ -35,7 +35,7 @@ PERF_TEST_P(DevInfo_DescSize, BruteForceMatcher_match, testing::Combine(testing:
PERF_TEST_P(DevInfo_K_DescSize, BruteForceMatcher_knnMatch, testing::Combine(testing::ValuesIn(devices()),
testing::Values(2, 3),
testing::Values(64, 128)))
testing::Values(64, 128, 256)))
{
DeviceInfo devInfo = std::tr1::get<0>(GetParam());
int k = std::tr1::get<1>(GetParam());
......@@ -54,11 +54,11 @@ PERF_TEST_P(DevInfo_K_DescSize, BruteForceMatcher_knnMatch, testing::Combine(tes
BruteForceMatcher_GPU< L2<float> > matcher;
declare.time(0.5).iterations(100);
declare.time(3.0);
SIMPLE_TEST_CYCLE()
{
matcher.knnMatch(query, train, trainIdx, distance, allDist, k);
matcher.knnMatchSingle(query, train, trainIdx, distance, allDist, k);
}
Mat trainIdx_host(trainIdx);
......@@ -69,7 +69,7 @@ PERF_TEST_P(DevInfo_K_DescSize, BruteForceMatcher_knnMatch, testing::Combine(tes
}
PERF_TEST_P(DevInfo_DescSize, BruteForceMatcher_radiusMatch, testing::Combine(testing::ValuesIn(devices(SHARED_ATOMICS)),
testing::Values(64, 128)))
testing::Values(64, 128, 256)))
{
DeviceInfo devInfo = std::tr1::get<0>(GetParam());
int desc_size = std::tr1::get<1>(GetParam());
......@@ -85,7 +85,7 @@ PERF_TEST_P(DevInfo_DescSize, BruteForceMatcher_radiusMatch, testing::Combine(te
BruteForceMatcher_GPU< L2<float> > matcher;
declare.time(0.5).iterations(100);
declare.time(3.0);
SIMPLE_TEST_CYCLE()
{
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -121,7 +121,6 @@ namespace cv { namespace gpu { namespace device
///////////////////////////////////////////////////////////////////////////////
// Reduction
// reduction
template <int n, typename T, typename Op> __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
{
StaticAssert<n >= 8 && n <= 512>::check();
......@@ -134,6 +133,13 @@ namespace cv { namespace gpu { namespace device
StaticAssert<n >= 8 && n <= 512>::check();
detail::PredValReductionDispatcher<n <= 64>::reduce<n>(myData, myVal, sdata, sval, tid, pred);
}
// Compile-time-checked entry point for the two-value predicated reduction.
//
// Verifies at compile time that 8 <= n <= 512, then forwards all arguments to
// detail::PredVal2ReductionDispatcher, whose specialization is selected by the
// boolean (n <= 64).
//
// Template parameters:
//   n    - reduction size; must lie in [8, 512] (enforced by StaticAssert).
//   T    - type of the reduced data.
//   V1   - type of the first value carried alongside the data.
//   V2   - type of the second value carried alongside the data.
//   Pred - predicate type forwarded to the dispatcher.
//
// Parameters:
//   sdata, sval1, sval2    - buffers forwarded to the dispatcher
//                            (presumably shared-memory scratch for the data
//                            and its two values - TODO confirm against the
//                            dispatcher implementation).
//   myData, myVal1, myVal2 - per-thread data and values, passed by reference.
//   tid                    - thread index forwarded to the dispatcher.
//   pred                   - predicate forwarded to the dispatcher.
//
// NOTE: the dispatcher takes the per-thread references first and the buffers
// second, i.e. in a different order from this function's own parameter list.
template <int n, typename T, typename V1, typename V2, typename Pred>
__device__ __forceinline__ void reducePredVal2(volatile T* sdata, T& myData, V1* sval1, V1& myVal1, V2* sval2, V2& myVal2, int tid, const Pred& pred)
{
    StaticAssert<n >= 8 && n <= 512>::check();

    // The dispatcher type depends on the template parameter n, so the member
    // template `reduce` must be introduced with the `template` keyword;
    // without it a conforming compiler parses `reduce < n > (...)` as a chain
    // of comparisons.
    detail::PredVal2ReductionDispatcher<n <= 64>::template reduce<n>(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);
}
///////////////////////////////////////////////////////////////////////////////
// Solve linear system
......
......@@ -198,7 +198,7 @@ void GpuMatcher::match(const ImageFeatures &features1, const ImageFeatures &feat
// Find 1->2 matches
pair_matches.clear();
matcher.knnMatch(descriptors1_, descriptors2_, train_idx_, distance_, all_dist_, 2);
matcher.knnMatchSingle(descriptors1_, descriptors2_, train_idx_, distance_, all_dist_, 2);
matcher.knnMatchDownload(train_idx_, distance_, pair_matches);
for (size_t i = 0; i < pair_matches.size(); ++i)
{
......@@ -215,7 +215,7 @@ void GpuMatcher::match(const ImageFeatures &features1, const ImageFeatures &feat
// Find 2->1 matches
pair_matches.clear();
matcher.knnMatch(descriptors2_, descriptors1_, train_idx_, distance_, all_dist_, 2);
matcher.knnMatchSingle(descriptors2_, descriptors1_, train_idx_, distance_, all_dist_, 2);
matcher.knnMatchDownload(train_idx_, distance_, pair_matches);
for (size_t i = 0; i < pair_matches.size(); ++i)
{
......
......@@ -413,38 +413,55 @@ TEST(BruteForceMatcher)
// Output
vector< vector<DMatch> > matches(2);
vector< vector<DMatch> > d_matches(2);
gpu::GpuMat d_trainIdx, d_distance, d_allDist, d_nMatches;
SUBTEST << "match";
matcher.match(query, train, matches[0]);
CPU_ON;
matcher.match(query, train, matches[0]);
CPU_OFF;
d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
GPU_ON;
d_matcher.match(d_query, d_train, d_matches[0]);
d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
GPU_OFF;
SUBTEST << "knnMatch";
int knn = 2;
SUBTEST << "knnMatch, 2";
matcher.knnMatch(query, train, matches, 2);
CPU_ON;
matcher.knnMatch(query, train, matches, knn);
matcher.knnMatch(query, train, matches, 2);
CPU_OFF;
d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 2);
GPU_ON;
d_matcher.knnMatch(d_query, d_train, d_matches, knn);
d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 2);
GPU_OFF;
SUBTEST << "knnMatch, 3";
matcher.knnMatch(query, train, matches, 3);
CPU_ON;
matcher.knnMatch(query, train, matches, 3);
CPU_OFF;
d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 3);
GPU_ON;
d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 3);
GPU_OFF;
SUBTEST << "radiusMatch";
float max_distance = 2.0f;
matcher.radiusMatch(query, train, matches, max_distance);
CPU_ON;
matcher.radiusMatch(query, train, matches, max_distance);
CPU_OFF;
d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, max_distance);
GPU_ON;
d_matcher.radiusMatch(d_query, d_train, d_matches, max_distance);
d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, max_distance);
GPU_OFF;
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment