Commit d3c4e907 authored by Vladislav Vinogradov

new optimized implementation of BruteForceMatcher_GPU (~2-3x faster)

parent 89be84a3
This diff is collapsed.
#include "perf_precomp.hpp" #include "perf_precomp.hpp"
PERF_TEST_P(DevInfo_DescSize, BruteForceMatcher_match, testing::Combine(testing::ValuesIn(devices()), PERF_TEST_P(DevInfo_DescSize, BruteForceMatcher_match, testing::Combine(testing::ValuesIn(devices()),
testing::Values(64, 128))) testing::Values(64, 128, 256)))
{ {
DeviceInfo devInfo = std::tr1::get<0>(GetParam()); DeviceInfo devInfo = std::tr1::get<0>(GetParam());
int desc_size = std::tr1::get<1>(GetParam()); int desc_size = std::tr1::get<1>(GetParam());
...@@ -19,7 +19,7 @@ PERF_TEST_P(DevInfo_DescSize, BruteForceMatcher_match, testing::Combine(testing: ...@@ -19,7 +19,7 @@ PERF_TEST_P(DevInfo_DescSize, BruteForceMatcher_match, testing::Combine(testing:
BruteForceMatcher_GPU< L2<float> > matcher; BruteForceMatcher_GPU< L2<float> > matcher;
declare.time(0.5).iterations(100); declare.time(3.0);
SIMPLE_TEST_CYCLE() SIMPLE_TEST_CYCLE()
{ {
...@@ -35,7 +35,7 @@ PERF_TEST_P(DevInfo_DescSize, BruteForceMatcher_match, testing::Combine(testing: ...@@ -35,7 +35,7 @@ PERF_TEST_P(DevInfo_DescSize, BruteForceMatcher_match, testing::Combine(testing:
PERF_TEST_P(DevInfo_K_DescSize, BruteForceMatcher_knnMatch, testing::Combine(testing::ValuesIn(devices()), PERF_TEST_P(DevInfo_K_DescSize, BruteForceMatcher_knnMatch, testing::Combine(testing::ValuesIn(devices()),
testing::Values(2, 3), testing::Values(2, 3),
testing::Values(64, 128))) testing::Values(64, 128, 256)))
{ {
DeviceInfo devInfo = std::tr1::get<0>(GetParam()); DeviceInfo devInfo = std::tr1::get<0>(GetParam());
int k = std::tr1::get<1>(GetParam()); int k = std::tr1::get<1>(GetParam());
...@@ -54,11 +54,11 @@ PERF_TEST_P(DevInfo_K_DescSize, BruteForceMatcher_knnMatch, testing::Combine(tes ...@@ -54,11 +54,11 @@ PERF_TEST_P(DevInfo_K_DescSize, BruteForceMatcher_knnMatch, testing::Combine(tes
BruteForceMatcher_GPU< L2<float> > matcher; BruteForceMatcher_GPU< L2<float> > matcher;
declare.time(0.5).iterations(100); declare.time(3.0);
SIMPLE_TEST_CYCLE() SIMPLE_TEST_CYCLE()
{ {
matcher.knnMatch(query, train, trainIdx, distance, allDist, k); matcher.knnMatchSingle(query, train, trainIdx, distance, allDist, k);
} }
Mat trainIdx_host(trainIdx); Mat trainIdx_host(trainIdx);
...@@ -69,7 +69,7 @@ PERF_TEST_P(DevInfo_K_DescSize, BruteForceMatcher_knnMatch, testing::Combine(tes ...@@ -69,7 +69,7 @@ PERF_TEST_P(DevInfo_K_DescSize, BruteForceMatcher_knnMatch, testing::Combine(tes
} }
PERF_TEST_P(DevInfo_DescSize, BruteForceMatcher_radiusMatch, testing::Combine(testing::ValuesIn(devices(SHARED_ATOMICS)), PERF_TEST_P(DevInfo_DescSize, BruteForceMatcher_radiusMatch, testing::Combine(testing::ValuesIn(devices(SHARED_ATOMICS)),
testing::Values(64, 128))) testing::Values(64, 128, 256)))
{ {
DeviceInfo devInfo = std::tr1::get<0>(GetParam()); DeviceInfo devInfo = std::tr1::get<0>(GetParam());
int desc_size = std::tr1::get<1>(GetParam()); int desc_size = std::tr1::get<1>(GetParam());
...@@ -85,7 +85,7 @@ PERF_TEST_P(DevInfo_DescSize, BruteForceMatcher_radiusMatch, testing::Combine(te ...@@ -85,7 +85,7 @@ PERF_TEST_P(DevInfo_DescSize, BruteForceMatcher_radiusMatch, testing::Combine(te
BruteForceMatcher_GPU< L2<float> > matcher; BruteForceMatcher_GPU< L2<float> > matcher;
declare.time(0.5).iterations(100); declare.time(3.0);
SIMPLE_TEST_CYCLE() SIMPLE_TEST_CYCLE()
{ {
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
...@@ -121,7 +121,6 @@ namespace cv { namespace gpu { namespace device ...@@ -121,7 +121,6 @@ namespace cv { namespace gpu { namespace device
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// Reduction // Reduction
// reduction
template <int n, typename T, typename Op> __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op) template <int n, typename T, typename Op> __device__ __forceinline__ void reduce(volatile T* data, T& partial_reduction, int tid, const Op& op)
{ {
StaticAssert<n >= 8 && n <= 512>::check(); StaticAssert<n >= 8 && n <= 512>::check();
...@@ -134,6 +133,13 @@ namespace cv { namespace gpu { namespace device ...@@ -134,6 +133,13 @@ namespace cv { namespace gpu { namespace device
StaticAssert<n >= 8 && n <= 512>::check(); StaticAssert<n >= 8 && n <= 512>::check();
detail::PredValReductionDispatcher<n <= 64>::reduce<n>(myData, myVal, sdata, sval, tid, pred); detail::PredValReductionDispatcher<n <= 64>::reduce<n>(myData, myVal, sdata, sval, tid, pred);
} }
// Reduction entry point that takes a data array (sdata) plus two companion
// value arrays (sval1, sval2) and forwards the work to
// detail::PredVal2ReductionDispatcher.
//
// Template parameters:
//   n    - compile-time reduction size; must satisfy 8 <= n <= 512.
//   T    - element type of sdata / myData.
//   V1   - element type of sval1 / myVal1.
//   V2   - element type of sval2 / myVal2.
//   Pred - type of the predicate object passed through to the dispatcher.
//
// NOTE(review): presumably pred compares T values and the V1/V2 companions of
// the selected element end up in myVal1/myVal2 - confirm against
// detail::PredVal2ReductionDispatcher, which is not visible here.
template <int n, typename T, typename V1, typename V2, typename Pred>
__device__ __forceinline__ void reducePredVal2(volatile T* sdata, T& myData, V1* sval1, V1& myVal1, V2* sval2, V2& myVal2, int tid, const Pred& pred)
{
// Reject unsupported sizes at compile time.
StaticAssert<n >= 8 && n <= 512>::check();
// The dispatcher specialization is selected by whether n <= 64. Note that the
// dispatcher takes the per-thread references first and the arrays second,
// which differs from this function's own parameter order.
detail::PredVal2ReductionDispatcher<n <= 64>::reduce<n>(myData, myVal1, myVal2, sdata, sval1, sval2, tid, pred);
}
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// Solve linear system // Solve linear system
......
...@@ -198,7 +198,7 @@ void GpuMatcher::match(const ImageFeatures &features1, const ImageFeatures &feat ...@@ -198,7 +198,7 @@ void GpuMatcher::match(const ImageFeatures &features1, const ImageFeatures &feat
// Find 1->2 matches // Find 1->2 matches
pair_matches.clear(); pair_matches.clear();
matcher.knnMatch(descriptors1_, descriptors2_, train_idx_, distance_, all_dist_, 2); matcher.knnMatchSingle(descriptors1_, descriptors2_, train_idx_, distance_, all_dist_, 2);
matcher.knnMatchDownload(train_idx_, distance_, pair_matches); matcher.knnMatchDownload(train_idx_, distance_, pair_matches);
for (size_t i = 0; i < pair_matches.size(); ++i) for (size_t i = 0; i < pair_matches.size(); ++i)
{ {
...@@ -215,7 +215,7 @@ void GpuMatcher::match(const ImageFeatures &features1, const ImageFeatures &feat ...@@ -215,7 +215,7 @@ void GpuMatcher::match(const ImageFeatures &features1, const ImageFeatures &feat
// Find 2->1 matches // Find 2->1 matches
pair_matches.clear(); pair_matches.clear();
matcher.knnMatch(descriptors2_, descriptors1_, train_idx_, distance_, all_dist_, 2); matcher.knnMatchSingle(descriptors2_, descriptors1_, train_idx_, distance_, all_dist_, 2);
matcher.knnMatchDownload(train_idx_, distance_, pair_matches); matcher.knnMatchDownload(train_idx_, distance_, pair_matches);
for (size_t i = 0; i < pair_matches.size(); ++i) for (size_t i = 0; i < pair_matches.size(); ++i)
{ {
......
...@@ -413,38 +413,55 @@ TEST(BruteForceMatcher) ...@@ -413,38 +413,55 @@ TEST(BruteForceMatcher)
// Output // Output
vector< vector<DMatch> > matches(2); vector< vector<DMatch> > matches(2);
vector< vector<DMatch> > d_matches(2); gpu::GpuMat d_trainIdx, d_distance, d_allDist, d_nMatches;
SUBTEST << "match"; SUBTEST << "match";
matcher.match(query, train, matches[0]);
CPU_ON; CPU_ON;
matcher.match(query, train, matches[0]); matcher.match(query, train, matches[0]);
CPU_OFF; CPU_OFF;
d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
GPU_ON; GPU_ON;
d_matcher.match(d_query, d_train, d_matches[0]); d_matcher.matchSingle(d_query, d_train, d_trainIdx, d_distance);
GPU_OFF; GPU_OFF;
SUBTEST << "knnMatch"; SUBTEST << "knnMatch, 2";
int knn = 2;
matcher.knnMatch(query, train, matches, 2);
CPU_ON; CPU_ON;
matcher.knnMatch(query, train, matches, knn); matcher.knnMatch(query, train, matches, 2);
CPU_OFF; CPU_OFF;
d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 2);
GPU_ON; GPU_ON;
d_matcher.knnMatch(d_query, d_train, d_matches, knn); d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 2);
GPU_OFF;
SUBTEST << "knnMatch, 3";
matcher.knnMatch(query, train, matches, 3);
CPU_ON;
matcher.knnMatch(query, train, matches, 3);
CPU_OFF;
d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 3);
GPU_ON;
d_matcher.knnMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_allDist, 3);
GPU_OFF; GPU_OFF;
SUBTEST << "radiusMatch"; SUBTEST << "radiusMatch";
float max_distance = 2.0f; float max_distance = 2.0f;
matcher.radiusMatch(query, train, matches, max_distance);
CPU_ON; CPU_ON;
matcher.radiusMatch(query, train, matches, max_distance); matcher.radiusMatch(query, train, matches, max_distance);
CPU_OFF; CPU_OFF;
d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, max_distance);
GPU_ON; GPU_ON;
d_matcher.radiusMatch(d_query, d_train, d_matches, max_distance); d_matcher.radiusMatchSingle(d_query, d_train, d_trainIdx, d_distance, d_nMatches, max_distance);
GPU_OFF; GPU_OFF;
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment