Commit 05db02fb authored by Vladislav Vinogradov's avatar Vladislav Vinogradov

BruteForceMatcher

parent 7a1874b2
...@@ -43,7 +43,7 @@ ...@@ -43,7 +43,7 @@
#ifndef __OPENCV_GPU_VEC_DISTANCE_HPP__ #ifndef __OPENCV_GPU_VEC_DISTANCE_HPP__
#define __OPENCV_GPU_VEC_DISTANCE_HPP__ #define __OPENCV_GPU_VEC_DISTANCE_HPP__
#include "utility.hpp" #include "reduce.hpp"
#include "functional.hpp" #include "functional.hpp"
#include "detail/vec_distance_detail.hpp" #include "detail/vec_distance_detail.hpp"
...@@ -63,7 +63,7 @@ namespace cv { namespace gpu { namespace device ...@@ -63,7 +63,7 @@ namespace cv { namespace gpu { namespace device
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid) template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
{ {
reduce_old<THREAD_DIM>(smem, mySum, tid, plus<volatile int>()); reduce<THREAD_DIM>(smem, mySum, tid, plus<int>());
} }
__device__ __forceinline__ operator int() const __device__ __forceinline__ operator int() const
...@@ -87,7 +87,7 @@ namespace cv { namespace gpu { namespace device ...@@ -87,7 +87,7 @@ namespace cv { namespace gpu { namespace device
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid) template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
{ {
reduce_old<THREAD_DIM>(smem, mySum, tid, plus<volatile float>()); reduce<THREAD_DIM>(smem, mySum, tid, plus<float>());
} }
__device__ __forceinline__ operator float() const __device__ __forceinline__ operator float() const
...@@ -113,7 +113,7 @@ namespace cv { namespace gpu { namespace device ...@@ -113,7 +113,7 @@ namespace cv { namespace gpu { namespace device
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid) template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(float* smem, int tid)
{ {
reduce_old<THREAD_DIM>(smem, mySum, tid, plus<volatile float>()); reduce<THREAD_DIM>(smem, mySum, tid, plus<float>());
} }
__device__ __forceinline__ operator float() const __device__ __forceinline__ operator float() const
...@@ -138,7 +138,7 @@ namespace cv { namespace gpu { namespace device ...@@ -138,7 +138,7 @@ namespace cv { namespace gpu { namespace device
template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid) template <int THREAD_DIM> __device__ __forceinline__ void reduceAll(int* smem, int tid)
{ {
reduce_old<THREAD_DIM>(smem, mySum, tid, plus<volatile int>()); reduce<THREAD_DIM>(smem, mySum, tid, plus<int>());
} }
__device__ __forceinline__ operator int() const __device__ __forceinline__ operator int() const
......
...@@ -42,10 +42,13 @@ ...@@ -42,10 +42,13 @@
#if !defined CUDA_DISABLER #if !defined CUDA_DISABLER
#include "internal_shared.hpp" #include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp" #include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp" #include "opencv2/gpu/device/datamov_utils.hpp"
#include "opencv2/gpu/device/warp_shuffle.hpp"
namespace cv { namespace gpu { namespace device namespace cv { namespace gpu { namespace device
{ {
...@@ -59,6 +62,45 @@ namespace cv { namespace gpu { namespace device ...@@ -59,6 +62,45 @@ namespace cv { namespace gpu { namespace device
int& bestTrainIdx1, int& bestTrainIdx2, int& bestTrainIdx1, int& bestTrainIdx2,
float* s_distance, int* s_trainIdx) float* s_distance, int* s_trainIdx)
{ {
#if __CUDA_ARCH__ >= 300
(void) s_distance;
(void) s_trainIdx;
float d1, d2;
int i1, i2;
#pragma unroll
for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2)
{
d1 = shfl_down(bestDistance1, i, BLOCK_SIZE);
d2 = shfl_down(bestDistance2, i, BLOCK_SIZE);
i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE);
i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE);
if (bestDistance1 < d1)
{
if (d1 < bestDistance2)
{
bestDistance2 = d1;
bestTrainIdx2 = i1;
}
}
else
{
bestDistance2 = bestDistance1;
bestTrainIdx2 = bestTrainIdx1;
bestDistance1 = d1;
bestTrainIdx1 = i1;
if (d2 < bestDistance2)
{
bestDistance2 = d2;
bestTrainIdx2 = i2;
}
}
}
#else
float myBestDistance1 = numeric_limits<float>::max(); float myBestDistance1 = numeric_limits<float>::max();
float myBestDistance2 = numeric_limits<float>::max(); float myBestDistance2 = numeric_limits<float>::max();
int myBestTrainIdx1 = -1; int myBestTrainIdx1 = -1;
...@@ -122,6 +164,7 @@ namespace cv { namespace gpu { namespace device ...@@ -122,6 +164,7 @@ namespace cv { namespace gpu { namespace device
bestTrainIdx1 = myBestTrainIdx1; bestTrainIdx1 = myBestTrainIdx1;
bestTrainIdx2 = myBestTrainIdx2; bestTrainIdx2 = myBestTrainIdx2;
#endif
} }
template <int BLOCK_SIZE> template <int BLOCK_SIZE>
...@@ -130,6 +173,53 @@ namespace cv { namespace gpu { namespace device ...@@ -130,6 +173,53 @@ namespace cv { namespace gpu { namespace device
int& bestImgIdx1, int& bestImgIdx2, int& bestImgIdx1, int& bestImgIdx2,
float* s_distance, int* s_trainIdx, int* s_imgIdx) float* s_distance, int* s_trainIdx, int* s_imgIdx)
{ {
#if __CUDA_ARCH__ >= 300
(void) s_distance;
(void) s_trainIdx;
(void) s_imgIdx;
float d1, d2;
int i1, i2;
int j1, j2;
#pragma unroll
for (int i = BLOCK_SIZE / 2; i >= 1; i /= 2)
{
d1 = shfl_down(bestDistance1, i, BLOCK_SIZE);
d2 = shfl_down(bestDistance2, i, BLOCK_SIZE);
i1 = shfl_down(bestTrainIdx1, i, BLOCK_SIZE);
i2 = shfl_down(bestTrainIdx2, i, BLOCK_SIZE);
j1 = shfl_down(bestImgIdx1, i, BLOCK_SIZE);
j2 = shfl_down(bestImgIdx2, i, BLOCK_SIZE);
if (bestDistance1 < d1)
{
if (d1 < bestDistance2)
{
bestDistance2 = d1;
bestTrainIdx2 = i1;
bestImgIdx2 = j1;
}
}
else
{
bestDistance2 = bestDistance1;
bestTrainIdx2 = bestTrainIdx1;
bestImgIdx2 = bestImgIdx1;
bestDistance1 = d1;
bestTrainIdx1 = i1;
bestImgIdx1 = j1;
if (d2 < bestDistance2)
{
bestDistance2 = d2;
bestTrainIdx2 = i2;
bestImgIdx2 = j2;
}
}
}
#else
float myBestDistance1 = numeric_limits<float>::max(); float myBestDistance1 = numeric_limits<float>::max();
float myBestDistance2 = numeric_limits<float>::max(); float myBestDistance2 = numeric_limits<float>::max();
int myBestTrainIdx1 = -1; int myBestTrainIdx1 = -1;
...@@ -205,6 +295,7 @@ namespace cv { namespace gpu { namespace device ...@@ -205,6 +295,7 @@ namespace cv { namespace gpu { namespace device
bestImgIdx1 = myBestImgIdx1; bestImgIdx1 = myBestImgIdx1;
bestImgIdx2 = myBestImgIdx2; bestImgIdx2 = myBestImgIdx2;
#endif
} }
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
...@@ -1005,7 +1096,7 @@ namespace cv { namespace gpu { namespace device ...@@ -1005,7 +1096,7 @@ namespace cv { namespace gpu { namespace device
s_trainIdx[threadIdx.x] = bestIdx; s_trainIdx[threadIdx.x] = bestIdx;
__syncthreads(); __syncthreads();
reducePredVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<volatile float>()); reduceKeyVal<BLOCK_SIZE>(s_dist, dist, s_trainIdx, bestIdx, threadIdx.x, less<float>());
if (threadIdx.x == 0) if (threadIdx.x == 0)
{ {
......
...@@ -42,7 +42,9 @@ ...@@ -42,7 +42,9 @@
#if !defined CUDA_DISABLER #if !defined CUDA_DISABLER
#include "internal_shared.hpp" #include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/reduce.hpp"
#include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp" #include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp" #include "opencv2/gpu/device/datamov_utils.hpp"
...@@ -60,12 +62,7 @@ namespace cv { namespace gpu { namespace device ...@@ -60,12 +62,7 @@ namespace cv { namespace gpu { namespace device
s_distance += threadIdx.y * BLOCK_SIZE; s_distance += threadIdx.y * BLOCK_SIZE;
s_trainIdx += threadIdx.y * BLOCK_SIZE; s_trainIdx += threadIdx.y * BLOCK_SIZE;
s_distance[threadIdx.x] = bestDistance; reduceKeyVal<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less<float>());
s_trainIdx[threadIdx.x] = bestTrainIdx;
__syncthreads();
reducePredVal<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less<volatile float>());
} }
template <int BLOCK_SIZE> template <int BLOCK_SIZE>
...@@ -75,13 +72,7 @@ namespace cv { namespace gpu { namespace device ...@@ -75,13 +72,7 @@ namespace cv { namespace gpu { namespace device
s_trainIdx += threadIdx.y * BLOCK_SIZE; s_trainIdx += threadIdx.y * BLOCK_SIZE;
s_imgIdx += threadIdx.y * BLOCK_SIZE; s_imgIdx += threadIdx.y * BLOCK_SIZE;
s_distance[threadIdx.x] = bestDistance; reduceKeyVal<BLOCK_SIZE>(s_distance, bestDistance, smem_tuple(s_trainIdx, s_imgIdx), thrust::tie(bestTrainIdx, bestImgIdx), threadIdx.x, less<float>());
s_trainIdx[threadIdx.x] = bestTrainIdx;
s_imgIdx [threadIdx.x] = bestImgIdx;
__syncthreads();
reducePredVal2<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, s_imgIdx, bestImgIdx, threadIdx.x, less<volatile float>());
} }
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
......
...@@ -42,7 +42,8 @@ ...@@ -42,7 +42,8 @@
#if !defined CUDA_DISABLER #if !defined CUDA_DISABLER
#include "internal_shared.hpp" #include "opencv2/gpu/device/common.hpp"
#include "opencv2/gpu/device/utility.hpp"
#include "opencv2/gpu/device/limits.hpp" #include "opencv2/gpu/device/limits.hpp"
#include "opencv2/gpu/device/vec_distance.hpp" #include "opencv2/gpu/device/vec_distance.hpp"
#include "opencv2/gpu/device/datamov_utils.hpp" #include "opencv2/gpu/device/datamov_utils.hpp"
...@@ -58,8 +59,6 @@ namespace cv { namespace gpu { namespace device ...@@ -58,8 +59,6 @@ namespace cv { namespace gpu { namespace device
__global__ void matchUnrolled(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask, __global__ void matchUnrolled(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount) PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{ {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
extern __shared__ int smem[]; extern __shared__ int smem[];
const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
...@@ -110,8 +109,6 @@ namespace cv { namespace gpu { namespace device ...@@ -110,8 +109,6 @@ namespace cv { namespace gpu { namespace device
bestDistance.ptr(queryIdx)[ind] = distVal; bestDistance.ptr(queryIdx)[ind] = distVal;
} }
} }
#endif
} }
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
...@@ -170,8 +167,6 @@ namespace cv { namespace gpu { namespace device ...@@ -170,8 +167,6 @@ namespace cv { namespace gpu { namespace device
__global__ void match(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask, __global__ void match(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount) PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{ {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
extern __shared__ int smem[]; extern __shared__ int smem[];
const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y; const int queryIdx = blockIdx.y * BLOCK_SIZE + threadIdx.y;
...@@ -221,8 +216,6 @@ namespace cv { namespace gpu { namespace device ...@@ -221,8 +216,6 @@ namespace cv { namespace gpu { namespace device
bestDistance.ptr(queryIdx)[ind] = distVal; bestDistance.ptr(queryIdx)[ind] = distVal;
} }
} }
#endif
} }
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment