Commit d2251687 authored by Vladislav Vinogradov

fix BruteForceMatcher resource distribution

added `__launch_bounds__` attributes to all CUDA kernels
parent 17608f7a
...@@ -374,6 +374,7 @@ namespace cv { namespace gpu { namespace device ...@@ -374,6 +374,7 @@ namespace cv { namespace gpu { namespace device
} }
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
__global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance) __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
{ {
extern __shared__ int smem[]; extern __shared__ int smem[];
...@@ -424,6 +425,7 @@ namespace cv { namespace gpu { namespace device ...@@ -424,6 +425,7 @@ namespace cv { namespace gpu { namespace device
} }
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
__global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance) __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
{ {
extern __shared__ int smem[]; extern __shared__ int smem[];
...@@ -553,6 +555,7 @@ namespace cv { namespace gpu { namespace device ...@@ -553,6 +555,7 @@ namespace cv { namespace gpu { namespace device
} }
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
__global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance) __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
{ {
extern __shared__ int smem[]; extern __shared__ int smem[];
...@@ -601,6 +604,7 @@ namespace cv { namespace gpu { namespace device ...@@ -601,6 +604,7 @@ namespace cv { namespace gpu { namespace device
} }
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
__global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance) __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
{ {
extern __shared__ int smem[]; extern __shared__ int smem[];
...@@ -727,6 +731,7 @@ namespace cv { namespace gpu { namespace device ...@@ -727,6 +731,7 @@ namespace cv { namespace gpu { namespace device
} }
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
__global__ void match(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance) __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
{ {
extern __shared__ int smem[]; extern __shared__ int smem[];
...@@ -775,6 +780,7 @@ namespace cv { namespace gpu { namespace device ...@@ -775,6 +780,7 @@ namespace cv { namespace gpu { namespace device
} }
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
__global__ void match(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance) __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
{ {
extern __shared__ int smem[]; extern __shared__ int smem[];
...@@ -902,6 +908,7 @@ namespace cv { namespace gpu { namespace device ...@@ -902,6 +908,7 @@ namespace cv { namespace gpu { namespace device
// Calc distance kernel // Calc distance kernel
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
__global__ void calcDistanceUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist) __global__ void calcDistanceUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist)
{ {
extern __shared__ int smem[]; extern __shared__ int smem[];
...@@ -966,6 +973,7 @@ namespace cv { namespace gpu { namespace device ...@@ -966,6 +973,7 @@ namespace cv { namespace gpu { namespace device
} }
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
__global__ void calcDistance(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist) __global__ void calcDistance(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist)
{ {
extern __shared__ int smem[]; extern __shared__ int smem[];
...@@ -1066,6 +1074,7 @@ namespace cv { namespace gpu { namespace device ...@@ -1066,6 +1074,7 @@ namespace cv { namespace gpu { namespace device
// find knn match kernel // find knn match kernel
template <int BLOCK_SIZE> template <int BLOCK_SIZE>
__launch_bounds__(BLOCK_SIZE)
__global__ void findBestMatch(PtrStepSzf allDist, int i, PtrStepi trainIdx, PtrStepf distance) __global__ void findBestMatch(PtrStepSzf allDist, int i, PtrStepi trainIdx, PtrStepf distance)
{ {
const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64; const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;
......
...@@ -136,6 +136,7 @@ namespace cv { namespace gpu { namespace device ...@@ -136,6 +136,7 @@ namespace cv { namespace gpu { namespace device
} }
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
__global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance) __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
{ {
extern __shared__ int smem[]; extern __shared__ int smem[];
...@@ -184,6 +185,7 @@ namespace cv { namespace gpu { namespace device ...@@ -184,6 +185,7 @@ namespace cv { namespace gpu { namespace device
} }
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
__global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
int* bestTrainIdx, int* bestImgIdx, float* bestDistance) int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
{ {
...@@ -296,6 +298,7 @@ namespace cv { namespace gpu { namespace device ...@@ -296,6 +298,7 @@ namespace cv { namespace gpu { namespace device
} }
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
__global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance) __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
{ {
extern __shared__ int smem[]; extern __shared__ int smem[];
...@@ -342,6 +345,7 @@ namespace cv { namespace gpu { namespace device ...@@ -342,6 +345,7 @@ namespace cv { namespace gpu { namespace device
} }
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
__global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
int* bestTrainIdx, int* bestImgIdx, float* bestDistance) int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
{ {
...@@ -451,6 +455,7 @@ namespace cv { namespace gpu { namespace device ...@@ -451,6 +455,7 @@ namespace cv { namespace gpu { namespace device
} }
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
__global__ void match(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance) __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
{ {
extern __shared__ int smem[]; extern __shared__ int smem[];
...@@ -497,6 +502,7 @@ namespace cv { namespace gpu { namespace device ...@@ -497,6 +502,7 @@ namespace cv { namespace gpu { namespace device
} }
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
__global__ void match(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
int* bestTrainIdx, int* bestImgIdx, float* bestDistance) int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
{ {
......
...@@ -56,6 +56,7 @@ namespace cv { namespace gpu { namespace device ...@@ -56,6 +56,7 @@ namespace cv { namespace gpu { namespace device
// Match Unrolled // Match Unrolled
template <int BLOCK_SIZE, int MAX_DESC_LEN, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, int MAX_DESC_LEN, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
__global__ void matchUnrolled(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask, __global__ void matchUnrolled(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount) PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{ {
...@@ -164,6 +165,7 @@ namespace cv { namespace gpu { namespace device ...@@ -164,6 +165,7 @@ namespace cv { namespace gpu { namespace device
// Match // Match
template <int BLOCK_SIZE, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask> template <int BLOCK_SIZE, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
__global__ void match(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask, __global__ void match(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount) PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{ {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment