Commit df55be3c authored by Vladislav Vinogradov, committed by Alexander Smorkalov

fix BruteForceMatcher resource distribution

added launch bounds attributes for all CUDA kernels
(cherry picked from commit d2251687)
parent 55339de6
......@@ -374,6 +374,7 @@ namespace cv { namespace gpu { namespace device
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
__global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
{
extern __shared__ int smem[];
......@@ -424,6 +425,7 @@ namespace cv { namespace gpu { namespace device
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
__global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
{
extern __shared__ int smem[];
......@@ -553,6 +555,7 @@ namespace cv { namespace gpu { namespace device
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
__global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
{
extern __shared__ int smem[];
......@@ -601,6 +604,7 @@ namespace cv { namespace gpu { namespace device
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
__global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
{
extern __shared__ int smem[];
......@@ -727,6 +731,7 @@ namespace cv { namespace gpu { namespace device
}
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
__global__ void match(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
{
extern __shared__ int smem[];
......@@ -775,6 +780,7 @@ namespace cv { namespace gpu { namespace device
}
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
__global__ void match(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
{
extern __shared__ int smem[];
......@@ -902,6 +908,7 @@ namespace cv { namespace gpu { namespace device
// Calc distance kernel
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
__global__ void calcDistanceUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist)
{
extern __shared__ int smem[];
......@@ -966,6 +973,7 @@ namespace cv { namespace gpu { namespace device
}
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
__global__ void calcDistance(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist)
{
extern __shared__ int smem[];
......@@ -1066,6 +1074,7 @@ namespace cv { namespace gpu { namespace device
// find knn match kernel
template <int BLOCK_SIZE>
__launch_bounds__(BLOCK_SIZE)
__global__ void findBestMatch(PtrStepSzf allDist, int i, PtrStepi trainIdx, PtrStepf distance)
{
const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;
......
......@@ -136,6 +136,7 @@ namespace cv { namespace gpu { namespace device
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
__global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
{
extern __shared__ int smem[];
......@@ -184,6 +185,7 @@ namespace cv { namespace gpu { namespace device
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
__global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
{
......@@ -296,6 +298,7 @@ namespace cv { namespace gpu { namespace device
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
__global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
{
extern __shared__ int smem[];
......@@ -342,6 +345,7 @@ namespace cv { namespace gpu { namespace device
}
template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
__global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
{
......@@ -451,6 +455,7 @@ namespace cv { namespace gpu { namespace device
}
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
__global__ void match(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
{
extern __shared__ int smem[];
......@@ -497,6 +502,7 @@ namespace cv { namespace gpu { namespace device
}
template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
__global__ void match(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
{
......
......@@ -56,6 +56,7 @@ namespace cv { namespace gpu { namespace device
// Match Unrolled
template <int BLOCK_SIZE, int MAX_DESC_LEN, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
__global__ void matchUnrolled(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{
......@@ -164,6 +165,7 @@ namespace cv { namespace gpu { namespace device
// Match
template <int BLOCK_SIZE, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
__launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
__global__ void match(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
{
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment