fix BruteForceMatcher resource distribution

added launch bounds attributes for all CUDA kernels (cherry picked from commit d2251687)

fix BruteForceMatcher resource distribution
added launch bounds attributes for all CUDA kernels (cherry picked from commit d2251687)
df55be3c · Vladislav Vinogradov · Alexander Smorkalov · 55339de6 · df55be3c · df55be3c
Commit df55be3c, authored Apr 13, 2015 by Vladislav Vinogradov; committed by Alexander Smorkalov on Apr 19, 2015
Show whitespace changes
Inline Side-by-side

Showing 3 changed files with 17 additions and 0 deletions

bf_knnmatch.cu modules/gpu/src/cuda/bf_knnmatch.cu +9 -0

bf_match.cu modules/gpu/src/cuda/bf_match.cu +6 -0

bf_radius_match.cu modules/gpu/src/cuda/bf_radius_match.cu +2 -0

No files found.
--- a/modules/gpu/src/cuda/bf_knnmatch.cu
+++ b/modules/gpu/src/cuda/bf_knnmatch.cu
@@ -374,6 +374,7 @@ namespace cv { namespace gpu { namespace device
        }
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
        __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
        {
            extern __shared__ int smem[];
@@ -424,6 +425,7 @@ namespace cv { namespace gpu { namespace device
        }
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
        __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
        {
            extern __shared__ int smem[];
@@ -553,6 +555,7 @@ namespace cv { namespace gpu { namespace device
        }
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
        __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
        {
            extern __shared__ int smem[];
@@ -601,6 +604,7 @@ namespace cv { namespace gpu { namespace device
        }
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
        __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
        {
            extern __shared__ int smem[];
@@ -727,6 +731,7 @@ namespace cv { namespace gpu { namespace device
        }
        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+        __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
        __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
        {
            extern __shared__ int smem[];
@@ -775,6 +780,7 @@ namespace cv { namespace gpu { namespace device
        }
        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+        __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
        __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
        {
            extern __shared__ int smem[];
@@ -902,6 +908,7 @@ namespace cv { namespace gpu { namespace device
        // Calc distance kernel
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
        __global__ void calcDistanceUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist)
        {
            extern __shared__ int smem[];
@@ -966,6 +973,7 @@ namespace cv { namespace gpu { namespace device
        }
        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+        __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
        __global__ void calcDistance(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist)
        {
            extern __shared__ int smem[];
@@ -1066,6 +1074,7 @@ namespace cv { namespace gpu { namespace device
        // find knn match kernel
        template <int BLOCK_SIZE>
+        __launch_bounds__(BLOCK_SIZE)
        __global__ void findBestMatch(PtrStepSzf allDist, int i, PtrStepi trainIdx, PtrStepf distance)
        {
            const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;

--- a/modules/gpu/src/cuda/bf_match.cu
+++ b/modules/gpu/src/cuda/bf_match.cu
@@ -136,6 +136,7 @@ namespace cv { namespace gpu { namespace device
        }
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
        __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
        {
            extern __shared__ int smem[];
@@ -184,6 +185,7 @@ namespace cv { namespace gpu { namespace device
        }
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
        __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
                                            int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
        {
@@ -296,6 +298,7 @@ namespace cv { namespace gpu { namespace device
        }
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
        __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
        {
            extern __shared__ int smem[];
@@ -342,6 +345,7 @@ namespace cv { namespace gpu { namespace device
        }
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
        __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
                                      int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
        {
@@ -451,6 +455,7 @@ namespace cv { namespace gpu { namespace device
        }
        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+        __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
        __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
        {
            extern __shared__ int smem[];
@@ -497,6 +502,7 @@ namespace cv { namespace gpu { namespace device
        }
        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+        __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
        __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
                              int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
        {

--- a/modules/gpu/src/cuda/bf_radius_match.cu
+++ b/modules/gpu/src/cuda/bf_radius_match.cu
@@ -56,6 +56,7 @@ namespace cv { namespace gpu { namespace device
        // Match Unrolled
        template <int BLOCK_SIZE, int MAX_DESC_LEN, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
+        __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
        __global__ void matchUnrolled(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
            PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
        {
@@ -164,6 +165,7 @@ namespace cv { namespace gpu { namespace device
        // Match
        template <int BLOCK_SIZE, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
+        __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
        __global__ void match(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
            PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
        {