fix BruteForceMatcher resource distribution

added `__launch_bounds__` attributes for all CUDA kernels

fix BruteForceMatcher resource distribution
added `__launch_bounds__` attributes for all CUDA kernels
d2251687 · Vladislav Vinogradov · 17608f7a · d2251687 · d2251687 · d2251687
Commit d2251687 authored Apr 13, 2015 by Vladislav Vinogradov
Show whitespace changes
Inline Side-by-side

Showing with 17 additions and 0 deletions

bf_knnmatch.cu modules/gpu/src/cuda/bf_knnmatch.cu +9 -0

bf_match.cu modules/gpu/src/cuda/bf_match.cu +6 -0

bf_radius_match.cu modules/gpu/src/cuda/bf_radius_match.cu +2 -0

No files found.
--- a/modules/gpu/src/cuda/bf_knnmatch.cu
+++ b/modules/gpu/src/cuda/bf_knnmatch.cu
@@ -374,6 +374,7 @@ namespace cv { namespace gpu { namespace device
        }
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
        __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
        {
            extern __shared__ int smem[];
@@ -424,6 +425,7 @@ namespace cv { namespace gpu { namespace device
        }
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
        __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
        {
            extern __shared__ int smem[];
@@ -553,6 +555,7 @@ namespace cv { namespace gpu { namespace device
        }
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
        __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
        {
            extern __shared__ int smem[];
@@ -601,6 +604,7 @@ namespace cv { namespace gpu { namespace device
        }
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
        __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
        {
            extern __shared__ int smem[];
@@ -727,6 +731,7 @@ namespace cv { namespace gpu { namespace device
        }
        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+        __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
        __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int2* bestTrainIdx, float2* bestDistance)
        {
            extern __shared__ int smem[];
@@ -775,6 +780,7 @@ namespace cv { namespace gpu { namespace device
        }
        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+        __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
        __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask, int2* bestTrainIdx, int2* bestImgIdx, float2* bestDistance)
        {
            extern __shared__ int smem[];
@@ -902,6 +908,7 @@ namespace cv { namespace gpu { namespace device
        // Calc distance kernel
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
        __global__ void calcDistanceUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist)
        {
            extern __shared__ int smem[];
@@ -966,6 +973,7 @@ namespace cv { namespace gpu { namespace device
        }
        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+        __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
        __global__ void calcDistance(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, PtrStepf allDist)
        {
            extern __shared__ int smem[];
@@ -1066,6 +1074,7 @@ namespace cv { namespace gpu { namespace device
        // find knn match kernel
        template <int BLOCK_SIZE>
+        __launch_bounds__(BLOCK_SIZE)
        __global__ void findBestMatch(PtrStepSzf allDist, int i, PtrStepi trainIdx, PtrStepf distance)
        {
            const int SMEM_SIZE = BLOCK_SIZE > 64 ? BLOCK_SIZE : 64;

--- a/modules/gpu/src/cuda/bf_match.cu
+++ b/modules/gpu/src/cuda/bf_match.cu
@@ -136,6 +136,7 @@ namespace cv { namespace gpu { namespace device
        }
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
        __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
        {
            extern __shared__ int smem[];
@@ -184,6 +185,7 @@ namespace cv { namespace gpu { namespace device
        }
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
        __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
                                            int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
        {
@@ -296,6 +298,7 @@ namespace cv { namespace gpu { namespace device
        }
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
        __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
        {
            extern __shared__ int smem[];
@@ -342,6 +345,7 @@ namespace cv { namespace gpu { namespace device
        }
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
+        __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
        __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
                                      int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
        {
@@ -451,6 +455,7 @@ namespace cv { namespace gpu { namespace device
        }
        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+        __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
        __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
        {
            extern __shared__ int smem[];
@@ -497,6 +502,7 @@ namespace cv { namespace gpu { namespace device
        }
        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
+        __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
        __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
                              int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
        {

--- a/modules/gpu/src/cuda/bf_radius_match.cu
+++ b/modules/gpu/src/cuda/bf_radius_match.cu
@@ -56,6 +56,7 @@ namespace cv { namespace gpu { namespace device
        // Match Unrolled
        template <int BLOCK_SIZE, int MAX_DESC_LEN, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
+        __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
        __global__ void matchUnrolled(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
            PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
        {
@@ -164,6 +165,7 @@ namespace cv { namespace gpu { namespace device
        // Match
        template <int BLOCK_SIZE, bool SAVE_IMG_IDX, typename Dist, typename T, typename Mask>
+        __launch_bounds__(BLOCK_SIZE * BLOCK_SIZE)
        __global__ void match(const PtrStepSz<T> query, int imgIdx, const PtrStepSz<T> train, float maxDistance, const Mask mask,
            PtrStepi bestTrainIdx, PtrStepi bestImgIdx, PtrStepf bestDistance, unsigned int* nMatches, int maxCount)
        {