Commit 12ce6b5c authored by Namgoo Lee's avatar Namgoo Lee Committed by Alexander Alekhin

Merge pull request #10906 from nglee:dev_cudaFastMultiStreamSafety

cuda_fast : multi stream safety (#10906)

* CUDA_Features2D/FAST Asynchronous test

* cuda_fast : multi stream safety

* Use parallel_for instead of OpenMP
parent c6e1e3ac
...@@ -49,8 +49,6 @@ namespace cv { namespace cuda { namespace device ...@@ -49,8 +49,6 @@ namespace cv { namespace cuda { namespace device
{ {
namespace fast namespace fast
{ {
__device__ unsigned int g_counter = 0;
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// calcKeypoints // calcKeypoints
...@@ -218,7 +216,7 @@ namespace cv { namespace cuda { namespace device ...@@ -218,7 +216,7 @@ namespace cv { namespace cuda { namespace device
} }
template <bool calcScore, class Mask> template <bool calcScore, class Mask>
__global__ void calcKeypoints(const PtrStepSzb img, const Mask mask, short2* kpLoc, const unsigned int maxKeypoints, PtrStepi score, const int threshold) __global__ void calcKeypoints(const PtrStepSzb img, const Mask mask, short2* kpLoc, const unsigned int maxKeypoints, PtrStepi score, const int threshold, unsigned int* d_counter)
{ {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110) #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
...@@ -269,7 +267,7 @@ namespace cv { namespace cuda { namespace device ...@@ -269,7 +267,7 @@ namespace cv { namespace cuda { namespace device
{ {
if (calcScore) score(i, j) = cornerScore(C, v, threshold); if (calcScore) score(i, j) = cornerScore(C, v, threshold);
const unsigned int ind = atomicInc(&g_counter, (unsigned int)(-1)); const unsigned int ind = atomicInc(d_counter, (unsigned int)(-1));
if (ind < maxKeypoints) if (ind < maxKeypoints)
kpLoc[ind] = make_short2(j, i); kpLoc[ind] = make_short2(j, i);
...@@ -279,38 +277,35 @@ namespace cv { namespace cuda { namespace device ...@@ -279,38 +277,35 @@ namespace cv { namespace cuda { namespace device
#endif #endif
} }
int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold, cudaStream_t stream) int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold, unsigned int* d_counter, cudaStream_t stream)
{ {
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
dim3 block(32, 8); dim3 block(32, 8);
dim3 grid; dim3 grid;
grid.x = divUp(img.cols - 6, block.x); grid.x = divUp(img.cols - 6, block.x);
grid.y = divUp(img.rows - 6, block.y); grid.y = divUp(img.rows - 6, block.y);
cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(unsigned int), stream) ); cudaSafeCall( cudaMemsetAsync(d_counter, 0, sizeof(unsigned int), stream) );
if (score.data) if (score.data)
{ {
if (mask.data) if (mask.data)
calcKeypoints<true><<<grid, block, 0, stream>>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold); calcKeypoints<true><<<grid, block, 0, stream>>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold, d_counter);
else else
calcKeypoints<true><<<grid, block, 0, stream>>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold); calcKeypoints<true><<<grid, block, 0, stream>>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold, d_counter);
} }
else else
{ {
if (mask.data) if (mask.data)
calcKeypoints<false><<<grid, block, 0, stream>>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold); calcKeypoints<false><<<grid, block, 0, stream>>>(img, SingleMask(mask), kpLoc, maxKeypoints, score, threshold, d_counter);
else else
calcKeypoints<false><<<grid, block, 0, stream>>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold); calcKeypoints<false><<<grid, block, 0, stream>>>(img, WithOutMask(), kpLoc, maxKeypoints, score, threshold, d_counter);
} }
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
unsigned int count; unsigned int count;
cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream) ); cudaSafeCall( cudaMemcpyAsync(&count, d_counter, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream) );
cudaSafeCall( cudaStreamSynchronize(stream) ); cudaSafeCall( cudaStreamSynchronize(stream) );
...@@ -320,7 +315,7 @@ namespace cv { namespace cuda { namespace device ...@@ -320,7 +315,7 @@ namespace cv { namespace cuda { namespace device
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// nonmaxSuppression // nonmaxSuppression
__global__ void nonmaxSuppression(const short2* kpLoc, int count, const PtrStepSzi scoreMat, short2* locFinal, float* responseFinal) __global__ void nonmaxSuppression(const short2* kpLoc, int count, const PtrStepSzi scoreMat, short2* locFinal, float* responseFinal, unsigned int* d_counter)
{ {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110) #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 110)
...@@ -346,7 +341,7 @@ namespace cv { namespace cuda { namespace device ...@@ -346,7 +341,7 @@ namespace cv { namespace cuda { namespace device
if (ismax) if (ismax)
{ {
const unsigned int ind = atomicInc(&g_counter, (unsigned int)(-1)); const unsigned int ind = atomicInc(d_counter, (unsigned int)(-1));
locFinal[ind] = loc; locFinal[ind] = loc;
responseFinal[ind] = static_cast<float>(score); responseFinal[ind] = static_cast<float>(score);
...@@ -356,23 +351,20 @@ namespace cv { namespace cuda { namespace device ...@@ -356,23 +351,20 @@ namespace cv { namespace cuda { namespace device
#endif #endif
} }
int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response, cudaStream_t stream) int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response, unsigned int* d_counter, cudaStream_t stream)
{ {
void* counter_ptr;
cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, g_counter) );
dim3 block(256); dim3 block(256);
dim3 grid; dim3 grid;
grid.x = divUp(count, block.x); grid.x = divUp(count, block.x);
cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(unsigned int), stream) ); cudaSafeCall( cudaMemsetAsync(d_counter, 0, sizeof(unsigned int), stream) );
nonmaxSuppression<<<grid, block, 0, stream>>>(kpLoc, count, score, loc, response); nonmaxSuppression<<<grid, block, 0, stream>>>(kpLoc, count, score, loc, response, d_counter);
cudaSafeCall( cudaGetLastError() ); cudaSafeCall( cudaGetLastError() );
unsigned int new_count; unsigned int new_count;
cudaSafeCall( cudaMemcpyAsync(&new_count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream) ); cudaSafeCall( cudaMemcpyAsync(&new_count, d_counter, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream) );
cudaSafeCall( cudaStreamSynchronize(stream) ); cudaSafeCall( cudaStreamSynchronize(stream) );
......
...@@ -55,8 +55,8 @@ namespace cv { namespace cuda { namespace device ...@@ -55,8 +55,8 @@ namespace cv { namespace cuda { namespace device
{ {
namespace fast namespace fast
{ {
int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold, cudaStream_t stream); int calcKeypoints_gpu(PtrStepSzb img, PtrStepSzb mask, short2* kpLoc, int maxKeypoints, PtrStepSzi score, int threshold, unsigned int* d_counter, cudaStream_t stream);
int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response, cudaStream_t stream); int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response, unsigned int* d_counter, cudaStream_t stream);
} }
}}} }}}
...@@ -88,6 +88,8 @@ namespace ...@@ -88,6 +88,8 @@ namespace
int threshold_; int threshold_;
bool nonmaxSuppression_; bool nonmaxSuppression_;
int max_npoints_; int max_npoints_;
unsigned int* d_counter;
}; };
FAST_Impl::FAST_Impl(int threshold, bool nonmaxSuppression, int max_npoints) : FAST_Impl::FAST_Impl(int threshold, bool nonmaxSuppression, int max_npoints) :
...@@ -114,6 +116,8 @@ namespace ...@@ -114,6 +116,8 @@ namespace
{ {
using namespace cv::cuda::device::fast; using namespace cv::cuda::device::fast;
cudaSafeCall( cudaMalloc(&d_counter, sizeof(unsigned int)) );
const GpuMat img = _image.getGpuMat(); const GpuMat img = _image.getGpuMat();
const GpuMat mask = _mask.getGpuMat(); const GpuMat mask = _mask.getGpuMat();
...@@ -131,7 +135,7 @@ namespace ...@@ -131,7 +135,7 @@ namespace
score.setTo(Scalar::all(0), stream); score.setTo(Scalar::all(0), stream);
} }
int count = calcKeypoints_gpu(img, mask, kpLoc.ptr<short2>(), max_npoints_, score, threshold_, StreamAccessor::getStream(stream)); int count = calcKeypoints_gpu(img, mask, kpLoc.ptr<short2>(), max_npoints_, score, threshold_, d_counter, StreamAccessor::getStream(stream));
count = std::min(count, max_npoints_); count = std::min(count, max_npoints_);
if (count == 0) if (count == 0)
...@@ -145,7 +149,7 @@ namespace ...@@ -145,7 +149,7 @@ namespace
if (nonmaxSuppression_) if (nonmaxSuppression_)
{ {
count = nonmaxSuppression_gpu(kpLoc.ptr<short2>(), count, score, keypoints.ptr<short2>(LOCATION_ROW), keypoints.ptr<float>(RESPONSE_ROW), StreamAccessor::getStream(stream)); count = nonmaxSuppression_gpu(kpLoc.ptr<short2>(), count, score, keypoints.ptr<short2>(LOCATION_ROW), keypoints.ptr<float>(RESPONSE_ROW), d_counter, StreamAccessor::getStream(stream));
if (count == 0) if (count == 0)
{ {
keypoints.release(); keypoints.release();
...@@ -161,6 +165,8 @@ namespace ...@@ -161,6 +165,8 @@ namespace
kpLoc.colRange(0, count).copyTo(locRow, stream); kpLoc.colRange(0, count).copyTo(locRow, stream);
keypoints.row(1).setTo(Scalar::all(0), stream); keypoints.row(1).setTo(Scalar::all(0), stream);
} }
cudaSafeCall( cudaFree(d_counter) );
} }
void FAST_Impl::convert(InputArray _gpu_keypoints, std::vector<KeyPoint>& keypoints) void FAST_Impl::convert(InputArray _gpu_keypoints, std::vector<KeyPoint>& keypoints)
......
...@@ -44,6 +44,8 @@ ...@@ -44,6 +44,8 @@
#ifdef HAVE_CUDA #ifdef HAVE_CUDA
#include <cuda_runtime_api.h>
namespace opencv_test { namespace { namespace opencv_test { namespace {
///////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////
...@@ -80,15 +82,7 @@ CUDA_TEST_P(FAST, Accuracy) ...@@ -80,15 +82,7 @@ CUDA_TEST_P(FAST, Accuracy)
if (!supportFeature(devInfo, cv::cuda::GLOBAL_ATOMICS)) if (!supportFeature(devInfo, cv::cuda::GLOBAL_ATOMICS))
{ {
try throw SkipTestException("CUDA device doesn't support global atomics");
{
std::vector<cv::KeyPoint> keypoints;
fast->detect(loadMat(image), keypoints);
}
catch (const cv::Exception& e)
{
ASSERT_EQ(cv::Error::StsNotImplemented, e.code);
}
} }
else else
{ {
...@@ -102,6 +96,62 @@ CUDA_TEST_P(FAST, Accuracy) ...@@ -102,6 +96,62 @@ CUDA_TEST_P(FAST, Accuracy)
} }
} }
class FastAsyncParallelLoopBody : public cv::ParallelLoopBody
{
public:
FastAsyncParallelLoopBody(cv::cuda::HostMem& src, cv::cuda::GpuMat* d_kpts, cv::Ptr<cv::cuda::FastFeatureDetector>* d_fast)
: src_(src), kpts_(d_kpts), fast_(d_fast) {}
~FastAsyncParallelLoopBody() {};
void operator()(const cv::Range& r) const
{
for (int i = r.start; i < r.end; i++) {
cv::cuda::Stream stream;
cv::cuda::GpuMat d_src_(src_.rows, src_.cols, CV_8UC1);
d_src_.upload(src_);
fast_[i]->detectAsync(d_src_, kpts_[i], noArray(), stream);
}
}
protected:
cv::cuda::HostMem src_;
cv::cuda::GpuMat* kpts_;
cv::Ptr<cv::cuda::FastFeatureDetector>* fast_;
};
CUDA_TEST_P(FAST, Async)
{
if (!supportFeature(devInfo, cv::cuda::GLOBAL_ATOMICS))
{
throw SkipTestException("CUDA device doesn't support global atomics");
}
else
{
cv::Mat image_ = readImage("features2d/aloe.png", cv::IMREAD_GRAYSCALE);
ASSERT_FALSE(image_.empty());
cv::cuda::HostMem image(image_);
cv::cuda::GpuMat d_keypoints[2];
cv::Ptr<cv::cuda::FastFeatureDetector> d_fast[2];
d_fast[0] = cv::cuda::FastFeatureDetector::create(threshold, nonmaxSuppression);
d_fast[1] = cv::cuda::FastFeatureDetector::create(threshold, nonmaxSuppression);
cv::parallel_for_(cv::Range(0, 2), FastAsyncParallelLoopBody(image, d_keypoints, d_fast));
cudaDeviceSynchronize();
std::vector<cv::KeyPoint> keypoints[2];
d_fast[0]->convert(d_keypoints[0], keypoints[0]);
d_fast[1]->convert(d_keypoints[1], keypoints[1]);
std::vector<cv::KeyPoint> keypoints_gold;
cv::FAST(image, keypoints_gold, threshold, nonmaxSuppression);
ASSERT_KEYPOINTS_EQ(keypoints_gold, keypoints[0]);
ASSERT_KEYPOINTS_EQ(keypoints_gold, keypoints[1]);
}
}
INSTANTIATE_TEST_CASE_P(CUDA_Features2D, FAST, testing::Combine( INSTANTIATE_TEST_CASE_P(CUDA_Features2D, FAST, testing::Combine(
ALL_DEVICES, ALL_DEVICES,
testing::Values(FAST_Threshold(25), FAST_Threshold(50)), testing::Values(FAST_Threshold(25), FAST_Threshold(50)),
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment