Commit ed86bd34 authored by Namgoo Lee, committed by Alexander Alekhin

Merge pull request #11483 from nglee:dev_cudaCannyStreamIssue

cuda_canny : multi stream safety (#11483)

* CUDA_ImgProc/Canny Asynchronous test

* cuda_canny : multi stream safety (1/3)

- Convert the global variable canny::counter to a class member variable
  (see the sketch below)
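
For context, the pattern this removes looks roughly like the following; the corresponding .cu hunk is collapsed further down, so this is a hedged sketch with illustrative names, not the committed source:

    #include <cuda_runtime.h>

    // Sketch only: a single __device__ symbol exists once per CUDA module,
    // so every stream that launches this kernel shares (and later resets)
    // the same counter word -- concurrent detect() calls race on it.
    __device__ int counter;

    __global__ void pushCandidates(short2* st, short2 pos)
    {
        const int idx = atomicAdd(&counter, 1); // atomic per thread, but
        st[idx] = pos;                          // shared process-wide
    }

The fix, visible in the canny.cpp hunk below, gives each call its own int* d_counter and threads it through the kernel launches.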

* cuda_canny : multi stream safety (2/3)

- Use texture objects rather than texture references for cc >= 3.0,
  since a texture reference must be declared as a static global variable,
  which causes a race condition when kernels run concurrently on
  different streams (see the sketch below)
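
To make the race concrete: a texture reference is necessarily a file-scope static, while a texture object is an ordinary value created per call. A minimal sketch with illustrative names (the committed change lives in the collapsed diff below), assuming a pitched float image:

    #include <cuda_runtime.h>

    // Old style: the reference is a static global, so binding it for one
    // stream clobbers a binding another stream may still be reading.
    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_src;

    // New style (cc >= 3.0): a texture object is created per call and
    // passed to the kernel as an argument, so streams never share state.
    cudaTextureObject_t makeTexture(float* data, size_t pitchBytes, int rows, int cols)
    {
        cudaResourceDesc resDesc = {};
        resDesc.resType = cudaResourceTypePitch2D;
        resDesc.res.pitch2D.devPtr = data;
        resDesc.res.pitch2D.desc = cudaCreateChannelDesc<float>();
        resDesc.res.pitch2D.width = cols;
        resDesc.res.pitch2D.height = rows;
        resDesc.res.pitch2D.pitchInBytes = pitchBytes;

        cudaTextureDesc texDesc = {};
        texDesc.addressMode[0] = cudaAddressModeClamp;
        texDesc.addressMode[1] = cudaAddressModeClamp;
        texDesc.readMode = cudaReadModeElementType;

        cudaTextureObject_t tex = 0;
        cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
        return tex; // kernels read via tex2D<float>(tex, x, y)
    }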

* cuda_canny : multi stream safety (3/3)

- Refrain from using global variables in row_filter and column_filter
  (converts column_filter::c_kernel and row_filter::c_kernel from
  __constant__ globals into per-call kernel parameters; see the sketch
  below)
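
The reasoning mirrors the texture fix: __constant__ memory is likewise a per-module symbol, so a cudaMemcpyToSymbolAsync issued on one stream can overwrite coefficients that a kernel still queued on another stream has not read yet. A simplified before/after sketch (illustrative kernel, not the committed filter code):

    // Before: one __constant__ array per module, uploaded with
    // cudaMemcpyToSymbol{Async} and shared by every stream.
    __constant__ float c_kernel[32];

    // After: the coefficients live in a device buffer owned by the filter
    // instance and arrive as a plain pointer, so each stream reads its
    // own copy.
    __global__ void convolveRow(const float* kernel, int ksize, const float* src, float* dst, int n)
    {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i >= n)
            return;
        float sum = 0.0f;
        for (int k = 0; k < ksize; ++k)
        {
            const int j = min(max(i - ksize / 2 + k, 0), n - 1); // clamped border
            sum += src[j] * kernel[k];
        }
        dst[i] = sum;
    }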

* Fixes #11193
parent 959a12cb
@@ -52,10 +52,8 @@ namespace column_filter
 {
     #define MAX_KERNEL_SIZE 32

-    __constant__ float c_kernel[MAX_KERNEL_SIZE];
-
     template <int KSIZE, typename T, typename D, typename B>
-    __global__ void linearColumnFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd)
+    __global__ void linearColumnFilter(const PtrStepSz<T> src, PtrStep<D> dst, const float* kernel, const int anchor, const B brd)
     {
         #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
         const int BLOCK_DIM_X = 16;
@@ -135,7 +133,7 @@ namespace column_filter
             #pragma unroll
             for (int k = 0; k < KSIZE; ++k)
-                sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k];
+                sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * kernel[k];

             dst(y, x) = saturate_cast<D>(sum);
         }
@@ -143,7 +141,7 @@ namespace column_filter
     }

     template <int KSIZE, typename T, typename D, template<typename> class B>
-    void caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream)
+    void caller(PtrStepSz<T> src, PtrStepSz<D> dst, const float* kernel, int anchor, int cc, cudaStream_t stream)
     {
         int BLOCK_DIM_X;
         int BLOCK_DIM_Y;
@@ -167,7 +165,7 @@ namespace column_filter
         B<T> brd(src.rows);

-        linearColumnFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
+        linearColumnFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, kernel, anchor, brd);

         cudaSafeCall( cudaGetLastError() );
@@ -181,7 +179,7 @@ namespace filter
     template <typename T, typename D>
     void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
     {
-        typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);
+        typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, const float* kernel, int anchor, int cc, cudaStream_t stream);

         static const caller_t callers[5][33] =
         {
@@ -362,11 +360,6 @@ namespace filter
             }
         };

-        if (stream == 0)
-            cudaSafeCall( cudaMemcpyToSymbol(column_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
-        else
-            cudaSafeCall( cudaMemcpyToSymbolAsync(column_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
-
-        callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
+        callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, kernel, anchor, cc, stream);
     }
 }
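
A consequence of the new signature worth noting, sketched under the assumption that the caller keeps the coefficients in device memory (as the old cudaMemcpyDeviceToDevice upload implies): the buffer must stay alive until the stream has consumed it, because the kernel now reads the pointer at execution time instead of a pre-uploaded symbol. Hypothetical host-side usage, with assumed names:

    // d_kernel must outlive the asynchronous work queued on `stream`.
    void uploadGaussian5(float** d_kernel, cudaStream_t stream)
    {
        static const float h_kernel[5] = { 1.f/16, 4.f/16, 6.f/16, 4.f/16, 1.f/16 };
        cudaMalloc(d_kernel, sizeof(h_kernel));
        cudaMemcpyAsync(*d_kernel, h_kernel, sizeof(h_kernel), cudaMemcpyHostToDevice, stream);
        // then e.g. filter::linearColumn<float, float>(src, dst, *d_kernel, 5, 2, brd_type, cc, stream);
    }

The same rewrite is applied to row_filter below; the trade-off is that coefficients are now fetched through the regular load path rather than the constant cache.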
@@ -52,10 +52,8 @@ namespace row_filter
 {
     #define MAX_KERNEL_SIZE 32

-    __constant__ float c_kernel[MAX_KERNEL_SIZE];
-
     template <int KSIZE, typename T, typename D, typename B>
-    __global__ void linearRowFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd)
+    __global__ void linearRowFilter(const PtrStepSz<T> src, PtrStep<D> dst, const float* kernel, const int anchor, const B brd)
     {
         #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200)
         const int BLOCK_DIM_X = 32;
@@ -135,7 +133,7 @@ namespace row_filter
             #pragma unroll
             for (int k = 0; k < KSIZE; ++k)
-                sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * c_kernel[k];
+                sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * kernel[k];

             dst(y, x) = saturate_cast<D>(sum);
         }
@@ -143,7 +141,7 @@ namespace row_filter
     }

     template <int KSIZE, typename T, typename D, template<typename> class B>
-    void caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream)
+    void caller(PtrStepSz<T> src, PtrStepSz<D> dst, const float* kernel, int anchor, int cc, cudaStream_t stream)
     {
         int BLOCK_DIM_X;
         int BLOCK_DIM_Y;
@@ -167,7 +165,7 @@ namespace row_filter
         B<T> brd(src.cols);

-        linearRowFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd);
+        linearRowFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, kernel, anchor, brd);

         cudaSafeCall( cudaGetLastError() );

         if (stream == 0)
@@ -180,7 +178,7 @@ namespace filter
     template <typename T, typename D>
     void linearRow(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream)
     {
-        typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream);
+        typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, const float* kernel, int anchor, int cc, cudaStream_t stream);

         static const caller_t callers[5][33] =
         {
@@ -361,11 +359,6 @@ namespace filter
             }
         };

-        if (stream == 0)
-            cudaSafeCall( cudaMemcpyToSymbol(row_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) );
-        else
-            cudaSafeCall( cudaMemcpyToSymbolAsync(row_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) );
-
-        callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream);
+        callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, kernel, anchor, cc, stream);
     }
 }
@@ -58,9 +58,9 @@ namespace canny
     void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh, cudaStream_t stream);

-    void edgesHysteresisLocal(PtrStepSzi map, short2* st1, cudaStream_t stream);
+    void edgesHysteresisLocal(PtrStepSzi map, short2* st1, int* d_counter, cudaStream_t stream);

-    void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, cudaStream_t stream);
+    void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, int* d_counter, cudaStream_t stream);

     void getEdges(PtrStepSzi map, PtrStepSzb dst, cudaStream_t stream);
 }
@@ -127,6 +127,8 @@ namespace
         Ptr<Filter> filterDX_, filterDY_;
 #endif
         int old_apperture_size_;
+
+        int *d_counter;
     };

     void CannyImpl::detect(InputArray _image, OutputArray _edges, Stream& stream)
@@ -218,12 +220,17 @@ namespace
     void CannyImpl::CannyCaller(GpuMat& edges, Stream& stream)
     {
-        map_.setTo(Scalar::all(0));
+        map_.setTo(Scalar::all(0), stream);

         canny::calcMap(dx_, dy_, mag_, map_, static_cast<float>(low_thresh_), static_cast<float>(high_thresh_), StreamAccessor::getStream(stream));

-        canny::edgesHysteresisLocal(map_, st1_.ptr<short2>(), StreamAccessor::getStream(stream));
+        cudaSafeCall( cudaMalloc(&d_counter, sizeof(int)) );
+
+        canny::edgesHysteresisLocal(map_, st1_.ptr<short2>(), d_counter, StreamAccessor::getStream(stream));

-        canny::edgesHysteresisGlobal(map_, st1_.ptr<short2>(), st2_.ptr<short2>(), StreamAccessor::getStream(stream));
+        canny::edgesHysteresisGlobal(map_, st1_.ptr<short2>(), st2_.ptr<short2>(), d_counter, StreamAccessor::getStream(stream));
+
+        cudaSafeCall( cudaFree(d_counter) );

         canny::getEdges(map_, edges, StreamAccessor::getStream(stream));
     }
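
Note the lifecycle this hunk introduces: the counter is allocated and released inside every CannyCaller invocation, so concurrent detect() calls each own a private counter. Condensed into a sketch (the real code wraps each runtime call in cudaSafeCall):

    #include <cuda_runtime.h>

    void hysteresisPass(cudaStream_t stream)
    {
        int* d_counter = 0;
        cudaMalloc(&d_counter, sizeof(int));   // private to this call
        // ... edgesHysteresisLocal / edgesHysteresisGlobal launches on
        //     `stream` receive d_counter and update it atomically ...
        cudaFree(d_counter);                   // cudaMalloc/cudaFree typically
                                               // synchronize the device
    }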
This diff is collapsed.
@@ -92,9 +92,66 @@ CUDA_TEST_P(Canny, Accuracy)
     EXPECT_MAT_SIMILAR(edges_gold, edges, 2e-2);
 }

+class CannyAsyncParallelLoopBody : public cv::ParallelLoopBody
+{
+public:
+    CannyAsyncParallelLoopBody(const cv::cuda::GpuMat& d_img_, cv::cuda::GpuMat* edges_, double low_thresh_, double high_thresh_, int apperture_size_, bool useL2gradient_)
+        : d_img(d_img_), edges(edges_), low_thresh(low_thresh_), high_thresh(high_thresh_), apperture_size(apperture_size_), useL2gradient(useL2gradient_) {}
+    ~CannyAsyncParallelLoopBody() {};
+    void operator()(const cv::Range& r) const
+    {
+        for (int i = r.start; i < r.end; i++) {
+            cv::cuda::Stream stream;
+            cv::Ptr<cv::cuda::CannyEdgeDetector> canny = cv::cuda::createCannyEdgeDetector(low_thresh, high_thresh, apperture_size, useL2gradient);
+            canny->detect(d_img, edges[i], stream);
+            stream.waitForCompletion();
+        }
+    }
+protected:
+    const cv::cuda::GpuMat& d_img;
+    cv::cuda::GpuMat* edges;
+    double low_thresh;
+    double high_thresh;
+    int apperture_size;
+    bool useL2gradient;
+};
+
+#define NUM_STREAMS 64
+
+CUDA_TEST_P(Canny, Async)
+{
+    if (!supportFeature(devInfo, cv::cuda::FEATURE_SET_COMPUTE_30))
+    {
+        throw SkipTestException("CUDA device doesn't support texture objects");
+    }
+    else
+    {
+        const cv::Mat img = readImage("stereobm/aloe-L.png", cv::IMREAD_GRAYSCALE);
+        ASSERT_FALSE(img.empty());
+
+        const cv::cuda::GpuMat d_img_roi = loadMat(img, useRoi);
+        double low_thresh = 50.0;
+        double high_thresh = 100.0;
+
+        // Synchronous call
+        cv::Ptr<cv::cuda::CannyEdgeDetector> canny = cv::cuda::createCannyEdgeDetector(low_thresh, high_thresh, apperture_size, useL2gradient);
+        cv::cuda::GpuMat edges_gold;
+        canny->detect(d_img_roi, edges_gold);
+
+        // Asynchronous call
+        cv::cuda::GpuMat edges[NUM_STREAMS];
+        cv::parallel_for_(cv::Range(0, NUM_STREAMS), CannyAsyncParallelLoopBody(d_img_roi, edges, low_thresh, high_thresh, apperture_size, useL2gradient));
+
+        // Compare the results of synchronous call and asynchronous call
+        for (int i = 0; i < NUM_STREAMS; i++)
+            EXPECT_MAT_NEAR(edges_gold, edges[i], 0.0);
+    }
+}
+
 INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, Canny, testing::Combine(
     ALL_DEVICES,
-    testing::Values(AppertureSize(3), AppertureSize(5)),
+    testing::Values(AppertureSize(3), AppertureSize(5), AppertureSize(7)),
     testing::Values(L2gradient(false), L2gradient(true)),
     WHOLE_SUBMAT));