Commit e746b3e8 authored by Vladislav Vinogradov's avatar Vladislav Vinogradov

added buffered version of pyrDown and pyrUp

added stream support to downsample, upsample, pyrUp and pyrDown
parent cf42f308
This diff is collapsed.
......@@ -908,29 +908,31 @@ namespace cv { namespace gpu { namespace imgproc
// Host-side launcher for the downsampling kernel (defined above in this file).
// Writes dst from src on the given CUDA stream; for the default stream (0)
// it blocks until completion so execution errors surface immediately, while
// a non-default stream stays fully asynchronous.
// NOTE(review): element type T and channel count cn must match the layout of
// src/dst — presumably guaranteed by the host wrapper; confirm at call sites.
template <typename T, int cn>
void downsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream)
{
    dim3 threads(32, 8);
    // One thread per destination pixel, grid rounded up to cover the edges.
    dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));

    downsampleKernel<T,cn><<<grid, threads, 0, stream>>>(DevMem2D_<T>(src), DevMem2D_<T>(dst));
    cudaSafeCall(cudaGetLastError());  // catch launch-configuration errors

    if (stream == 0)
        cudaSafeCall(cudaDeviceSynchronize());  // sync only in blocking (default-stream) mode
}

template void downsampleCaller<uchar,1>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void downsampleCaller<uchar,2>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void downsampleCaller<uchar,3>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void downsampleCaller<uchar,4>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void downsampleCaller<short,1>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void downsampleCaller<short,2>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void downsampleCaller<short,3>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void downsampleCaller<short,4>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void downsampleCaller<float,1>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void downsampleCaller<float,2>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void downsampleCaller<float,3>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void downsampleCaller<float,4>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
//////////////////////////////////////////////////////////////////////////
......@@ -952,29 +954,31 @@ namespace cv { namespace gpu { namespace imgproc
// Host-side launcher for the upsampling kernel (defined above in this file).
// Writes dst from src on the given CUDA stream; for the default stream (0)
// it blocks until completion so execution errors surface immediately, while
// a non-default stream stays fully asynchronous.
// NOTE(review): element type T and channel count cn must match the layout of
// src/dst — presumably guaranteed by the host wrapper; confirm at call sites.
template <typename T, int cn>
void upsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream)
{
    dim3 threads(32, 8);
    // One thread per destination pixel, grid rounded up to cover the edges.
    dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));

    upsampleKernel<T,cn><<<grid, threads, 0, stream>>>(DevMem2D_<T>(src), DevMem2D_<T>(dst));
    cudaSafeCall(cudaGetLastError());  // catch launch-configuration errors

    if (stream == 0)
        cudaSafeCall(cudaDeviceSynchronize());  // sync only in blocking (default-stream) mode
}

template void upsampleCaller<uchar,1>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void upsampleCaller<uchar,2>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void upsampleCaller<uchar,3>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void upsampleCaller<uchar,4>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void upsampleCaller<short,1>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void upsampleCaller<short,2>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void upsampleCaller<short,3>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void upsampleCaller<short,4>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void upsampleCaller<float,1>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void upsampleCaller<float,2>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void upsampleCaller<float,3>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
template void upsampleCaller<float,4>(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
//////////////////////////////////////////////////////////////////////////
......
......@@ -79,10 +79,14 @@ void cv::gpu::dft(const GpuMat&, GpuMat&, Size, int) { throw_nogpu(); }
void cv::gpu::ConvolveBuf::create(Size, Size) { throw_nogpu(); }
void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool) { throw_nogpu(); }
void cv::gpu::convolve(const GpuMat&, const GpuMat&, GpuMat&, bool, ConvolveBuf&) { throw_nogpu(); }
void cv::gpu::downsample(const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::upsample(const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::pyrDown(const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::pyrUp(const GpuMat&, GpuMat&) { throw_nogpu(); }
void cv::gpu::downsample(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::upsample(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::pyrDown(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::PyrDownBuf::create(Size, int) { throw_nogpu(); }
void cv::gpu::pyrDown(const GpuMat&, GpuMat&, PyrDownBuf&, Stream&) { throw_nogpu(); }
void cv::gpu::pyrUp(const GpuMat&, GpuMat&, Stream&) { throw_nogpu(); }
void cv::gpu::PyrUpBuf::create(Size, int) { throw_nogpu(); }
void cv::gpu::pyrUp(const GpuMat&, GpuMat&, PyrUpBuf&, Stream&) { throw_nogpu(); }
......@@ -1413,15 +1417,15 @@ void cv::gpu::convolve(const GpuMat& image, const GpuMat& templ, GpuMat& result,
namespace cv { namespace gpu { namespace imgproc
{
    // Implemented in the CUDA source (imgproc.cu); runs on 'stream' and
    // synchronizes only when stream == 0 (the default stream).
    template <typename T, int cn>
    void downsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
}}}
void cv::gpu::downsample(const GpuMat& src, GpuMat& dst)
void cv::gpu::downsample(const GpuMat& src, GpuMat& dst, Stream& stream)
{
CV_Assert(src.depth() < CV_64F && src.channels() <= 4);
typedef void (*Caller)(const DevMem2D, DevMem2D);
typedef void (*Caller)(const DevMem2D, DevMem2D, cudaStream_t stream);
static const Caller callers[6][4] =
{{imgproc::downsampleCaller<uchar,1>, imgproc::downsampleCaller<uchar,2>,
imgproc::downsampleCaller<uchar,3>, imgproc::downsampleCaller<uchar,4>},
......@@ -1437,7 +1441,7 @@ void cv::gpu::downsample(const GpuMat& src, GpuMat& dst)
CV_Error(CV_StsUnsupportedFormat, "bad number of channels");
dst.create((src.rows + 1) / 2, (src.cols + 1) / 2, src.type());
caller(src, dst.reshape(1));
caller(src, dst.reshape(1), StreamAccessor::getStream(stream));
}
......@@ -1447,15 +1451,15 @@ void cv::gpu::downsample(const GpuMat& src, GpuMat& dst)
namespace cv { namespace gpu { namespace imgproc
{
    // Implemented in the CUDA source (imgproc.cu); runs on 'stream' and
    // synchronizes only when stream == 0 (the default stream).
    template <typename T, int cn>
    void upsampleCaller(const DevMem2D src, DevMem2D dst, cudaStream_t stream);
}}}
void cv::gpu::upsample(const GpuMat& src, GpuMat& dst)
void cv::gpu::upsample(const GpuMat& src, GpuMat& dst, Stream& stream)
{
CV_Assert(src.depth() < CV_64F && src.channels() <= 4);
typedef void (*Caller)(const DevMem2D, DevMem2D);
typedef void (*Caller)(const DevMem2D, DevMem2D, cudaStream_t stream);
static const Caller callers[6][5] =
{{imgproc::upsampleCaller<uchar,1>, imgproc::upsampleCaller<uchar,2>,
imgproc::upsampleCaller<uchar,3>, imgproc::upsampleCaller<uchar,4>},
......@@ -1471,31 +1475,73 @@ void cv::gpu::upsample(const GpuMat& src, GpuMat& dst)
CV_Error(CV_StsUnsupportedFormat, "bad number of channels");
dst.create(src.rows*2, src.cols*2, src.type());
caller(src, dst.reshape(1));
caller(src, dst.reshape(1), StreamAccessor::getStream(stream));
}
//////////////////////////////////////////////////////////////////////////////
// pyrDown
void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst)
void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, Stream& stream)
{
Mat ker = getGaussianKernel(5, 0, std::max(CV_32F, src.depth()));
GpuMat buf;
sepFilter2D(src, buf, src.depth(), ker, ker);
downsample(buf, dst);
PyrDownBuf buf;
pyrDown(src, dst, buf, stream);
}
cv::Mat cv::gpu::PyrDownBuf::ker;
void cv::gpu::PyrDownBuf::create(Size image_size, int image_type_)
{
if (ker.empty() || image_type_ != image_type)
ker = getGaussianKernel(5, 0, std::max(CV_32F, CV_MAT_DEPTH(image_type_)));
ensureSizeIsEnough(image_size.height, image_size.width, image_type_, buf);
if (filter.empty() || image_type_ != image_type)
{
image_type = image_type_;
filter = createSeparableLinearFilter_GPU(image_type, image_type, ker, ker);
}
}
void cv::gpu::pyrDown(const GpuMat& src, GpuMat& dst, PyrDownBuf& buf, Stream& stream)
{
buf.create(src.size(), src.type());
buf.filter->apply(src, buf.buf, Rect(0, 0, src.cols, src.rows), stream);
downsample(buf.buf, dst, stream);
}
//////////////////////////////////////////////////////////////////////////////
// pyrUp
void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst)
void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst, Stream& stream)
{
PyrUpBuf buf;
pyrUp(src, dst, buf, stream);
}
cv::Mat cv::gpu::PyrUpBuf::ker;
void cv::gpu::PyrUpBuf::create(Size image_size, int image_type_)
{
if (ker.empty() || image_type_ != image_type)
ker = getGaussianKernel(5, 0, std::max(CV_32F, CV_MAT_DEPTH(image_type_))) * 2;
ensureSizeIsEnough(image_size.height * 2, image_size.width * 2, image_type_, buf);
if (filter.empty() || image_type_ != image_type)
{
image_type = image_type_;
filter = createSeparableLinearFilter_GPU(image_type, image_type, ker, ker);
}
}
void cv::gpu::pyrUp(const GpuMat& src, GpuMat& dst, PyrUpBuf& buf, Stream& stream)
{
GpuMat buf;
upsample(src, buf);
Mat ker = getGaussianKernel(5, 0, std::max(CV_32F, src.depth())) * 2;
sepFilter2D(buf, dst, buf.depth(), ker, ker);
buf.create(src.size(), src.type());
upsample(src, buf.buf, stream);
buf.filter->apply(buf.buf, dst, Rect(0, 0, buf.buf.cols, buf.buf.rows), stream);
}
#endif /* !defined (HAVE_CUDA) */
......
This diff is collapsed.
......@@ -866,3 +866,51 @@ TEST(GaussianBlur)
GPU_OFF;
}
}
// Benchmarks CPU vs GPU pyrDown over decreasing square image sizes.
TEST(pyrDown)
{
    gpu::PyrDownBuf buf;  // reused across iterations to amortize allocation

    for (int size = 4000; size >= 1000; size -= 1000)
    {
        SUBTEST << "size " << size;

        // BUGFIX: use the loop's 'size' for the image dimensions. The
        // original hard-coded 1000x1000, so every iteration benchmarked the
        // same image while SUBTEST reported different sizes.
        Mat src; gen(src, size, size, CV_16SC3, 0, 256);
        Mat dst(Size(src.cols / 2, src.rows / 2), src.type());

        CPU_ON;
        pyrDown(src, dst);
        CPU_OFF;

        gpu::GpuMat d_src(src);
        gpu::GpuMat d_dst(Size(src.cols / 2, src.rows / 2), src.type());

        GPU_ON;
        gpu::pyrDown(d_src, d_dst, buf);
        GPU_OFF;
    }
}
// Benchmarks CPU vs GPU pyrUp over decreasing square image sizes.
TEST(pyrUp)
{
    gpu::PyrUpBuf buf;  // reused across iterations to amortize allocation

    for (int size = 4000; size >= 1000; size -= 1000)
    {
        SUBTEST << "size " << size;

        // BUGFIX: use the loop's 'size' for the image dimensions. The
        // original hard-coded 1000x1000, so every iteration benchmarked the
        // same image while SUBTEST reported different sizes.
        Mat src; gen(src, size, size, CV_16SC3, 0, 256);
        Mat dst(Size(src.cols * 2, src.rows * 2), src.type());

        CPU_ON;
        pyrUp(src, dst);
        CPU_OFF;

        gpu::GpuMat d_src(src);
        gpu::GpuMat d_dst(Size(src.cols * 2, src.rows * 2), src.type());

        GPU_ON;
        gpu::pyrUp(d_src, d_dst, buf);
        GPU_OFF;
    }
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment