Commit fa5422a2 authored by Alexey Spizhevoy

added buffer support to gpu::minMax

parent b7e9c622
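For orientation, a minimal usage sketch of the overload this commit adds (hypothetical caller code, not part of the patch): keeping one GpuMat around as a work buffer lets repeated minMax calls reuse a single device allocation instead of allocating internal buffers on every call.

    cv::gpu::GpuMat frame;               // device image, filled elsewhere
    cv::gpu::GpuMat buf;                 // reusable work buffer; minMax creates/resizes it as needed
    double minVal = 0, maxVal = 0;
    for (int i = 0; i < 100; ++i)
    {
        // ... upload or compute the next frame into `frame` ...
        cv::gpu::minMax(frame, &minVal, &maxVal, buf);   // buffer is reused after the first call
    }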
@@ -424,6 +424,9 @@ namespace cv
     //! finds global minimum and maximum array elements and returns their values
     CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal=0);
+    //! finds global minimum and maximum array elements and returns their values
+    CV_EXPORTS void minMax(const GpuMat& src, double* minVal, double* maxVal, GpuMat& buf);
     //! finds global minimum and maximum array elements and returns their values with locations
     CV_EXPORTS void minMaxLoc(const GpuMat& src, double* minVal, double* maxVal=0, Point* minLoc=0, Point* maxLoc=0);
...
@@ -66,6 +66,7 @@ double cv::gpu::norm(const GpuMat&, const GpuMat&, int) { throw_nogpu(); return
 void cv::gpu::flip(const GpuMat&, GpuMat&, int) { throw_nogpu(); }
 Scalar cv::gpu::sum(const GpuMat&) { throw_nogpu(); return Scalar(); }
 void cv::gpu::minMax(const GpuMat&, double*, double*) { throw_nogpu(); }
+void cv::gpu::minMax(const GpuMat&, double*, double*, GpuMat&) { throw_nogpu(); }
 void cv::gpu::minMaxLoc(const GpuMat&, double*, double*, Point*, Point*) { throw_nogpu(); }
 void cv::gpu::LUT(const GpuMat&, const Mat&, GpuMat&) { throw_nogpu(); }
 void cv::gpu::exp(const GpuMat&, GpuMat&) { throw_nogpu(); }
...
@@ -492,20 +493,24 @@ Scalar cv::gpu::sum(const GpuMat& src)
 namespace cv { namespace gpu { namespace mathfunc { namespace minmax {
-    void get_buf_size_required(int elem_size, int& b1cols, int& b1rows,
-                               int& b2cols, int& b2rows);
+    void get_buf_size_required(int elem_size, int& cols, int& rows);
 
     template <typename T>
-    void min_max_caller(const DevMem2D src, double* minval, double* maxval,
-                        unsigned char* minval_buf, unsigned char* maxval_buf);
+    void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf);
 
     template <typename T>
-    void min_max_caller_2steps(const DevMem2D src, double* minval, double* maxval,
-                               unsigned char* minval_buf, unsigned char* maxval_buf);
+    void min_max_caller_2steps(const DevMem2D src, double* minval, double* maxval, PtrStep buf);
 }}}}
 
 void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal)
+{
+    GpuMat buf;
+    minMax(src, minVal, maxVal, buf);
+}
+
+void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal, GpuMat& buf)
 {
     using namespace mathfunc::minmax;
@@ -513,26 +518,25 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal)
     if (!maxVal) maxVal = &maxVal_;
 
     GpuMat src_ = src.reshape(1);
 
-    // Allocate GPU buffers
-    Size b1size, b2size;
-    get_buf_size_required(src.elemSize(), b1size.width, b1size.height, b2size.width, b2size.height);
-    GpuMat b1(b1size, CV_8U), b2(b2size, CV_8U);
+    Size bufSize;
+    get_buf_size_required(src.elemSize(), bufSize.width, bufSize.height);
+    buf.create(bufSize, CV_8U);
 
     int major, minor;
     getComputeCapability(getDevice(), major, minor);
 
     if (major >= 1 && minor >= 1)
     {
         switch (src_.type())
         {
-        case CV_8U: min_max_caller<unsigned char>(src_, minVal, maxVal, b1.data, b2.data); break;
-        case CV_8S: min_max_caller<signed char>(src_, minVal, maxVal, b1.data, b2.data); break;
-        case CV_16U: min_max_caller<unsigned short>(src_, minVal, maxVal, b1.data, b2.data); break;
-        case CV_16S: min_max_caller<signed short>(src_, minVal, maxVal, b1.data, b2.data); break;
-        case CV_32S: min_max_caller<int>(src_, minVal, maxVal, b1.data, b2.data); break;
-        case CV_32F: min_max_caller<float>(src_, minVal, maxVal, b1.data, b2.data); break;
-        case CV_64F: min_max_caller<double>(src_, minVal, maxVal, b1.data, b2.data); break;
+        case CV_8U: min_max_caller<unsigned char>(src_, minVal, maxVal, buf); break;
+        case CV_8S: min_max_caller<signed char>(src_, minVal, maxVal, buf); break;
+        case CV_16U: min_max_caller<unsigned short>(src_, minVal, maxVal, buf); break;
+        case CV_16S: min_max_caller<signed short>(src_, minVal, maxVal, buf); break;
+        case CV_32S: min_max_caller<int>(src_, minVal, maxVal, buf); break;
+        case CV_32F: min_max_caller<float>(src_, minVal, maxVal, buf); break;
+        case CV_64F: min_max_caller<double>(src_, minVal, maxVal, buf); break;
        default: CV_Error(CV_StsBadArg, "Unsupported type");
         }
     }
@@ -540,12 +544,12 @@ void cv::gpu::minMax(const GpuMat& src, double* minVal, double* maxVal)
     {
         switch (src_.type())
         {
-        case CV_8U: min_max_caller_2steps<unsigned char>(src_, minVal, maxVal, b1.data, b2.data); break;
-        case CV_8S: min_max_caller_2steps<signed char>(src_, minVal, maxVal, b1.data, b2.data); break;
-        case CV_16U: min_max_caller_2steps<unsigned short>(src_, minVal, maxVal, b1.data, b2.data); break;
-        case CV_16S: min_max_caller_2steps<signed short>(src_, minVal, maxVal, b1.data, b2.data); break;
-        case CV_32S: min_max_caller_2steps<int>(src_, minVal, maxVal, b1.data, b2.data); break;
-        case CV_32F: min_max_caller_2steps<float>(src_, minVal, maxVal, b1.data, b2.data); break;
+        case CV_8U: min_max_caller_2steps<unsigned char>(src_, minVal, maxVal, buf); break;
+        case CV_8S: min_max_caller_2steps<signed char>(src_, minVal, maxVal, buf); break;
+        case CV_16U: min_max_caller_2steps<unsigned short>(src_, minVal, maxVal, buf); break;
+        case CV_16S: min_max_caller_2steps<signed short>(src_, minVal, maxVal, buf); break;
+        case CV_32S: min_max_caller_2steps<int>(src_, minVal, maxVal, buf); break;
+        case CV_32F: min_max_caller_2steps<float>(src_, minVal, maxVal, buf); break;
         default: CV_Error(CV_StsBadArg, "Unsupported type");
         }
     }
...
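A note on the buffer contract implied by the code above (my reading of the diff, not wording from the commit): get_buf_size_required now describes a single CV_8U buffer of two rows, each grid.x * grid.y * elem_size bytes wide, which the new overload allocates once via buf.create(bufSize, CV_8U).

    // Assumed layout of `buf` as consumed by the CUDA-side callers shown later in the diff:
    //   row 0 -> (T*)buf.ptr(0) : one partial minimum per CUDA block
    //   row 1 -> (T*)buf.ptr(1) : one partial maximum per CUDA block
    // In the (major >= 1 && minor >= 1) branch, min_max_kernel apparently reduces these partials
    // on the device (coordinated through the blocks_finished counter), so the host copies back
    // only element 0 of each row; otherwise min_max_caller_2steps launches
    // min_max_kernel_2ndstep<<<1, 1>>> to perform that final reduction before the read-back.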
@@ -401,7 +401,7 @@ namespace cv { namespace gpu { namespace mathfunc
     //////////////////////////////////////////////////////////////////////////////
     // Min max
 
-    // To avoid shared banck confilict we convert reach value into value of
+    // To avoid shared bank conflicts we convert each value into value of
     // appropriate type (32 bits minimum)
     template <typename T> struct MinMaxTypeTraits {};
     template <> struct MinMaxTypeTraits<unsigned char> { typedef int best_type; };
@@ -423,6 +423,10 @@ namespace cv { namespace gpu { namespace mathfunc
     static const unsigned int czero = 0;
 
+    // Global counter of blocks that have finished their work
+    __device__ unsigned int blocks_finished;
+
     // Estimates good thread configuration
     // - threads variable satisfies to threads.x * threads.y == 256
     void estimate_thread_cfg(dim3& threads, dim3& grid)
@@ -431,15 +435,17 @@ namespace cv { namespace gpu { namespace mathfunc
         grid = dim3(6, 5);
     }
 
     // Returns required buffer sizes
-    void get_buf_size_required(int elem_size, int& b1cols, int& b1rows, int& b2cols, int& b2rows)
+    void get_buf_size_required(int elem_size, int& cols, int& rows)
     {
         dim3 threads, grid;
         estimate_thread_cfg(threads, grid);
-        b1cols = grid.x * grid.y * elem_size; b1rows = 1;
-        b2cols = grid.x * grid.y * elem_size; b2rows = 1;
+        cols = grid.x * grid.y * elem_size;
+        rows = 2;
     }
 
     // Estimates device constants which are used in the kernels using specified thread configuration
     void estimate_kernel_consts(int cols, int rows, const dim3& threads, const dim3& grid)
     {
@@ -449,6 +455,7 @@ namespace cv { namespace gpu { namespace mathfunc
         cudaSafeCall(cudaMemcpyToSymbol(ctheight, &theight, sizeof(ctheight)));
     }
 
+
     // Does min and max in shared memory
     template <typename T>
     __device__ void merge(unsigned int tid, unsigned int offset, volatile T* minval, volatile T* maxval)
@@ -457,8 +464,6 @@ namespace cv { namespace gpu { namespace mathfunc
         maxval[tid] = max(maxval[tid], maxval[tid + offset]);
     }
 
-    // Global counter of blocks finished its work
-    __device__ unsigned int blocks_finished;
-
     template <int nthreads, typename T>
     __global__ void min_max_kernel(int cols, int rows, const PtrStep src, T* minval, T* maxval)
@@ -535,33 +540,19 @@ namespace cv { namespace gpu { namespace mathfunc
 #endif
     }
 
-    // This kernel will be used only when compute capability is 1.0
-    template <typename T>
-    __global__ void min_max_kernel_2ndstep(T* minval, T* maxval, int size)
-    {
-        T val;
-        T mymin = numeric_limits_gpu<T>::max();
-        T mymax = numeric_limits_gpu<T>::min();
-        for (unsigned int i = 0; i < size; ++i)
-        {
-            val = minval[i]; if (val < mymin) mymin = val;
-            val = maxval[i]; if (val > mymax) mymax = val;
-        }
-        minval[0] = mymin;
-        maxval[0] = mymax;
-    }
-
     template <typename T>
-    void min_max_caller(const DevMem2D src, double* minval, double* maxval,
-                        unsigned char* minval_buf, unsigned char* maxval_buf)
+    void min_max_caller(const DevMem2D src, double* minval, double* maxval, PtrStep buf)
     {
         dim3 threads, grid;
         estimate_thread_cfg(threads, grid);
         estimate_kernel_consts(src.cols, src.rows, threads, grid);
 
-        cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));
-        min_max_kernel<256, T><<<grid, threads>>>(src.cols, src.rows, src, (T*)minval_buf, (T*)maxval_buf);
+        T* minval_buf = (T*)buf.ptr(0);
+        T* maxval_buf = (T*)buf.ptr(1);
+
+        cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));
+        min_max_kernel<256, T><<<grid, threads>>>(src.cols, src.rows, src, minval_buf, maxval_buf);
         cudaSafeCall(cudaThreadSynchronize());
 
         T minval_, maxval_;
@@ -569,19 +560,47 @@ namespace cv { namespace gpu { namespace mathfunc
         cudaSafeCall(cudaMemcpy(&maxval_, maxval_buf, sizeof(T), cudaMemcpyDeviceToHost));
         *minval = minval_;
         *maxval = maxval_;
+    }
+
+    template void min_max_caller<unsigned char>(const DevMem2D, double*, double*, PtrStep);
+    template void min_max_caller<signed char>(const DevMem2D, double*, double*, PtrStep);
+    template void min_max_caller<unsigned short>(const DevMem2D, double*, double*, PtrStep);
+    template void min_max_caller<signed short>(const DevMem2D, double*, double*, PtrStep);
+    template void min_max_caller<int>(const DevMem2D, double*, double*, PtrStep);
+    template void min_max_caller<float>(const DevMem2D, double*, double*, PtrStep);
+    template void min_max_caller<double>(const DevMem2D, double*, double*, PtrStep);
+
+    // This kernel will be used only when compute capability is 1.0
+    template <typename T>
+    __global__ void min_max_kernel_2ndstep(T* minval, T* maxval, int size)
+    {
+        T val;
+        T mymin = numeric_limits_gpu<T>::max();
+        T mymax = numeric_limits_gpu<T>::min();
+        for (unsigned int i = 0; i < size; ++i)
+        {
+            val = minval[i]; if (val < mymin) mymin = val;
+            val = maxval[i]; if (val > mymax) mymax = val;
+        }
+        minval[0] = mymin;
+        maxval[0] = mymax;
     }
 
     template <typename T>
-    void min_max_caller_2steps(const DevMem2D src, double* minval, double* maxval,
-                               unsigned char* minval_buf, unsigned char* maxval_buf)
+    void min_max_caller_2steps(const DevMem2D src, double* minval, double* maxval, PtrStep buf)
     {
         dim3 threads, grid;
         estimate_thread_cfg(threads, grid);
         estimate_kernel_consts(src.cols, src.rows, threads, grid);
 
+        T* minval_buf = (T*)buf.ptr(0);
+        T* maxval_buf = (T*)buf.ptr(1);
+
         cudaSafeCall(cudaMemcpyToSymbol(blocks_finished, &czero, sizeof(blocks_finished)));
-        min_max_kernel<256, T><<<grid, threads>>>(src.cols, src.rows, src, (T*)minval_buf, (T*)maxval_buf);
-        min_max_kernel_2ndstep<T><<<1, 1>>>((T*)minval_buf, (T*)maxval_buf, grid.x * grid.y);
+        min_max_kernel<256, T><<<grid, threads>>>(src.cols, src.rows, src, minval_buf, maxval_buf);
+        min_max_kernel_2ndstep<T><<<1, 1>>>(minval_buf, maxval_buf, grid.x * grid.y);
         cudaSafeCall(cudaThreadSynchronize());
 
         T minval_, maxval_;
@@ -591,23 +610,16 @@ namespace cv { namespace gpu { namespace mathfunc
         *maxval = maxval_;
     }
 
-    template void min_max_caller<unsigned char>(const DevMem2D, double*, double*, unsigned char*, unsigned char*);
-    template void min_max_caller<signed char>(const DevMem2D, double*, double*, unsigned char*, unsigned char*);
-    template void min_max_caller<unsigned short>(const DevMem2D, double*, double*, unsigned char*, unsigned char*);
-    template void min_max_caller<signed short>(const DevMem2D, double*, double*, unsigned char*, unsigned char*);
-    template void min_max_caller<int>(const DevMem2D, double*, double*, unsigned char*, unsigned char*);
-    template void min_max_caller<float>(const DevMem2D, double*, double*, unsigned char*, unsigned char*);
-    template void min_max_caller<double>(const DevMem2D, double*, double*, unsigned char*, unsigned char*);
-    template void min_max_caller_2steps<unsigned char>(const DevMem2D, double*, double*, unsigned char*, unsigned char*);
-    template void min_max_caller_2steps<signed char>(const DevMem2D, double*, double*, unsigned char*, unsigned char*);
-    template void min_max_caller_2steps<unsigned short>(const DevMem2D, double*, double*, unsigned char*, unsigned char*);
-    template void min_max_caller_2steps<signed short>(const DevMem2D, double*, double*, unsigned char*, unsigned char*);
-    template void min_max_caller_2steps<int>(const DevMem2D, double*, double*, unsigned char*, unsigned char*);
-    template void min_max_caller_2steps<float>(const DevMem2D, double*, double*, unsigned char*, unsigned char*);
+    template void min_max_caller_2steps<unsigned char>(const DevMem2D, double*, double*, PtrStep);
+    template void min_max_caller_2steps<signed char>(const DevMem2D, double*, double*, PtrStep);
+    template void min_max_caller_2steps<unsigned short>(const DevMem2D, double*, double*, PtrStep);
+    template void min_max_caller_2steps<signed short>(const DevMem2D, double*, double*, PtrStep);
+    template void min_max_caller_2steps<int>(const DevMem2D, double*, double*, PtrStep);
+    template void min_max_caller_2steps<float>(const DevMem2D, double*, double*, PtrStep);
 
     } // namespace minmax
 
     namespace minmaxloc {
 
     template <typename T, int op> struct OptLoc {};
...
@@ -676,6 +676,8 @@ struct CV_GpuMinMaxTest: public CvTest
 {
     CV_GpuMinMaxTest(): CvTest("GPU-MinMaxTest", "minMax") {}
 
+    cv::gpu::GpuMat buf;
+
     void run(int)
     {
         int depth_end;
@@ -732,7 +734,7 @@ struct CV_GpuMinMaxTest: public CvTest
         double minVal_, maxVal_;
         cv::Point minLoc_, maxLoc_;
-        cv::gpu::minMax(cv::gpu::GpuMat(src), &minVal_, &maxVal_);
+        cv::gpu::minMax(cv::gpu::GpuMat(src), &minVal_, &maxVal_, buf);
         if (abs(minVal - minVal_) > 1e-3f)
         {
...
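Making buf a member of CV_GpuMinMaxTest rather than a local variable means the repeated minMax calls in run() share one device buffer, so the test also exercises the reuse path of the new overload instead of allocating a fresh buffer on every call.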